corp-extractor 0.5.0-py3-none-any.whl → 0.9.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +191 -24
- corp_extractor-0.9.0.dist-info/RECORD +76 -0
- statement_extractor/__init__.py +1 -1
- statement_extractor/cli.py +1227 -10
- statement_extractor/data/statement_taxonomy.json +6949 -1159
- statement_extractor/database/__init__.py +52 -0
- statement_extractor/database/embeddings.py +186 -0
- statement_extractor/database/hub.py +520 -0
- statement_extractor/database/importers/__init__.py +24 -0
- statement_extractor/database/importers/companies_house.py +545 -0
- statement_extractor/database/importers/gleif.py +538 -0
- statement_extractor/database/importers/sec_edgar.py +375 -0
- statement_extractor/database/importers/wikidata.py +1012 -0
- statement_extractor/database/importers/wikidata_people.py +632 -0
- statement_extractor/database/models.py +230 -0
- statement_extractor/database/resolver.py +245 -0
- statement_extractor/database/store.py +1609 -0
- statement_extractor/document/__init__.py +62 -0
- statement_extractor/document/chunker.py +410 -0
- statement_extractor/document/context.py +171 -0
- statement_extractor/document/deduplicator.py +173 -0
- statement_extractor/document/html_extractor.py +246 -0
- statement_extractor/document/loader.py +303 -0
- statement_extractor/document/pipeline.py +388 -0
- statement_extractor/document/summarizer.py +195 -0
- statement_extractor/models/__init__.py +16 -1
- statement_extractor/models/canonical.py +44 -1
- statement_extractor/models/document.py +308 -0
- statement_extractor/models/labels.py +47 -18
- statement_extractor/models/qualifiers.py +51 -3
- statement_extractor/models/statement.py +26 -0
- statement_extractor/pipeline/config.py +6 -11
- statement_extractor/pipeline/orchestrator.py +80 -111
- statement_extractor/pipeline/registry.py +52 -46
- statement_extractor/plugins/__init__.py +20 -8
- statement_extractor/plugins/base.py +334 -64
- statement_extractor/plugins/extractors/gliner2.py +10 -0
- statement_extractor/plugins/labelers/taxonomy.py +18 -5
- statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
- statement_extractor/plugins/pdf/__init__.py +10 -0
- statement_extractor/plugins/pdf/pypdf.py +291 -0
- statement_extractor/plugins/qualifiers/__init__.py +11 -0
- statement_extractor/plugins/qualifiers/companies_house.py +14 -3
- statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
- statement_extractor/plugins/qualifiers/gleif.py +14 -3
- statement_extractor/plugins/qualifiers/person.py +578 -14
- statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
- statement_extractor/plugins/scrapers/__init__.py +10 -0
- statement_extractor/plugins/scrapers/http.py +236 -0
- statement_extractor/plugins/splitters/t5_gemma.py +158 -53
- statement_extractor/plugins/taxonomy/embedding.py +193 -46
- statement_extractor/plugins/taxonomy/mnli.py +16 -4
- statement_extractor/scoring.py +8 -8
- corp_extractor-0.5.0.dist-info/RECORD +0 -55
- statement_extractor/plugins/canonicalizers/__init__.py +0 -17
- statement_extractor/plugins/canonicalizers/base.py +0 -9
- statement_extractor/plugins/canonicalizers/location.py +0 -219
- statement_extractor/plugins/canonicalizers/organization.py +0 -230
- statement_extractor/plugins/canonicalizers/person.py +0 -242
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
statement_extractor/plugins/__init__.py

@@ -4,10 +4,9 @@ Plugins module for the extraction pipeline.
 Contains all plugin implementations organized by stage:
 - splitters/: Stage 1 - Text to atomic triples
 - extractors/: Stage 2 - Refine entities and relations
-- qualifiers/: Stage 3 -
-- canonicalizers/: Stage 4 -
-- labelers/: Stage 5 -
-- taxonomy/: Stage 6 - Taxonomy classification
+- qualifiers/: Stage 3 - Qualify entities (add identifiers, canonical names, FQN)
+- labelers/: Stage 4 - Classify statements
+- taxonomy/: Stage 5 - Taxonomy classification
 """

 from .base import (
@@ -16,13 +15,20 @@ from .base import (
     BaseSplitterPlugin,
     BaseExtractorPlugin,
     BaseQualifierPlugin,
-    BaseCanonicalizerPlugin,
     BaseLabelerPlugin,
     BaseTaxonomyPlugin,
+    # Content acquisition plugins
+    ContentType,
+    ScraperResult,
+    PDFParseResult,
+    BaseScraperPlugin,
+    BasePDFParserPlugin,
 )

 # Import plugin modules for auto-registration
-from . import splitters, extractors, qualifiers, canonicalizers, labelers, taxonomy
+from . import splitters, extractors, qualifiers, labelers, taxonomy
+# Content acquisition plugins
+from . import scrapers, pdf

 __all__ = [
     "PluginCapability",
@@ -30,14 +36,20 @@ __all__ = [
     "BaseSplitterPlugin",
     "BaseExtractorPlugin",
     "BaseQualifierPlugin",
-    "BaseCanonicalizerPlugin",
     "BaseLabelerPlugin",
     "BaseTaxonomyPlugin",
+    # Content acquisition plugins
+    "ContentType",
+    "ScraperResult",
+    "PDFParseResult",
+    "BaseScraperPlugin",
+    "BasePDFParserPlugin",
     # Plugin modules
     "splitters",
     "extractors",
     "qualifiers",
-    "canonicalizers",
     "labelers",
     "taxonomy",
+    "scrapers",
+    "pdf",
 ]
statement_extractor/plugins/base.py

@@ -4,15 +4,20 @@ Base plugin classes for the extraction pipeline.
 Defines the abstract interfaces for each pipeline stage:
 - BaseSplitterPlugin: Stage 1 - Text → RawTriple
 - BaseExtractorPlugin: Stage 2 - RawTriple → PipelineStatement
-- BaseQualifierPlugin: Stage 3 - Entity →
-- BaseCanonicalizerPlugin: Stage 4 -
-- BaseLabelerPlugin: Stage 5 -
-- BaseTaxonomyPlugin: Stage 6 -
+- BaseQualifierPlugin: Stage 3 - Entity → CanonicalEntity
+- BaseLabelerPlugin: Stage 4 - Statement → StatementLabel
+- BaseTaxonomyPlugin: Stage 5 - Statement → TaxonomyResult
+
+Content acquisition plugins (for URL processing):
+- BaseScraperPlugin: Fetch content from URLs
+- BasePDFParserPlugin: Extract text from PDFs
 """

 from abc import ABC, abstractmethod
-from enum import Flag, auto
-from typing import TYPE_CHECKING
+from enum import Enum, Flag, auto
+from typing import TYPE_CHECKING, Any, Optional
+
+from pydantic import BaseModel, Field

 if TYPE_CHECKING:
     from ..pipeline.context import PipelineContext
@@ -20,9 +25,6 @@ if TYPE_CHECKING:
         RawTriple,
         PipelineStatement,
         ExtractedEntity,
-        EntityQualifiers,
-        QualifiedEntity,
-        CanonicalMatch,
         CanonicalEntity,
         StatementLabel,
         TaxonomyResult,
@@ -40,6 +42,56 @@ class PluginCapability(Flag):
     CACHING = auto()  # Supports result caching


+def get_available_vram_gb() -> float:
+    """
+    Get available GPU VRAM in GB.
+
+    Returns 0.0 if no GPU is available or VRAM cannot be determined.
+    """
+    try:
+        import torch
+        if torch.cuda.is_available():
+            device = torch.cuda.current_device()
+            total = torch.cuda.get_device_properties(device).total_memory
+            allocated = torch.cuda.memory_allocated(device)
+            available = (total - allocated) / (1024 ** 3)  # Convert to GB
+            return available
+        elif torch.backends.mps.is_available():
+            # MPS doesn't expose VRAM info; estimate based on typical Apple Silicon
+            # Return a conservative estimate
+            return 8.0
+    except ImportError:
+        pass
+    return 0.0
+
+
+def calculate_batch_size(
+    per_item_vram_gb: float,
+    overhead_gb: float = 2.0,
+    min_batch: int = 1,
+    max_batch: int = 32,
+) -> int:
+    """
+    Calculate optimal batch size based on available VRAM.
+
+    Args:
+        per_item_vram_gb: VRAM required per item in GB
+        overhead_gb: Reserved VRAM for model weights and system overhead
+        min_batch: Minimum batch size
+        max_batch: Maximum batch size cap
+
+    Returns:
+        Optimal batch size for the current GPU
+    """
+    available = get_available_vram_gb()
+    if available <= 0 or per_item_vram_gb <= 0:
+        return min_batch
+
+    usable = max(0, available - overhead_gb)
+    batch_size = int(usable / per_item_vram_gb)
+    return max(min_batch, min(batch_size, max_batch))
+
+
 class BasePlugin(ABC):
     """
     Base class for all pipeline plugins.
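The sizing rule in `calculate_batch_size` is easy to check by hand. A minimal sketch with invented numbers (a hypothetical 24 GB card with 1.5 GB already allocated and 0.5 GB per item; none of these figures come from the package):

```python
# Hypothetical figures: 24 GB total VRAM, 1.5 GB already allocated.
available = 24.0 - 1.5            # what get_available_vram_gb() would report: 22.5
usable = max(0, available - 2.0)  # minus the default overhead_gb=2.0 -> 20.5
batch = int(usable / 0.5)         # per_item_vram_gb=0.5 -> 41
batch = max(1, min(batch, 32))    # clamp to [min_batch, max_batch] -> 32
```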
@@ -74,6 +126,50 @@ class BasePlugin(ABC):
         """Human-readable description of this plugin."""
         return ""

+    @property
+    def model_vram_gb(self) -> float:
+        """
+        Estimated VRAM required for model weights in GB.
+
+        Override this if the plugin loads a GPU model. This is used to
+        reserve memory overhead when calculating batch sizes.
+
+        Default is 0.0 (no GPU model).
+        """
+        return 0.0
+
+    @property
+    def per_item_vram_gb(self) -> float:
+        """
+        Estimated VRAM required per item during batch processing in GB.
+
+        Override this for plugins with BATCH_PROCESSING capability.
+        Used to calculate optimal batch size: batch = (available - overhead) / per_item
+
+        Default is 0.1 GB (100MB) as a conservative estimate.
+        """
+        return 0.1
+
+    def get_optimal_batch_size(self, max_batch: int = 32) -> int:
+        """
+        Calculate optimal batch size based on available VRAM and plugin requirements.
+
+        Args:
+            max_batch: Maximum batch size cap
+
+        Returns:
+            Optimal batch size for current GPU state
+        """
+        if not (PluginCapability.BATCH_PROCESSING in self.capabilities):
+            return 1
+
+        return calculate_batch_size(
+            per_item_vram_gb=self.per_item_vram_gb,
+            overhead_gb=self.model_vram_gb + 1.0,  # Add 1GB system overhead
+            min_batch=1,
+            max_batch=max_batch,
+        )
+

 class BaseSplitterPlugin(BasePlugin):
     """
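A sketch of how a GPU-backed plugin would opt in to this machinery; the subclass and both VRAM figures are invented for illustration, and `name`/`capabilities` are assumed to be the usual `BasePlugin` overrides:

```python
from statement_extractor.plugins.base import BasePlugin, PluginCapability

class MyGpuPlugin(BasePlugin):  # hypothetical plugin, not part of the package
    @property
    def name(self) -> str:
        return "my_gpu_plugin"

    @property
    def capabilities(self) -> PluginCapability:
        # Opt in to batching so get_optimal_batch_size() computes a real size
        return PluginCapability.BATCH_PROCESSING

    @property
    def model_vram_gb(self) -> float:
        return 2.0   # ~2 GB of model weights, reserved as overhead

    @property
    def per_item_vram_gb(self) -> float:
        return 0.25  # ~250 MB per batched item

# get_optimal_batch_size() would then evaluate roughly
# (available_vram - (2.0 + 1.0)) / 0.25, clamped to [1, max_batch].
```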
@@ -101,6 +197,27 @@ class BaseSplitterPlugin(BasePlugin):
         """
         ...

+    def split_batch(
+        self,
+        texts: list[str],
+        context: "PipelineContext",
+    ) -> list[list["RawTriple"]]:
+        """
+        Split multiple texts into atomic triples in a single batch.
+
+        Default implementation calls split() for each text sequentially.
+        Plugins with BATCH_PROCESSING capability should override this
+        for efficient GPU batching using get_optimal_batch_size().
+
+        Args:
+            texts: List of input texts to split
+            context: Pipeline context for accessing metadata and config
+
+        Returns:
+            List of RawTriple lists, one per input text
+        """
+        return [self.split(text, context) for text in texts]
+

 class BaseExtractorPlugin(BasePlugin):
     """
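The intended override pattern for `split_batch` chunks the inputs by the VRAM-derived batch size; a rough sketch (the `_split_chunk` helper is hypothetical, standing in for one batched forward pass of whatever model the splitter wraps):

```python
class ChunkedSplitter(BaseSplitterPlugin):  # hypothetical
    def split_batch(
        self,
        texts: list[str],
        context: "PipelineContext",
    ) -> list[list["RawTriple"]]:
        results: list[list["RawTriple"]] = []
        size = self.get_optimal_batch_size()
        for i in range(0, len(texts), size):
            # One GPU forward pass per chunk instead of one per text
            results.extend(self._split_chunk(texts[i : i + size], context))
        return results
```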
@@ -132,10 +249,14 @@ class BaseExtractorPlugin(BasePlugin):

 class BaseQualifierPlugin(BasePlugin):
     """
-    Stage 3 plugin:
+    Stage 3 plugin: Qualify entities with identifiers and canonical forms.
+
+    Processes entities of specific types and adds:
+    - Semantic qualifiers (role, org for PERSON entities)
+    - External identifiers (LEI, company number, SEC CIK)
+    - Canonical name and FQN (fully qualified name)

-
-    (role, org) or external identifiers (LEI, company number).
+    Returns a CanonicalEntity ready for use in labeled statements.
     """

     @property
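Concretely, the new contract folds lookup, canonical naming, and FQN generation into a single call. A heavily hedged sketch of a conforming qualifier; the registry lookup, the `entity.text` access, and the `CanonicalEntity` field names are assumptions for illustration, not the package's actual signatures (see statement_extractor/models/canonical.py for the real model):

```python
class ToyCompanyQualifier(BaseQualifierPlugin):  # hypothetical
    def qualify(
        self,
        entity: "ExtractedEntity",
        context: "PipelineContext",
    ) -> "CanonicalEntity | None":
        record = my_registry_lookup(entity.text)  # hypothetical identifier lookup
        if record is None:
            return None  # entity not found -> no canonical form
        # Field names below are illustrative only
        return CanonicalEntity(
            name=record.canonical_name,
            identifiers={"lei": record.lei},
        )
```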
@@ -167,67 +288,26 @@ class BaseQualifierPlugin(BasePlugin):
         self,
         entity: "ExtractedEntity",
         context: "PipelineContext",
-    ) -> "
+    ) -> "CanonicalEntity | None":
         """
-
+        Qualify an entity and return its canonical form.
+
+        This method should:
+        1. Look up identifiers (LEI, CIK, company number, etc.)
+        2. Find the canonical name if available
+        3. Generate the FQN (fully qualified name)
+        4. Return a CanonicalEntity with all information

         Args:
             entity: The entity to qualify
             context: Pipeline context (for accessing source text, other entities)

         Returns:
-
+            CanonicalEntity with qualifiers and FQN, or None if entity not found
         """
         ...


-class BaseCanonicalizerPlugin(BasePlugin):
-    """
-    Stage 4 plugin: Resolve entities to canonical forms.
-
-    Takes qualified entities and finds their canonical representations
-    using various matching strategies (identifier, name, fuzzy, LLM).
-    """
-
-    @property
-    @abstractmethod
-    def supported_entity_types(self) -> set["EntityType"]:
-        """Entity types this plugin can canonicalize."""
-        ...
-
-    @abstractmethod
-    def find_canonical(
-        self,
-        entity: "QualifiedEntity",
-        context: "PipelineContext",
-    ) -> "CanonicalMatch | None":
-        """
-        Find canonical form for an entity.
-
-        Args:
-            entity: Qualified entity to canonicalize
-            context: Pipeline context
-
-        Returns:
-            CanonicalMatch if found, None otherwise
-        """
-        ...
-
-    def format_fqn(
-        self,
-        entity: "QualifiedEntity",
-        match: "CanonicalMatch | None",
-    ) -> str:
-        """
-        Format the fully qualified name for display.
-
-        Can be overridden by subclasses for custom formatting.
-        Default implementation uses CanonicalEntity._generate_fqn.
-        """
-        from ..models import CanonicalEntity
-        return CanonicalEntity._generate_fqn(entity, match)
-
-
 class ClassificationSchema:
     """
     Schema for simple multi-choice classification (2-20 choices).
@@ -309,7 +389,7 @@ class TaxonomySchema:

 class BaseLabelerPlugin(BasePlugin):
     """
-    Stage 5 plugin: Apply labels to statements.
+    Stage 4 plugin: Apply labels to statements.

     Adds classification labels (sentiment, relation type, confidence)
     to the final labeled statements.
@@ -380,7 +460,7 @@ class BaseLabelerPlugin(BasePlugin):

 class BaseTaxonomyPlugin(BasePlugin):
     """
-    Stage 6 plugin: Classify statements against a taxonomy.
+    Stage 5 plugin: Classify statements against a taxonomy.

     Taxonomy classification is separate from labeling because:
     - It operates on large taxonomies (100s-1000s of labels)
@@ -444,3 +524,193 @@ class BaseTaxonomyPlugin(BasePlugin):
             List of TaxonomyResult objects (empty if none above threshold)
         """
         ...
+
+    def classify_batch(
+        self,
+        items: list[tuple["PipelineStatement", "CanonicalEntity", "CanonicalEntity"]],
+        context: "PipelineContext",
+    ) -> list[list["TaxonomyResult"]]:
+        """
+        Classify multiple statements against the taxonomy in a single batch.
+
+        Default implementation calls classify() for each statement sequentially.
+        Plugins with BATCH_PROCESSING capability should override this
+        for efficient GPU batching using get_optimal_batch_size().
+
+        Args:
+            items: List of (statement, subject_canonical, object_canonical) tuples
+            context: Pipeline context
+
+        Returns:
+            List of TaxonomyResult lists, one per input statement
+        """
+        return [
+            self.classify(stmt, subj, obj, context)
+            for stmt, subj, obj in items
+        ]
+
+
+# =============================================================================
+# Content Acquisition Plugins (for URL processing)
+# =============================================================================
+
+
+class ContentType(str, Enum):
+    """Content type detected from URL or HTTP response."""
+    HTML = "html"
+    PDF = "pdf"
+    BINARY = "binary"
+    UNKNOWN = "unknown"
+
+
+class ScraperResult(BaseModel):
+    """Result from a scraper plugin."""
+    url: str = Field(description="Original URL requested")
+    final_url: str = Field(description="Final URL after redirects")
+    content: bytes = Field(description="Raw content bytes")
+    content_type: ContentType = Field(description="Detected content type")
+    headers: dict[str, str] = Field(default_factory=dict, description="Response headers")
+    error: Optional[str] = Field(default=None, description="Error message if fetch failed")
+
+    model_config = {"arbitrary_types_allowed": True}
+
+    @property
+    def ok(self) -> bool:
+        """Check if the fetch was successful."""
+        return self.error is None and len(self.content) > 0
+
+
+class PDFParseResult(BaseModel):
+    """Result from a PDF parser plugin."""
+    pages: list[str] = Field(description="Extracted text for each page")
+    page_count: int = Field(description="Total number of pages in PDF")
+    metadata: dict[str, Any] = Field(default_factory=dict, description="PDF metadata (title, author, etc)")
+    error: Optional[str] = Field(default=None, description="Error message if parsing failed")
+
+    @property
+    def ok(self) -> bool:
+        """Check if parsing was successful."""
+        return self.error is None
+
+    @property
+    def full_text(self) -> str:
+        """Get concatenated text from all pages."""
+        return "\n\n".join(self.pages)
+
+
+class BaseScraperPlugin(BasePlugin):
+    """
+    Plugin for fetching content from URLs.
+
+    Scrapers handle HTTP requests, redirects, retries, and content type detection.
+    They return raw bytes that can be processed by appropriate parsers (HTML, PDF, etc).
+
+    Example implementation:
+        @PluginRegistry.scraper
+        class MyScraperPlugin(BaseScraperPlugin):
+            @property
+            def name(self) -> str:
+                return "my_scraper"
+
+            async def fetch(self, url: str, timeout: float = 30.0) -> ScraperResult:
+                # Implement fetching logic
+                ...
+    """
+
+    @property
+    def capabilities(self) -> PluginCapability:
+        """Scrapers support async processing by default."""
+        return PluginCapability.ASYNC_PROCESSING | PluginCapability.EXTERNAL_API
+
+    @abstractmethod
+    async def fetch(self, url: str, timeout: float = 30.0) -> ScraperResult:
+        """
+        Fetch content from a URL.
+
+        Args:
+            url: The URL to fetch
+            timeout: Request timeout in seconds
+
+        Returns:
+            ScraperResult with content, content type, and any errors
+        """
+        ...
+
+    async def head(self, url: str, timeout: float = 10.0) -> ScraperResult:
+        """
+        Check content type without downloading the full body.
+
+        Default implementation does a full fetch. Override for efficiency.
+
+        Args:
+            url: The URL to check
+            timeout: Request timeout in seconds
+
+        Returns:
+            ScraperResult with content_type populated (content may be empty)
+        """
+        return await self.fetch(url, timeout)
+
+    def is_supported_url(self, url: str) -> bool:
+        """
+        Check if this scraper can handle the URL.
+
+        Override to restrict to specific URL patterns or domains.
+
+        Args:
+            url: The URL to check
+
+        Returns:
+            True if this scraper can handle the URL
+        """
+        return True
+
+
+class BasePDFParserPlugin(BasePlugin):
+    """
+    Plugin for extracting text from PDF files.
+
+    PDF parsers take raw PDF bytes and extract text content page by page.
+    They may support OCR for image-heavy PDFs.
+
+    Example implementation:
+        @PluginRegistry.pdf_parser
+        class MyPDFParserPlugin(BasePDFParserPlugin):
+            @property
+            def name(self) -> str:
+                return "my_pdf_parser"
+
+            def parse(self, pdf_bytes: bytes, ...) -> PDFParseResult:
+                # Implement parsing logic
+                ...
+    """
+
+    @abstractmethod
+    def parse(
+        self,
+        pdf_bytes: bytes,
+        max_pages: int = 500,
+        use_ocr: bool = False,
+    ) -> PDFParseResult:
+        """
+        Extract text from PDF bytes.
+
+        Args:
+            pdf_bytes: Raw PDF file content
+            max_pages: Maximum number of pages to process
+            use_ocr: Force OCR even for text-extractable PDFs
+
+        Returns:
+            PDFParseResult with extracted text for each page
+        """
+        ...
+
+    @property
+    def supports_ocr(self) -> bool:
+        """
+        Whether this parser supports OCR for image-heavy PDFs.
+
+        Returns:
+            True if OCR is available
+        """
+        return False
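The base class leaves transport entirely to implementations (the package ships its own in plugins/scrapers/http.py). As a rough illustration of the contract only, here is a minimal aiohttp-based scraper, independent of the bundled one; the content-type sniffing is deliberately naive:

```python
import aiohttp

class ToyScraper(BaseScraperPlugin):  # hypothetical, not the package's http scraper
    @property
    def name(self) -> str:
        return "toy_scraper"

    async def fetch(self, url: str, timeout: float = 30.0) -> ScraperResult:
        try:
            async with aiohttp.ClientSession(
                timeout=aiohttp.ClientTimeout(total=timeout)
            ) as session:
                async with session.get(url) as resp:
                    body = await resp.read()
                    ctype = resp.headers.get("Content-Type", "").lower()
                    if "pdf" in ctype:
                        detected = ContentType.PDF
                    elif "html" in ctype:
                        detected = ContentType.HTML
                    else:
                        detected = ContentType.UNKNOWN
                    return ScraperResult(
                        url=url,
                        final_url=str(resp.url),  # reflects any redirects
                        content=body,
                        content_type=detected,
                        headers=dict(resp.headers),
                    )
        except Exception as exc:
            # Per the base contract, errors are reported in-band, not raised
            return ScraperResult(
                url=url,
                final_url=url,
                content=b"",
                content_type=ContentType.UNKNOWN,
                error=str(exc),
            )
```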
statement_extractor/plugins/extractors/gliner2.py

@@ -180,6 +180,16 @@ class GLiNER2Extractor(BaseExtractorPlugin):
     def description(self) -> str:
         return "GLiNER2 model for entity and relation extraction"

+    @property
+    def model_vram_gb(self) -> float:
+        """GLiNER2 model weights ~0.8GB."""
+        return 0.8
+
+    @property
+    def per_item_vram_gb(self) -> float:
+        """Each triple during batch processing ~0.1GB."""
+        return 0.1
+
     def _get_model(self):
         """Lazy-load the GLiNER2 model."""
         if self._model is None:
statement_extractor/plugins/labelers/taxonomy.py

@@ -8,9 +8,19 @@ there are too many possible values for simple multi-choice classification.
 import json
 import logging
 from pathlib import Path
-from typing import Optional
+from typing import Optional, TypedDict

 from ..base import BaseLabelerPlugin, TaxonomySchema, PluginCapability
+
+
+class TaxonomyEntry(TypedDict):
+    """Structure for each taxonomy label entry."""
+    description: str
+    id: int
+    mnli_label: str
+    embedding_label: str
+
+
 from ...pipeline.context import PipelineContext
 from ...models import (
     PipelineStatement,
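The TypedDict pins down the shape that statement_taxonomy.json entries are expected to take: a two-level mapping of category → label → entry. A minimal sketch (category, label, and every value below are invented for illustration):

```python
# Shape implied by TaxonomyEntry; all names and values here are made up.
taxonomy: dict[str, dict[str, TaxonomyEntry]] = {
    "environment": {
        "emissions_reduction": {
            "description": "Commitments to reduce greenhouse gas emissions",
            "id": 1204,
            "mnli_label": "This statement is about reducing emissions",
            "embedding_label": "emissions reduction commitment",
        },
    },
}
```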
@@ -214,7 +224,7 @@ class TaxonomyLabeler(BaseLabelerPlugin):
         self._top_k_categories = top_k_categories
         self._min_confidence = min_confidence

-        self._taxonomy: Optional[dict[str, dict[str,
+        self._taxonomy: Optional[dict[str, dict[str, TaxonomyEntry]]] = None
         self._classifier: Optional[TaxonomyClassifier] = None

     @property
@@ -250,7 +260,7 @@ class TaxonomyLabeler(BaseLabelerPlugin):
             scope="statement",
         )

-    def _load_taxonomy(self) -> dict[str, dict[str,
+    def _load_taxonomy(self) -> dict[str, dict[str, TaxonomyEntry]]:
         """Load taxonomy from JSON file."""
         if self._taxonomy is not None:
             return self._taxonomy
@@ -358,12 +368,15 @@ class TaxonomyLabeler(BaseLabelerPlugin):
         taxonomy = self._load_taxonomy()

         if category and category in taxonomy:
-
+            entry = taxonomy[category].get(label)
+            if entry:
+                return entry.get("id")

         # Search all categories for flat classification
         for cat_labels in taxonomy.values():
             if label in cat_labels:
-
+                entry = cat_labels[label]
+                return entry.get("id")

         return None

statement_extractor/plugins/labelers/taxonomy_embedding.py

@@ -11,10 +11,19 @@ import json
 import logging
 import time
 from pathlib import Path
-from typing import Optional
+from typing import Optional, TypedDict

 import numpy as np

+
+class TaxonomyEntry(TypedDict):
+    """Structure for each taxonomy label entry."""
+    description: str
+    id: int
+    mnli_label: str
+    embedding_label: str
+
+
 from ..base import BaseLabelerPlugin, TaxonomySchema, PluginCapability
 from ...pipeline.context import PipelineContext
 from ...models import (
@@ -106,14 +115,14 @@ class EmbeddingClassifier:

     def precompute_label_embeddings(
         self,
-        taxonomy: dict[str, dict[str,
+        taxonomy: dict[str, dict[str, TaxonomyEntry]],
         categories: Optional[list[str]] = None,
     ) -> None:
         """
         Pre-compute embeddings for all label names.

         Args:
-            taxonomy: Taxonomy dict {category: {label:
+            taxonomy: Taxonomy dict {category: {label: TaxonomyEntry, ...}, ...}
             categories: Categories to include (default: all)
         """
         self._load_model()
@@ -314,7 +323,7 @@ class EmbeddingTaxonomyLabeler(BaseLabelerPlugin):
         self._top_k_categories = top_k_categories
         self._min_confidence = min_confidence

-        self._taxonomy: Optional[dict[str, dict[str,
+        self._taxonomy: Optional[dict[str, dict[str, TaxonomyEntry]]] = None
         self._classifier: Optional[EmbeddingClassifier] = None
         self._embeddings_computed = False

@@ -350,7 +359,7 @@ class EmbeddingTaxonomyLabeler(BaseLabelerPlugin):
             scope="statement",
         )

-    def _load_taxonomy(self) -> dict[str, dict[str,
+    def _load_taxonomy(self) -> dict[str, dict[str, TaxonomyEntry]]:
         """Load taxonomy from JSON file."""
         if self._taxonomy is not None:
             return self._taxonomy
@@ -456,7 +465,9 @@ class EmbeddingTaxonomyLabeler(BaseLabelerPlugin):
         taxonomy = self._load_taxonomy()

         if category in taxonomy:
-
+            entry = taxonomy[category].get(label)
+            if entry:
+                return entry.get("id")

         return None
