corp-extractor 0.4.0-py3-none-any.whl → 0.9.0-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
- {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +348 -64
- corp_extractor-0.9.0.dist-info/RECORD +76 -0
- statement_extractor/__init__.py +10 -1
- statement_extractor/cli.py +1663 -17
- statement_extractor/data/default_predicates.json +368 -0
- statement_extractor/data/statement_taxonomy.json +6972 -0
- statement_extractor/database/__init__.py +52 -0
- statement_extractor/database/embeddings.py +186 -0
- statement_extractor/database/hub.py +520 -0
- statement_extractor/database/importers/__init__.py +24 -0
- statement_extractor/database/importers/companies_house.py +545 -0
- statement_extractor/database/importers/gleif.py +538 -0
- statement_extractor/database/importers/sec_edgar.py +375 -0
- statement_extractor/database/importers/wikidata.py +1012 -0
- statement_extractor/database/importers/wikidata_people.py +632 -0
- statement_extractor/database/models.py +230 -0
- statement_extractor/database/resolver.py +245 -0
- statement_extractor/database/store.py +1609 -0
- statement_extractor/document/__init__.py +62 -0
- statement_extractor/document/chunker.py +410 -0
- statement_extractor/document/context.py +171 -0
- statement_extractor/document/deduplicator.py +173 -0
- statement_extractor/document/html_extractor.py +246 -0
- statement_extractor/document/loader.py +303 -0
- statement_extractor/document/pipeline.py +388 -0
- statement_extractor/document/summarizer.py +195 -0
- statement_extractor/extractor.py +1 -23
- statement_extractor/gliner_extraction.py +4 -74
- statement_extractor/llm.py +255 -0
- statement_extractor/models/__init__.py +89 -0
- statement_extractor/models/canonical.py +182 -0
- statement_extractor/models/document.py +308 -0
- statement_extractor/models/entity.py +102 -0
- statement_extractor/models/labels.py +220 -0
- statement_extractor/models/qualifiers.py +139 -0
- statement_extractor/models/statement.py +101 -0
- statement_extractor/models.py +4 -1
- statement_extractor/pipeline/__init__.py +39 -0
- statement_extractor/pipeline/config.py +129 -0
- statement_extractor/pipeline/context.py +177 -0
- statement_extractor/pipeline/orchestrator.py +416 -0
- statement_extractor/pipeline/registry.py +303 -0
- statement_extractor/plugins/__init__.py +55 -0
- statement_extractor/plugins/base.py +716 -0
- statement_extractor/plugins/extractors/__init__.py +13 -0
- statement_extractor/plugins/extractors/base.py +9 -0
- statement_extractor/plugins/extractors/gliner2.py +546 -0
- statement_extractor/plugins/labelers/__init__.py +29 -0
- statement_extractor/plugins/labelers/base.py +9 -0
- statement_extractor/plugins/labelers/confidence.py +138 -0
- statement_extractor/plugins/labelers/relation_type.py +87 -0
- statement_extractor/plugins/labelers/sentiment.py +159 -0
- statement_extractor/plugins/labelers/taxonomy.py +386 -0
- statement_extractor/plugins/labelers/taxonomy_embedding.py +477 -0
- statement_extractor/plugins/pdf/__init__.py +10 -0
- statement_extractor/plugins/pdf/pypdf.py +291 -0
- statement_extractor/plugins/qualifiers/__init__.py +30 -0
- statement_extractor/plugins/qualifiers/base.py +9 -0
- statement_extractor/plugins/qualifiers/companies_house.py +185 -0
- statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
- statement_extractor/plugins/qualifiers/gleif.py +197 -0
- statement_extractor/plugins/qualifiers/person.py +785 -0
- statement_extractor/plugins/qualifiers/sec_edgar.py +209 -0
- statement_extractor/plugins/scrapers/__init__.py +10 -0
- statement_extractor/plugins/scrapers/http.py +236 -0
- statement_extractor/plugins/splitters/__init__.py +13 -0
- statement_extractor/plugins/splitters/base.py +9 -0
- statement_extractor/plugins/splitters/t5_gemma.py +293 -0
- statement_extractor/plugins/taxonomy/__init__.py +13 -0
- statement_extractor/plugins/taxonomy/embedding.py +484 -0
- statement_extractor/plugins/taxonomy/mnli.py +291 -0
- statement_extractor/scoring.py +8 -8
- corp_extractor-0.4.0.dist-info/RECORD +0 -12
- {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
- {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
statement_extractor/plugins/base.py (new file, +716 lines):

@@ -0,0 +1,716 @@
"""
Base plugin classes for the extraction pipeline.

Defines the abstract interfaces for each pipeline stage:
- BaseSplitterPlugin: Stage 1 - Text → RawTriple
- BaseExtractorPlugin: Stage 2 - RawTriple → PipelineStatement
- BaseQualifierPlugin: Stage 3 - Entity → CanonicalEntity
- BaseLabelerPlugin: Stage 4 - Statement → StatementLabel
- BaseTaxonomyPlugin: Stage 5 - Statement → TaxonomyResult

Content acquisition plugins (for URL processing):
- BaseScraperPlugin: Fetch content from URLs
- BasePDFParserPlugin: Extract text from PDFs
"""

from abc import ABC, abstractmethod
from enum import Enum, Flag, auto
from typing import TYPE_CHECKING, Any, Optional

from pydantic import BaseModel, Field

if TYPE_CHECKING:
    from ..pipeline.context import PipelineContext
    from ..models import (
        RawTriple,
        PipelineStatement,
        ExtractedEntity,
        CanonicalEntity,
        StatementLabel,
        TaxonomyResult,
        EntityType,
    )


class PluginCapability(Flag):
    """Flags indicating plugin capabilities."""
    NONE = 0
    BATCH_PROCESSING = auto()  # Can process multiple items at once
    ASYNC_PROCESSING = auto()  # Supports async execution
    EXTERNAL_API = auto()  # Uses external API (may have rate limits)
    LLM_REQUIRED = auto()  # Requires an LLM model
    CACHING = auto()  # Supports result caching


def get_available_vram_gb() -> float:
    """
    Get available GPU VRAM in GB.

    Returns 0.0 if no GPU is available or VRAM cannot be determined.
    """
    try:
        import torch
        if torch.cuda.is_available():
            device = torch.cuda.current_device()
            total = torch.cuda.get_device_properties(device).total_memory
            allocated = torch.cuda.memory_allocated(device)
            available = (total - allocated) / (1024 ** 3)  # Convert to GB
            return available
        elif torch.backends.mps.is_available():
            # MPS doesn't expose VRAM info; estimate based on typical Apple Silicon
            # Return a conservative estimate
            return 8.0
    except ImportError:
        pass
    return 0.0


def calculate_batch_size(
    per_item_vram_gb: float,
    overhead_gb: float = 2.0,
    min_batch: int = 1,
    max_batch: int = 32,
) -> int:
    """
    Calculate optimal batch size based on available VRAM.

    Args:
        per_item_vram_gb: VRAM required per item in GB
        overhead_gb: Reserved VRAM for model weights and system overhead
        min_batch: Minimum batch size
        max_batch: Maximum batch size cap

    Returns:
        Optimal batch size for the current GPU
    """
    available = get_available_vram_gb()
    if available <= 0 or per_item_vram_gb <= 0:
        return min_batch

    usable = max(0, available - overhead_gb)
    batch_size = int(usable / per_item_vram_gb)
    return max(min_batch, min(batch_size, max_batch))
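As a quick sanity check of the sizing formula above, here is the arithmetic worked through once (a minimal sketch with hypothetical numbers, not part of the package):

```python
# Suppose get_available_vram_gb() reported 24.0 GB free.
available = 24.0
overhead = 3.0 + 1.0                      # model weights + 1 GB system overhead
per_item = 0.5                            # VRAM needed per batch item
usable = max(0.0, available - overhead)   # 20.0 GB left for batching
batch = int(usable / per_item)            # 40 items would fit
batch = max(1, min(batch, 32))            # default max_batch cap clamps it to 32
```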
class BasePlugin(ABC):
    """
    Base class for all pipeline plugins.

    All plugins must implement the name property and can optionally
    override priority and capabilities.
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """Unique name for this plugin (used for registration and CLI)."""
        ...

    @property
    def priority(self) -> int:
        """
        Plugin priority (lower = higher priority, runs first).

        Default is 100. Use lower values (e.g., 10, 20) for critical plugins
        that should run before others.
        """
        return 100

    @property
    def capabilities(self) -> PluginCapability:
        """Plugin capabilities (flags)."""
        return PluginCapability.NONE

    @property
    def description(self) -> str:
        """Human-readable description of this plugin."""
        return ""

    @property
    def model_vram_gb(self) -> float:
        """
        Estimated VRAM required for model weights in GB.

        Override this if the plugin loads a GPU model. This is used to
        reserve memory overhead when calculating batch sizes.

        Default is 0.0 (no GPU model).
        """
        return 0.0

    @property
    def per_item_vram_gb(self) -> float:
        """
        Estimated VRAM required per item during batch processing in GB.

        Override this for plugins with BATCH_PROCESSING capability.
        Used to calculate optimal batch size: batch = (available - overhead) / per_item

        Default is 0.1 GB (100MB) as a conservative estimate.
        """
        return 0.1

    def get_optimal_batch_size(self, max_batch: int = 32) -> int:
        """
        Calculate optimal batch size based on available VRAM and plugin requirements.

        Args:
            max_batch: Maximum batch size cap

        Returns:
            Optimal batch size for current GPU state
        """
        if not (PluginCapability.BATCH_PROCESSING in self.capabilities):
            return 1

        return calculate_batch_size(
            per_item_vram_gb=self.per_item_vram_gb,
            overhead_gb=self.model_vram_gb + 1.0,  # Add 1GB system overhead
            min_batch=1,
            max_batch=max_batch,
        )
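To make the BasePlugin contract concrete, here is a minimal hypothetical subclass (not shipped in the wheel) that opts into batch processing and declares its VRAM footprint, so that get_optimal_batch_size() can size batches from the properties above:

```python
from statement_extractor.plugins.base import BasePlugin, PluginCapability

class DemoGpuPlugin(BasePlugin):
    """Hypothetical plugin, for illustration only."""

    @property
    def name(self) -> str:
        return "demo_gpu"

    @property
    def capabilities(self) -> PluginCapability:
        return PluginCapability.BATCH_PROCESSING | PluginCapability.CACHING

    @property
    def model_vram_gb(self) -> float:
        return 2.0   # reserved (plus 1 GB system overhead) during batch sizing

    @property
    def per_item_vram_gb(self) -> float:
        return 0.25  # working memory per batch item

plugin = DemoGpuPlugin()
print(plugin.get_optimal_batch_size(max_batch=16))  # 1 on a machine with no GPU
```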
class BaseSplitterPlugin(BasePlugin):
    """
    Stage 1 plugin: Split text into atomic triples.

    Takes raw text and produces RawTriple objects containing
    subject/predicate/object text and source sentence.
    """

    @abstractmethod
    def split(
        self,
        text: str,
        context: "PipelineContext",
    ) -> list["RawTriple"]:
        """
        Split text into atomic triples.

        Args:
            text: Input text to split
            context: Pipeline context for accessing metadata and config

        Returns:
            List of RawTriple objects
        """
        ...

    def split_batch(
        self,
        texts: list[str],
        context: "PipelineContext",
    ) -> list[list["RawTriple"]]:
        """
        Split multiple texts into atomic triples in a single batch.

        Default implementation calls split() for each text sequentially.
        Plugins with BATCH_PROCESSING capability should override this
        for efficient GPU batching using get_optimal_batch_size().

        Args:
            texts: List of input texts to split
            context: Pipeline context for accessing metadata and config

        Returns:
            List of RawTriple lists, one per input text
        """
        return [self.split(text, context) for text in texts]
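A whitespace-level splitter sketch against this interface (the wheel's real splitter is the T5-Gemma plugin in plugins/splitters/t5_gemma.py; RawTriple's field names below are assumptions):

```python
from statement_extractor.models import RawTriple  # runtime import assumed

class NaiveSplitterPlugin(BaseSplitterPlugin):
    """Hypothetical splitter: treats 'X verb rest...' sentences as triples."""

    @property
    def name(self) -> str:
        return "naive_splitter"

    def split(self, text: str, context: "PipelineContext") -> list["RawTriple"]:
        triples = []
        for sentence in text.split("."):
            words = sentence.split()
            if len(words) >= 3:
                # Field names are assumed for illustration.
                triples.append(RawTriple(
                    subject=words[0],
                    predicate=words[1],
                    object=" ".join(words[2:]),
                    source_sentence=sentence.strip(),
                ))
        return triples
```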
class BaseExtractorPlugin(BasePlugin):
    """
    Stage 2 plugin: Refine triples into statements with typed entities.

    Takes RawTriple objects and produces PipelineStatement objects
    with ExtractedEntity subjects/objects that have types, spans,
    and confidence scores.
    """

    @abstractmethod
    def extract(
        self,
        raw_triples: list["RawTriple"],
        context: "PipelineContext",
    ) -> list["PipelineStatement"]:
        """
        Extract statements from raw triples.

        Args:
            raw_triples: Raw triples from Stage 1
            context: Pipeline context

        Returns:
            List of PipelineStatement objects with typed entities
        """
        ...


class BaseQualifierPlugin(BasePlugin):
    """
    Stage 3 plugin: Qualify entities with identifiers and canonical forms.

    Processes entities of specific types and adds:
    - Semantic qualifiers (role, org for PERSON entities)
    - External identifiers (LEI, company number, SEC CIK)
    - Canonical name and FQN (fully qualified name)

    Returns a CanonicalEntity ready for use in labeled statements.
    """

    @property
    @abstractmethod
    def supported_entity_types(self) -> set["EntityType"]:
        """Entity types this plugin can qualify (e.g., {ORG, PERSON})."""
        ...

    @property
    def supported_identifier_types(self) -> list[str]:
        """
        Identifier types this plugin can use for lookup.

        For example, GLEIFQualifier can lookup by 'lei'.
        """
        return []

    @property
    def provided_identifier_types(self) -> list[str]:
        """
        Identifier types this plugin can provide.

        For example, GLEIFQualifier provides 'lei', 'jurisdiction'.
        """
        return []

    @abstractmethod
    def qualify(
        self,
        entity: "ExtractedEntity",
        context: "PipelineContext",
    ) -> "CanonicalEntity | None":
        """
        Qualify an entity and return its canonical form.

        This method should:
        1. Look up identifiers (LEI, CIK, company number, etc.)
        2. Find the canonical name if available
        3. Generate the FQN (fully qualified name)
        4. Return a CanonicalEntity with all information

        Args:
            entity: The entity to qualify
            context: Pipeline context (for accessing source text, other entities)

        Returns:
            CanonicalEntity with qualifiers and FQN, or None if entity not found
        """
        ...
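A toy qualifier walking the four documented steps, with a hard-coded lookup table standing in for a real registry such as GLEIF (the ExtractedEntity and CanonicalEntity field names, and the ORG enum member, are assumptions):

```python
from statement_extractor.models import CanonicalEntity, EntityType  # assumed

class ToyOrgQualifierPlugin(BaseQualifierPlugin):
    """Hypothetical qualifier, for illustration only."""

    KNOWN = {"acme corp": ("Acme Corporation", "LEI0000000000000ACME")}

    @property
    def name(self) -> str:
        return "toy_org_qualifier"

    @property
    def supported_entity_types(self) -> set["EntityType"]:
        return {EntityType.ORG}  # assumes an ORG member exists

    @property
    def provided_identifier_types(self) -> list[str]:
        return ["lei"]

    def qualify(self, entity, context):
        hit = self.KNOWN.get(entity.text.lower())  # 1. look up identifiers
        if hit is None:
            return None                            # entity not found
        canonical_name, lei = hit                  # 2. canonical name
        fqn = f"org:{canonical_name}"              # 3. generate the FQN
        return CanonicalEntity(                    # 4. return with all information
            name=canonical_name,
            fqn=fqn,
            identifiers={"lei": lei},
        )
```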
class ClassificationSchema:
    """
    Schema for simple multi-choice classification (2-20 choices).

    Handled by GLiNER2 `.classification()` in a single pass.

    Examples:
    - sentiment: ["positive", "negative", "neutral"]
    - certainty: ["certain", "uncertain", "speculative"]
    - temporality: ["past", "present", "future"]
    """

    def __init__(
        self,
        label_type: str,
        choices: list[str],
        description: str = "",
        scope: str = "statement",  # "statement", "subject", "object", "predicate"
    ):
        self.label_type = label_type
        self.choices = choices
        self.description = description
        self.scope = scope

    def __repr__(self) -> str:
        return f"ClassificationSchema({self.label_type!r}, choices={self.choices!r})"
class TaxonomySchema:
    """
    Schema for large taxonomy labeling (100s of values).

    Too many choices for GLiNER2 classification. Requires MNLI or similar:
    - MNLI zero-shot with label descriptions
    - Embedding-based nearest neighbor search
    - Hierarchical classification (category → subcategory)

    Examples:
    - industry_code: NAICS/SIC codes (1000+ values)
    - relation_type: detailed relation ontology (100+ types)
    - job_title: standardized job taxonomy
    """

    def __init__(
        self,
        label_type: str,
        values: list[str] | dict[str, list[str]],  # flat list or hierarchical dict
        description: str = "",
        scope: str = "statement",  # "statement", "subject", "object", "predicate"
        label_descriptions: dict[str, str] | None = None,  # descriptions for MNLI
    ):
        self.label_type = label_type
        self.values = values
        self.description = description
        self.scope = scope
        self.label_descriptions = label_descriptions  # e.g., {"NAICS:5112": "Software Publishers"}

    @property
    def is_hierarchical(self) -> bool:
        """Check if taxonomy is hierarchical (dict) vs flat (list)."""
        return isinstance(self.values, dict)

    @property
    def all_values(self) -> list[str]:
        """Get all taxonomy values (flattened if hierarchical)."""
        if isinstance(self.values, list):
            return self.values
        # Flatten hierarchical dict
        result = []
        for category, subcategories in self.values.items():
            result.append(category)
            result.extend(subcategories)
        return result

    def __repr__(self) -> str:
        count = len(self.all_values)
        return f"TaxonomySchema({self.label_type!r}, {count} values)"
class BaseLabelerPlugin(BasePlugin):
    """
    Stage 4 plugin: Apply labels to statements.

    Adds classification labels (sentiment, relation type, confidence)
    to the final labeled statements.

    Labelers can provide a classification_schema that extractors will use
    to run classification in a single model pass. The results are stored
    in the pipeline context for the labeler to retrieve.
    """

    @property
    @abstractmethod
    def label_type(self) -> str:
        """
        The type of label this plugin produces.

        Examples: 'sentiment', 'relation_type', 'confidence'
        """
        ...

    @property
    def classification_schema(self) -> ClassificationSchema | None:
        """
        Simple multi-choice classification schema (2-20 choices).

        If provided, GLiNER2 extractor will run `.classification()` and store
        results in context for this labeler to retrieve.

        Returns:
            ClassificationSchema or None
        """
        return None

    @property
    def taxonomy_schema(self) -> TaxonomySchema | None:
        """
        Large taxonomy schema (100s of values).

        If provided, requires MNLI or embedding-based classification.
        Results stored in context for this labeler to retrieve.

        Returns:
            TaxonomySchema or None
        """
        return None

    @abstractmethod
    def label(
        self,
        statement: "PipelineStatement",
        subject_canonical: "CanonicalEntity",
        object_canonical: "CanonicalEntity",
        context: "PipelineContext",
    ) -> "StatementLabel | None":
        """
        Apply a label to a statement.

        Args:
            statement: The statement to label
            subject_canonical: Canonicalized subject entity
            object_canonical: Canonicalized object entity
            context: Pipeline context (check context.classification_results for pre-computed labels)

        Returns:
            StatementLabel if applicable, None otherwise
        """
        ...
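A labeler sketch showing the declared-schema handshake: the schema below asks the extractor to classify sentiment during its single pass, and label() then reads the pre-computed result back. The exact shape of context.classification_results and of StatementLabel's constructor are assumptions:

```python
from statement_extractor.models import StatementLabel  # assumed

class DemoSentimentLabeler(BaseLabelerPlugin):
    """Hypothetical labeler; the shipped sentiment labeler may differ."""

    @property
    def name(self) -> str:
        return "demo_sentiment"

    @property
    def label_type(self) -> str:
        return "sentiment"

    @property
    def classification_schema(self) -> ClassificationSchema | None:
        return ClassificationSchema("sentiment", ["positive", "negative", "neutral"])

    def label(self, statement, subject_canonical, object_canonical, context):
        # Result shape is assumed for illustration.
        value = context.classification_results.get("sentiment")
        if value is None:
            return None
        return StatementLabel(label_type="sentiment", value=value)
```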
class BaseTaxonomyPlugin(BasePlugin):
    """
    Stage 5 plugin: Classify statements against a taxonomy.

    Taxonomy classification is separate from labeling because:
    - It operates on large taxonomies (100s-1000s of labels)
    - It requires specialized models (MNLI, embeddings)
    - It's computationally heavier than simple labeling

    Taxonomy plugins produce TaxonomyResult objects that are stored
    in the pipeline context.
    """

    @property
    @abstractmethod
    def taxonomy_name(self) -> str:
        """
        Name of the taxonomy this plugin classifies against.

        Examples: 'esg_topics', 'industry_codes', 'relation_types'
        """
        ...

    @property
    def taxonomy_schema(self) -> TaxonomySchema | None:
        """
        The taxonomy schema this plugin uses.

        Returns:
            TaxonomySchema describing the taxonomy structure
        """
        return None

    @property
    def supported_categories(self) -> list[str]:
        """
        List of taxonomy categories this plugin supports.

        Returns empty list if all categories are supported.
        """
        return []

    @abstractmethod
    def classify(
        self,
        statement: "PipelineStatement",
        subject_canonical: "CanonicalEntity",
        object_canonical: "CanonicalEntity",
        context: "PipelineContext",
    ) -> list["TaxonomyResult"]:
        """
        Classify a statement against the taxonomy.

        Returns all labels above the confidence threshold. A single statement
        may have multiple applicable taxonomy labels.

        Args:
            statement: The statement to classify
            subject_canonical: Canonicalized subject entity
            object_canonical: Canonicalized object entity
            context: Pipeline context

        Returns:
            List of TaxonomyResult objects (empty if none above threshold)
        """
        ...

    def classify_batch(
        self,
        items: list[tuple["PipelineStatement", "CanonicalEntity", "CanonicalEntity"]],
        context: "PipelineContext",
    ) -> list[list["TaxonomyResult"]]:
        """
        Classify multiple statements against the taxonomy in a single batch.

        Default implementation calls classify() for each statement sequentially.
        Plugins with BATCH_PROCESSING capability should override this
        for efficient GPU batching using get_optimal_batch_size().

        Args:
            items: List of (statement, subject_canonical, object_canonical) tuples
            context: Pipeline context

        Returns:
            List of TaxonomyResult lists, one per input statement
        """
        return [
            self.classify(stmt, subj, obj, context)
            for stmt, subj, obj in items
        ]


# =============================================================================
# Content Acquisition Plugins (for URL processing)
# =============================================================================


class ContentType(str, Enum):
    """Content type detected from URL or HTTP response."""
    HTML = "html"
    PDF = "pdf"
    BINARY = "binary"
    UNKNOWN = "unknown"


class ScraperResult(BaseModel):
    """Result from a scraper plugin."""
    url: str = Field(description="Original URL requested")
    final_url: str = Field(description="Final URL after redirects")
    content: bytes = Field(description="Raw content bytes")
    content_type: ContentType = Field(description="Detected content type")
    headers: dict[str, str] = Field(default_factory=dict, description="Response headers")
    error: Optional[str] = Field(default=None, description="Error message if fetch failed")

    model_config = {"arbitrary_types_allowed": True}

    @property
    def ok(self) -> bool:
        """Check if the fetch was successful."""
        return self.error is None and len(self.content) > 0


class PDFParseResult(BaseModel):
    """Result from a PDF parser plugin."""
    pages: list[str] = Field(description="Extracted text for each page")
    page_count: int = Field(description="Total number of pages in PDF")
    metadata: dict[str, Any] = Field(default_factory=dict, description="PDF metadata (title, author, etc)")
    error: Optional[str] = Field(default=None, description="Error message if parsing failed")

    @property
    def ok(self) -> bool:
        """Check if parsing was successful."""
        return self.error is None

    @property
    def full_text(self) -> str:
        """Get concatenated text from all pages."""
        return "\n\n".join(self.pages)


class BaseScraperPlugin(BasePlugin):
    """
    Plugin for fetching content from URLs.

    Scrapers handle HTTP requests, redirects, retries, and content type detection.
    They return raw bytes that can be processed by appropriate parsers (HTML, PDF, etc).

    Example implementation:
        @PluginRegistry.scraper
        class MyScraperPlugin(BaseScraperPlugin):
            @property
            def name(self) -> str:
                return "my_scraper"

            async def fetch(self, url: str, timeout: float = 30.0) -> ScraperResult:
                # Implement fetching logic
                ...
    """

    @property
    def capabilities(self) -> PluginCapability:
        """Scrapers support async processing by default."""
        return PluginCapability.ASYNC_PROCESSING | PluginCapability.EXTERNAL_API

    @abstractmethod
    async def fetch(self, url: str, timeout: float = 30.0) -> ScraperResult:
        """
        Fetch content from a URL.

        Args:
            url: The URL to fetch
            timeout: Request timeout in seconds

        Returns:
            ScraperResult with content, content type, and any errors
        """
        ...

    async def head(self, url: str, timeout: float = 10.0) -> ScraperResult:
        """
        Check content type without downloading the full body.

        Default implementation does a full fetch. Override for efficiency.

        Args:
            url: The URL to check
            timeout: Request timeout in seconds

        Returns:
            ScraperResult with content_type populated (content may be empty)
        """
        return await self.fetch(url, timeout)

    def is_supported_url(self, url: str) -> bool:
        """
        Check if this scraper can handle the URL.

        Override to restrict to specific URL patterns or domains.

        Args:
            url: The URL to check

        Returns:
            True if this scraper can handle the URL
        """
        return True
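A scraper sketch against this interface, using httpx as an assumed transport (the wheel's actual HTTP scraper lives in plugins/scrapers/http.py and may differ):

```python
import httpx  # assumed dependency, for illustration

class DemoHttpScraperPlugin(BaseScraperPlugin):
    @property
    def name(self) -> str:
        return "demo_http"

    async def fetch(self, url: str, timeout: float = 30.0) -> ScraperResult:
        try:
            async with httpx.AsyncClient(follow_redirects=True, timeout=timeout) as client:
                resp = await client.get(url)
        except httpx.HTTPError as exc:
            return ScraperResult(url=url, final_url=url, content=b"",
                                 content_type=ContentType.UNKNOWN, error=str(exc))
        # Crude content-type detection from the response headers.
        mime = resp.headers.get("content-type", "")
        if "pdf" in mime:
            detected = ContentType.PDF
        elif "html" in mime:
            detected = ContentType.HTML
        else:
            detected = ContentType.BINARY
        return ScraperResult(url=url, final_url=str(resp.url), content=resp.content,
                             content_type=detected, headers=dict(resp.headers))
```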
class BasePDFParserPlugin(BasePlugin):
    """
    Plugin for extracting text from PDF files.

    PDF parsers take raw PDF bytes and extract text content page by page.
    They may support OCR for image-heavy PDFs.

    Example implementation:
        @PluginRegistry.pdf_parser
        class MyPDFParserPlugin(BasePDFParserPlugin):
            @property
            def name(self) -> str:
                return "my_pdf_parser"

            def parse(self, pdf_bytes: bytes, ...) -> PDFParseResult:
                # Implement parsing logic
                ...
    """

    @abstractmethod
    def parse(
        self,
        pdf_bytes: bytes,
        max_pages: int = 500,
        use_ocr: bool = False,
    ) -> PDFParseResult:
        """
        Extract text from PDF bytes.

        Args:
            pdf_bytes: Raw PDF file content
            max_pages: Maximum number of pages to process
            use_ocr: Force OCR even for text-extractable PDFs

        Returns:
            PDFParseResult with extracted text for each page
        """
        ...

    @property
    def supports_ocr(self) -> bool:
        """
        Whether this parser supports OCR for image-heavy PDFs.

        Returns:
            True if OCR is available
        """
        return False
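And a parser sketch against the PDF interface, using pypdf. The wheel does ship a pypdf-based parser (plugins/pdf/pypdf.py), but this simplified version is not it:

```python
import io
from pypdf import PdfReader  # assumed available; the wheel ships a pypdf-based parser

class DemoPdfParserPlugin(BasePDFParserPlugin):
    @property
    def name(self) -> str:
        return "demo_pypdf"

    def parse(self, pdf_bytes: bytes, max_pages: int = 500,
              use_ocr: bool = False) -> PDFParseResult:
        try:
            reader = PdfReader(io.BytesIO(pdf_bytes))
        except Exception as exc:
            return PDFParseResult(pages=[], page_count=0, error=str(exc))
        pages: list[str] = []
        for index, page in enumerate(reader.pages):
            if index >= max_pages:
                break
            pages.append(page.extract_text() or "")  # empty string for image-only pages
        return PDFParseResult(
            pages=pages,
            page_count=len(reader.pages),
            metadata={k: str(v) for k, v in (reader.metadata or {}).items()},
        )
```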