corp-extractor 0.5.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +191 -24
- corp_extractor-0.9.0.dist-info/RECORD +76 -0
- statement_extractor/__init__.py +1 -1
- statement_extractor/cli.py +1227 -10
- statement_extractor/data/statement_taxonomy.json +6949 -1159
- statement_extractor/database/__init__.py +52 -0
- statement_extractor/database/embeddings.py +186 -0
- statement_extractor/database/hub.py +520 -0
- statement_extractor/database/importers/__init__.py +24 -0
- statement_extractor/database/importers/companies_house.py +545 -0
- statement_extractor/database/importers/gleif.py +538 -0
- statement_extractor/database/importers/sec_edgar.py +375 -0
- statement_extractor/database/importers/wikidata.py +1012 -0
- statement_extractor/database/importers/wikidata_people.py +632 -0
- statement_extractor/database/models.py +230 -0
- statement_extractor/database/resolver.py +245 -0
- statement_extractor/database/store.py +1609 -0
- statement_extractor/document/__init__.py +62 -0
- statement_extractor/document/chunker.py +410 -0
- statement_extractor/document/context.py +171 -0
- statement_extractor/document/deduplicator.py +173 -0
- statement_extractor/document/html_extractor.py +246 -0
- statement_extractor/document/loader.py +303 -0
- statement_extractor/document/pipeline.py +388 -0
- statement_extractor/document/summarizer.py +195 -0
- statement_extractor/models/__init__.py +16 -1
- statement_extractor/models/canonical.py +44 -1
- statement_extractor/models/document.py +308 -0
- statement_extractor/models/labels.py +47 -18
- statement_extractor/models/qualifiers.py +51 -3
- statement_extractor/models/statement.py +26 -0
- statement_extractor/pipeline/config.py +6 -11
- statement_extractor/pipeline/orchestrator.py +80 -111
- statement_extractor/pipeline/registry.py +52 -46
- statement_extractor/plugins/__init__.py +20 -8
- statement_extractor/plugins/base.py +334 -64
- statement_extractor/plugins/extractors/gliner2.py +10 -0
- statement_extractor/plugins/labelers/taxonomy.py +18 -5
- statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
- statement_extractor/plugins/pdf/__init__.py +10 -0
- statement_extractor/plugins/pdf/pypdf.py +291 -0
- statement_extractor/plugins/qualifiers/__init__.py +11 -0
- statement_extractor/plugins/qualifiers/companies_house.py +14 -3
- statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
- statement_extractor/plugins/qualifiers/gleif.py +14 -3
- statement_extractor/plugins/qualifiers/person.py +578 -14
- statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
- statement_extractor/plugins/scrapers/__init__.py +10 -0
- statement_extractor/plugins/scrapers/http.py +236 -0
- statement_extractor/plugins/splitters/t5_gemma.py +158 -53
- statement_extractor/plugins/taxonomy/embedding.py +193 -46
- statement_extractor/plugins/taxonomy/mnli.py +16 -4
- statement_extractor/scoring.py +8 -8
- corp_extractor-0.5.0.dist-info/RECORD +0 -55
- statement_extractor/plugins/canonicalizers/__init__.py +0 -17
- statement_extractor/plugins/canonicalizers/base.py +0 -9
- statement_extractor/plugins/canonicalizers/location.py +0 -219
- statement_extractor/plugins/canonicalizers/organization.py +0 -230
- statement_extractor/plugins/canonicalizers/person.py +0 -242
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
|
@@ -3,15 +3,46 @@ Qualifier models for the extraction pipeline.
|
|
|
3
3
|
|
|
4
4
|
EntityQualifiers: Semantic qualifiers and external identifiers
|
|
5
5
|
QualifiedEntity: Entity with qualification information from Stage 3
|
|
6
|
+
ResolvedRole: Canonical role information from database
|
|
7
|
+
ResolvedOrganization: Canonical organization information from database
|
|
6
8
|
"""
|
|
7
9
|
|
|
8
|
-
from typing import Optional
|
|
10
|
+
from typing import Any, Optional
|
|
9
11
|
|
|
10
12
|
from pydantic import BaseModel, Field
|
|
11
13
|
|
|
12
14
|
from .entity import EntityType
|
|
13
15
|
|
|
14
16
|
|
|
17
|
+
class ResolvedRole(BaseModel):
|
|
18
|
+
"""
|
|
19
|
+
Resolved/canonical role information for a person.
|
|
20
|
+
|
|
21
|
+
Populated when matching a person against the database,
|
|
22
|
+
capturing the canonical role from Wikidata or other sources.
|
|
23
|
+
"""
|
|
24
|
+
canonical_name: str = Field(..., description="Canonical role name (e.g., 'Chief Executive Officer')")
|
|
25
|
+
canonical_id: Optional[str] = Field(None, description="Full canonical ID (e.g., 'wikidata:Q484876')")
|
|
26
|
+
source: str = Field(..., description="Source of resolution (e.g., 'wikidata')")
|
|
27
|
+
source_id: Optional[str] = Field(None, description="ID in the source (e.g., 'Q484876' for Wikidata)")
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class ResolvedOrganization(BaseModel):
|
|
31
|
+
"""
|
|
32
|
+
Resolved/canonical organization information.
|
|
33
|
+
|
|
34
|
+
Populated when resolving an organization mentioned in context
|
|
35
|
+
against the organization database (GLEIF, SEC, Companies House, Wikidata).
|
|
36
|
+
"""
|
|
37
|
+
canonical_name: str = Field(..., description="Canonical organization name")
|
|
38
|
+
canonical_id: str = Field(..., description="Full canonical ID (e.g., 'LEI:549300XYZ', 'SEC-CIK:1234567')")
|
|
39
|
+
source: str = Field(..., description="Source of resolution (e.g., 'gleif', 'sec_edgar', 'wikidata')")
|
|
40
|
+
source_id: str = Field(..., description="ID in the source")
|
|
41
|
+
region: Optional[str] = Field(None, description="Organization's region/jurisdiction")
|
|
42
|
+
match_confidence: float = Field(default=1.0, description="Confidence in the match (0-1)")
|
|
43
|
+
match_details: Optional[dict[str, Any]] = Field(None, description="Additional match details")
|
|
44
|
+
|
|
45
|
+
|
|
15
46
|
class EntityQualifiers(BaseModel):
|
|
16
47
|
"""
|
|
17
48
|
Qualifiers that provide context and identifiers for an entity.
|
|
@@ -22,6 +53,9 @@ class EntityQualifiers(BaseModel):
|
|
|
22
53
|
- CompaniesHouseQualifierPlugin: Adds UK company number
|
|
23
54
|
- SECEdgarQualifierPlugin: Adds SEC CIK, ticker
|
|
24
55
|
"""
|
|
56
|
+
# Canonical name from database (for ORG entities)
|
|
57
|
+
legal_name: Optional[str] = Field(None, description="Canonical legal name from database")
|
|
58
|
+
|
|
25
59
|
# Semantic qualifiers (for PERSON entities)
|
|
26
60
|
org: Optional[str] = Field(None, description="Organization/employer name")
|
|
27
61
|
role: Optional[str] = Field(None, description="Job title/position/role")
|
|
@@ -38,11 +72,22 @@ class EntityQualifiers(BaseModel):
|
|
|
38
72
|
description="External identifiers: lei, ch_number, sec_cik, ticker, wikidata_qid, etc."
|
|
39
73
|
)
|
|
40
74
|
|
|
75
|
+
# Resolved canonical information (for PERSON entities)
|
|
76
|
+
resolved_role: Optional[ResolvedRole] = Field(
|
|
77
|
+
None,
|
|
78
|
+
description="Canonical role information from database lookup"
|
|
79
|
+
)
|
|
80
|
+
resolved_org: Optional[ResolvedOrganization] = Field(
|
|
81
|
+
None,
|
|
82
|
+
description="Canonical organization information from database lookup"
|
|
83
|
+
)
|
|
84
|
+
|
|
41
85
|
def has_any_qualifier(self) -> bool:
|
|
42
86
|
"""Check if any qualifier or identifier is set."""
|
|
43
87
|
return bool(
|
|
44
|
-
self.org or self.role or self.region or self.country or
|
|
45
|
-
self.city or self.jurisdiction or self.identifiers
|
|
88
|
+
self.legal_name or self.org or self.role or self.region or self.country or
|
|
89
|
+
self.city or self.jurisdiction or self.identifiers or
|
|
90
|
+
self.resolved_role or self.resolved_org
|
|
46
91
|
)
|
|
47
92
|
|
|
48
93
|
def merge_with(self, other: "EntityQualifiers") -> "EntityQualifiers":
|
|
@@ -53,6 +98,7 @@ class EntityQualifiers(BaseModel):
|
|
|
53
98
|
"""
|
|
54
99
|
merged_identifiers = {**self.identifiers, **other.identifiers}
|
|
55
100
|
return EntityQualifiers(
|
|
101
|
+
legal_name=other.legal_name or self.legal_name,
|
|
56
102
|
org=other.org or self.org,
|
|
57
103
|
role=other.role or self.role,
|
|
58
104
|
region=other.region or self.region,
|
|
@@ -60,6 +106,8 @@ class EntityQualifiers(BaseModel):
|
|
|
60
106
|
city=other.city or self.city,
|
|
61
107
|
jurisdiction=other.jurisdiction or self.jurisdiction,
|
|
62
108
|
identifiers=merged_identifiers,
|
|
109
|
+
resolved_role=other.resolved_role or self.resolved_role,
|
|
110
|
+
resolved_org=other.resolved_org or self.resolved_org,
|
|
63
111
|
)
|
|
64
112
|
|
|
65
113
|
|
|
@@ -29,6 +29,19 @@ class RawTriple(BaseModel):
|
|
|
29
29
|
le=1.0,
|
|
30
30
|
description="Extraction confidence from the splitter"
|
|
31
31
|
)
|
|
32
|
+
# Document tracking fields
|
|
33
|
+
document_id: Optional[str] = Field(
|
|
34
|
+
None,
|
|
35
|
+
description="ID of the source document (for document pipeline)"
|
|
36
|
+
)
|
|
37
|
+
page_number: Optional[int] = Field(
|
|
38
|
+
None,
|
|
39
|
+
description="Page number where this triple was extracted (1-indexed)"
|
|
40
|
+
)
|
|
41
|
+
chunk_index: Optional[int] = Field(
|
|
42
|
+
None,
|
|
43
|
+
description="Index of the chunk this triple was extracted from (0-indexed)"
|
|
44
|
+
)
|
|
32
45
|
|
|
33
46
|
def __str__(self) -> str:
|
|
34
47
|
return f"{self.subject_text} --[{self.predicate_text}]--> {self.object_text}"
|
|
@@ -63,6 +76,19 @@ class PipelineStatement(BaseModel):
|
|
|
63
76
|
None,
|
|
64
77
|
description="Method used to extract this statement (e.g., 'hybrid', 'gliner', 'model')"
|
|
65
78
|
)
|
|
79
|
+
# Document tracking fields
|
|
80
|
+
document_id: Optional[str] = Field(
|
|
81
|
+
None,
|
|
82
|
+
description="ID of the source document (for document pipeline)"
|
|
83
|
+
)
|
|
84
|
+
page_number: Optional[int] = Field(
|
|
85
|
+
None,
|
|
86
|
+
description="Page number where this statement was extracted (1-indexed)"
|
|
87
|
+
)
|
|
88
|
+
chunk_index: Optional[int] = Field(
|
|
89
|
+
None,
|
|
90
|
+
description="Index of the chunk this statement was extracted from (0-indexed)"
|
|
91
|
+
)
|
|
66
92
|
|
|
67
93
|
def __str__(self) -> str:
|
|
68
94
|
return f"{self.subject.text} --[{self.predicate}]--> {self.object.text}"
|
|
@@ -16,10 +16,10 @@ class PipelineConfig(BaseModel):
|
|
|
16
16
|
Controls which stages are enabled, which plugins to use,
|
|
17
17
|
and stage-specific options.
|
|
18
18
|
"""
|
|
19
|
-
# Stage selection (1=Splitting, 2=Extraction, 3=Qualification, 4=
|
|
19
|
+
# Stage selection (1=Splitting, 2=Extraction, 3=Qualification, 4=Labeling, 5=Taxonomy)
|
|
20
20
|
enabled_stages: set[int] = Field(
|
|
21
|
-
default={1, 2, 3, 4, 5
|
|
22
|
-
description="Set of enabled stage numbers (1-
|
|
21
|
+
default={1, 2, 3, 4, 5},
|
|
22
|
+
description="Set of enabled stage numbers (1-5)"
|
|
23
23
|
)
|
|
24
24
|
|
|
25
25
|
# Plugin selection
|
|
@@ -45,11 +45,7 @@ class PipelineConfig(BaseModel):
|
|
|
45
45
|
)
|
|
46
46
|
qualifier_options: dict[str, Any] = Field(
|
|
47
47
|
default_factory=dict,
|
|
48
|
-
description="Options passed to qualifier plugins"
|
|
49
|
-
)
|
|
50
|
-
canonicalizer_options: dict[str, Any] = Field(
|
|
51
|
-
default_factory=dict,
|
|
52
|
-
description="Options passed to canonicalizer plugins"
|
|
48
|
+
description="Options passed to qualifier plugins (includes canonicalizers)"
|
|
53
49
|
)
|
|
54
50
|
labeler_options: dict[str, Any] = Field(
|
|
55
51
|
default_factory=dict,
|
|
@@ -123,9 +119,8 @@ STAGE_NAMES = {
|
|
|
123
119
|
1: "splitting",
|
|
124
120
|
2: "extraction",
|
|
125
121
|
3: "qualification",
|
|
126
|
-
4: "
|
|
127
|
-
5: "
|
|
128
|
-
6: "taxonomy",
|
|
122
|
+
4: "labeling",
|
|
123
|
+
5: "taxonomy",
|
|
129
124
|
}
|
|
130
125
|
|
|
131
126
|
|
|
@@ -4,9 +4,9 @@ ExtractionPipeline - Main orchestrator for the 5-stage extraction pipeline.
|
|
|
4
4
|
Coordinates the flow of data through all pipeline stages:
|
|
5
5
|
1. Splitting: Text → RawTriple
|
|
6
6
|
2. Extraction: RawTriple → PipelineStatement
|
|
7
|
-
3. Qualification: Entity →
|
|
8
|
-
4.
|
|
9
|
-
5.
|
|
7
|
+
3. Qualification: Entity → CanonicalEntity
|
|
8
|
+
4. Labeling: Statement → LabeledStatement
|
|
9
|
+
5. Taxonomy: Statement → TaxonomyResult
|
|
10
10
|
"""
|
|
11
11
|
|
|
12
12
|
import logging
|
|
@@ -18,7 +18,6 @@ from .config import PipelineConfig, get_stage_name
|
|
|
18
18
|
from .registry import PluginRegistry
|
|
19
19
|
from ..models import (
|
|
20
20
|
QualifiedEntity,
|
|
21
|
-
EntityQualifiers,
|
|
22
21
|
CanonicalEntity,
|
|
23
22
|
LabeledStatement,
|
|
24
23
|
TaxonomyResult,
|
|
@@ -31,8 +30,12 @@ class ExtractionPipeline:
|
|
|
31
30
|
"""
|
|
32
31
|
Main pipeline orchestrator.
|
|
33
32
|
|
|
34
|
-
Coordinates the flow of data through all 5 stages
|
|
35
|
-
|
|
33
|
+
Coordinates the flow of data through all 5 stages:
|
|
34
|
+
1. Splitting: Text → RawTriple (using splitter plugins)
|
|
35
|
+
2. Extraction: RawTriple → PipelineStatement (using extractor plugins)
|
|
36
|
+
3. Qualification: Entity → CanonicalEntity (using qualifier + canonicalizer plugins)
|
|
37
|
+
4. Labeling: Statement → LabeledStatement (using labeler plugins)
|
|
38
|
+
5. Taxonomy: Statement → TaxonomyResult (using taxonomy plugins)
|
|
36
39
|
"""
|
|
37
40
|
|
|
38
41
|
def __init__(self, config: Optional[PipelineConfig] = None):
|
|
@@ -86,20 +89,16 @@ class ExtractionPipeline:
|
|
|
86
89
|
if self.config.is_stage_enabled(2):
|
|
87
90
|
ctx = self._run_extraction(ctx)
|
|
88
91
|
|
|
89
|
-
# Stage 3: Qualification
|
|
92
|
+
# Stage 3: Qualification (runs qualifiers + canonicalizers)
|
|
90
93
|
if self.config.is_stage_enabled(3):
|
|
91
94
|
ctx = self._run_qualification(ctx)
|
|
92
95
|
|
|
93
|
-
# Stage 4:
|
|
96
|
+
# Stage 4: Labeling
|
|
94
97
|
if self.config.is_stage_enabled(4):
|
|
95
|
-
ctx = self._run_canonicalization(ctx)
|
|
96
|
-
|
|
97
|
-
# Stage 5: Labeling
|
|
98
|
-
if self.config.is_stage_enabled(5):
|
|
99
98
|
ctx = self._run_labeling(ctx)
|
|
100
99
|
|
|
101
|
-
# Stage
|
|
102
|
-
if self.config.is_stage_enabled(
|
|
100
|
+
# Stage 5: Taxonomy classification
|
|
101
|
+
if self.config.is_stage_enabled(5):
|
|
103
102
|
ctx = self._run_taxonomy(ctx)
|
|
104
103
|
|
|
105
104
|
except Exception as e:
|
|
@@ -211,7 +210,12 @@ class ExtractionPipeline:
|
|
|
211
210
|
return schemas
|
|
212
211
|
|
|
213
212
|
def _run_qualification(self, ctx: PipelineContext) -> PipelineContext:
|
|
214
|
-
"""
|
|
213
|
+
"""
|
|
214
|
+
Stage 3: Qualify entities with identifiers, canonical names, and FQNs.
|
|
215
|
+
|
|
216
|
+
Runs qualifier plugins for each entity type. Qualifier plugins now return
|
|
217
|
+
CanonicalEntity directly (with qualifiers, canonical match, and FQN).
|
|
218
|
+
"""
|
|
215
219
|
stage_name = get_stage_name(3)
|
|
216
220
|
logger.debug(f"Running {stage_name} stage")
|
|
217
221
|
start_time = time.time()
|
|
@@ -227,14 +231,15 @@ class ExtractionPipeline:
|
|
|
227
231
|
if entity.entity_ref not in entities_to_qualify:
|
|
228
232
|
entities_to_qualify[entity.entity_ref] = entity
|
|
229
233
|
|
|
230
|
-
logger.
|
|
234
|
+
logger.info(f"Stage 3: Qualifying {len(entities_to_qualify)} unique entities")
|
|
231
235
|
|
|
232
|
-
#
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
+
# Process each entity through qualifier plugins
|
|
237
|
+
entities_list = list(entities_to_qualify.items())
|
|
238
|
+
for idx, (entity_ref, entity) in enumerate(entities_list, 1):
|
|
239
|
+
logger.info(f" [{idx}/{len(entities_list)}] Qualifying '{entity.text}' ({entity.type.value})")
|
|
236
240
|
|
|
237
|
-
#
|
|
241
|
+
# Run qualifier plugins - first one to return a result wins
|
|
242
|
+
canonical = None
|
|
238
243
|
type_qualifiers = PluginRegistry.get_qualifiers_for_type(entity.type)
|
|
239
244
|
|
|
240
245
|
for qualifier_plugin in type_qualifiers:
|
|
@@ -242,86 +247,36 @@ class ExtractionPipeline:
|
|
|
242
247
|
continue
|
|
243
248
|
|
|
244
249
|
try:
|
|
245
|
-
|
|
246
|
-
if
|
|
247
|
-
|
|
248
|
-
|
|
250
|
+
result = qualifier_plugin.qualify(entity, ctx)
|
|
251
|
+
if result is not None:
|
|
252
|
+
canonical = result
|
|
253
|
+
logger.info(f" Qualified by {qualifier_plugin.name}: {canonical.fqn}")
|
|
254
|
+
break # Use first successful match
|
|
249
255
|
except Exception as e:
|
|
250
256
|
logger.error(f"Qualifier {qualifier_plugin.name} failed for {entity.text}: {e}")
|
|
251
257
|
ctx.add_error(f"Qualifier {qualifier_plugin.name} failed: {str(e)}")
|
|
252
258
|
if self.config.fail_fast:
|
|
253
259
|
raise
|
|
254
260
|
|
|
255
|
-
# Create
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
logger.info(f"Qualified {len(ctx.qualified_entities)} entities")
|
|
266
|
-
ctx.record_timing(stage_name, time.time() - start_time)
|
|
267
|
-
return ctx
|
|
268
|
-
|
|
269
|
-
def _run_canonicalization(self, ctx: PipelineContext) -> PipelineContext:
|
|
270
|
-
"""Stage 4: Resolve entities to canonical forms."""
|
|
271
|
-
stage_name = get_stage_name(4)
|
|
272
|
-
logger.debug(f"Running {stage_name} stage")
|
|
273
|
-
start_time = time.time()
|
|
274
|
-
|
|
275
|
-
if not ctx.qualified_entities:
|
|
276
|
-
# Create basic qualified entities if stage 3 was skipped
|
|
277
|
-
for stmt in ctx.statements:
|
|
278
|
-
for entity in [stmt.subject, stmt.object]:
|
|
279
|
-
if entity.entity_ref not in ctx.qualified_entities:
|
|
280
|
-
ctx.qualified_entities[entity.entity_ref] = QualifiedEntity(
|
|
281
|
-
entity_ref=entity.entity_ref,
|
|
282
|
-
original_text=entity.text,
|
|
283
|
-
entity_type=entity.type,
|
|
284
|
-
)
|
|
285
|
-
|
|
286
|
-
# Canonicalize each qualified entity
|
|
287
|
-
for entity_ref, qualified in ctx.qualified_entities.items():
|
|
288
|
-
canonical_match = None
|
|
289
|
-
fqn = None
|
|
290
|
-
|
|
291
|
-
# Get canonicalizers for this entity type
|
|
292
|
-
type_canonicalizers = PluginRegistry.get_canonicalizers_for_type(qualified.entity_type)
|
|
293
|
-
|
|
294
|
-
for canon_plugin in type_canonicalizers:
|
|
295
|
-
if not self.config.is_plugin_enabled(canon_plugin.name):
|
|
296
|
-
continue
|
|
297
|
-
|
|
298
|
-
try:
|
|
299
|
-
match = canon_plugin.find_canonical(qualified, ctx)
|
|
300
|
-
if match:
|
|
301
|
-
canonical_match = match
|
|
302
|
-
fqn = canon_plugin.format_fqn(qualified, match)
|
|
303
|
-
break # Use first successful match
|
|
304
|
-
except Exception as e:
|
|
305
|
-
logger.error(f"Canonicalizer {canon_plugin.name} failed for {qualified.original_text}: {e}")
|
|
306
|
-
ctx.add_error(f"Canonicalizer {canon_plugin.name} failed: {str(e)}")
|
|
307
|
-
if self.config.fail_fast:
|
|
308
|
-
raise
|
|
261
|
+
# Create fallback CanonicalEntity if no plugin matched
|
|
262
|
+
if canonical is None:
|
|
263
|
+
qualified = QualifiedEntity(
|
|
264
|
+
entity_ref=entity_ref,
|
|
265
|
+
original_text=entity.text,
|
|
266
|
+
entity_type=entity.type,
|
|
267
|
+
)
|
|
268
|
+
canonical = CanonicalEntity.from_qualified(qualified=qualified)
|
|
269
|
+
logger.debug(f" No qualification found, using original text")
|
|
309
270
|
|
|
310
|
-
# Create CanonicalEntity
|
|
311
|
-
canonical = CanonicalEntity.from_qualified(
|
|
312
|
-
qualified=qualified,
|
|
313
|
-
canonical_match=canonical_match,
|
|
314
|
-
fqn=fqn,
|
|
315
|
-
)
|
|
316
271
|
ctx.canonical_entities[entity_ref] = canonical
|
|
317
272
|
|
|
318
|
-
logger.info(f"
|
|
273
|
+
logger.info(f"Qualified {len(ctx.canonical_entities)} entities")
|
|
319
274
|
ctx.record_timing(stage_name, time.time() - start_time)
|
|
320
275
|
return ctx
|
|
321
276
|
|
|
322
277
|
def _run_labeling(self, ctx: PipelineContext) -> PipelineContext:
|
|
323
|
-
"""Stage
|
|
324
|
-
stage_name = get_stage_name(
|
|
278
|
+
"""Stage 4: Apply labels to statements."""
|
|
279
|
+
stage_name = get_stage_name(4)
|
|
325
280
|
logger.debug(f"Running {stage_name} stage")
|
|
326
281
|
start_time = time.time()
|
|
327
282
|
|
|
@@ -329,9 +284,9 @@ class ExtractionPipeline:
|
|
|
329
284
|
logger.debug("No statements to label")
|
|
330
285
|
return ctx
|
|
331
286
|
|
|
332
|
-
# Ensure canonical entities exist
|
|
287
|
+
# Ensure canonical entities exist (run qualification if skipped)
|
|
333
288
|
if not ctx.canonical_entities:
|
|
334
|
-
self.
|
|
289
|
+
self._run_qualification(ctx)
|
|
335
290
|
|
|
336
291
|
labelers = PluginRegistry.get_labelers()
|
|
337
292
|
|
|
@@ -393,8 +348,10 @@ class ExtractionPipeline:
|
|
|
393
348
|
return ctx
|
|
394
349
|
|
|
395
350
|
def _run_taxonomy(self, ctx: PipelineContext) -> PipelineContext:
|
|
396
|
-
"""Stage
|
|
397
|
-
|
|
351
|
+
"""Stage 5: Classify statements against taxonomies."""
|
|
352
|
+
from ..plugins.base import PluginCapability
|
|
353
|
+
|
|
354
|
+
stage_name = get_stage_name(5)
|
|
398
355
|
logger.debug(f"Running {stage_name} stage")
|
|
399
356
|
start_time = time.time()
|
|
400
357
|
|
|
@@ -408,27 +365,38 @@ class ExtractionPipeline:
|
|
|
408
365
|
return ctx
|
|
409
366
|
|
|
410
367
|
total_results = 0
|
|
411
|
-
for labeled_stmt in ctx.labeled_statements:
|
|
412
|
-
stmt = labeled_stmt.statement
|
|
413
|
-
subj_canonical = labeled_stmt.subject_canonical
|
|
414
|
-
obj_canonical = labeled_stmt.object_canonical
|
|
415
|
-
|
|
416
|
-
# Apply all taxonomy classifiers
|
|
417
|
-
for classifier in taxonomy_classifiers:
|
|
418
|
-
if not self.config.is_plugin_enabled(classifier.name):
|
|
419
|
-
continue
|
|
420
368
|
|
|
421
|
-
|
|
422
|
-
|
|
369
|
+
# Prepare batch items: list of (statement, subject_canonical, object_canonical)
|
|
370
|
+
batch_items = [
|
|
371
|
+
(labeled_stmt.statement, labeled_stmt.subject_canonical, labeled_stmt.object_canonical)
|
|
372
|
+
for labeled_stmt in ctx.labeled_statements
|
|
373
|
+
]
|
|
374
|
+
|
|
375
|
+
# Apply all taxonomy classifiers
|
|
376
|
+
for classifier in taxonomy_classifiers:
|
|
377
|
+
if not self.config.is_plugin_enabled(classifier.name):
|
|
378
|
+
continue
|
|
379
|
+
|
|
380
|
+
try:
|
|
381
|
+
# Require batch processing capability
|
|
382
|
+
if PluginCapability.BATCH_PROCESSING not in classifier.capabilities:
|
|
383
|
+
raise RuntimeError(
|
|
384
|
+
f"Taxonomy classifier '{classifier.name}' does not support batch processing. "
|
|
385
|
+
"Pipeline requires BATCH_PROCESSING capability for efficient GPU utilization."
|
|
386
|
+
)
|
|
387
|
+
|
|
388
|
+
logger.debug(f"Using batch classification for {classifier.name} ({len(batch_items)} items)")
|
|
389
|
+
batch_results = classifier.classify_batch(batch_items, ctx)
|
|
390
|
+
|
|
391
|
+
# Apply results to each labeled statement
|
|
392
|
+
for labeled_stmt, results in zip(ctx.labeled_statements, batch_results):
|
|
423
393
|
if results:
|
|
424
|
-
|
|
394
|
+
stmt = labeled_stmt.statement
|
|
425
395
|
key = (stmt.source_text, classifier.taxonomy_name)
|
|
426
396
|
if key not in ctx.taxonomy_results:
|
|
427
397
|
ctx.taxonomy_results[key] = []
|
|
428
398
|
ctx.taxonomy_results[key].extend(results)
|
|
429
399
|
total_results += len(results)
|
|
430
|
-
|
|
431
|
-
# Also add to the labeled statement for easy access
|
|
432
400
|
labeled_stmt.taxonomy_results.extend(results)
|
|
433
401
|
|
|
434
402
|
for result in results:
|
|
@@ -436,11 +404,12 @@ class ExtractionPipeline:
|
|
|
436
404
|
f"Taxonomy {classifier.name}: {result.full_label} "
|
|
437
405
|
f"(confidence={result.confidence:.2f})"
|
|
438
406
|
)
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
407
|
+
|
|
408
|
+
except Exception as e:
|
|
409
|
+
logger.error(f"Taxonomy classifier {classifier.name} failed: {e}")
|
|
410
|
+
ctx.add_error(f"Taxonomy classifier {classifier.name} failed: {str(e)}")
|
|
411
|
+
if self.config.fail_fast:
|
|
412
|
+
raise
|
|
444
413
|
|
|
445
414
|
logger.info(f"Taxonomy produced {total_results} labels across {len(ctx.taxonomy_results)} statement-taxonomy pairs")
|
|
446
415
|
ctx.record_timing(stage_name, time.time() - start_time)
|
|
@@ -14,9 +14,10 @@ if TYPE_CHECKING:
|
|
|
14
14
|
BaseSplitterPlugin,
|
|
15
15
|
BaseExtractorPlugin,
|
|
16
16
|
BaseQualifierPlugin,
|
|
17
|
-
BaseCanonicalizerPlugin,
|
|
18
17
|
BaseLabelerPlugin,
|
|
19
18
|
BaseTaxonomyPlugin,
|
|
19
|
+
BaseScraperPlugin,
|
|
20
|
+
BasePDFParserPlugin,
|
|
20
21
|
)
|
|
21
22
|
from ..models import EntityType
|
|
22
23
|
|
|
@@ -37,13 +38,15 @@ class PluginRegistry:
|
|
|
37
38
|
_splitters: list["BaseSplitterPlugin"] = []
|
|
38
39
|
_extractors: list["BaseExtractorPlugin"] = []
|
|
39
40
|
_qualifiers: list["BaseQualifierPlugin"] = []
|
|
40
|
-
_canonicalizers: list["BaseCanonicalizerPlugin"] = []
|
|
41
41
|
_labelers: list["BaseLabelerPlugin"] = []
|
|
42
42
|
_taxonomy_classifiers: list["BaseTaxonomyPlugin"] = []
|
|
43
43
|
|
|
44
|
+
# Content acquisition plugins
|
|
45
|
+
_scrapers: list["BaseScraperPlugin"] = []
|
|
46
|
+
_pdf_parsers: list["BasePDFParserPlugin"] = []
|
|
47
|
+
|
|
44
48
|
# Index by entity type for quick lookup
|
|
45
49
|
_qualifiers_by_type: dict["EntityType", list["BaseQualifierPlugin"]] = {}
|
|
46
|
-
_canonicalizers_by_type: dict["EntityType", list["BaseCanonicalizerPlugin"]] = {}
|
|
47
50
|
|
|
48
51
|
# Index by name for CLI lookup
|
|
49
52
|
_all_plugins: dict[str, "BasePlugin"] = {}
|
|
@@ -54,11 +57,11 @@ class PluginRegistry:
|
|
|
54
57
|
cls._splitters = []
|
|
55
58
|
cls._extractors = []
|
|
56
59
|
cls._qualifiers = []
|
|
57
|
-
cls._canonicalizers = []
|
|
58
60
|
cls._labelers = []
|
|
59
61
|
cls._taxonomy_classifiers = []
|
|
62
|
+
cls._scrapers = []
|
|
63
|
+
cls._pdf_parsers = []
|
|
60
64
|
cls._qualifiers_by_type = {}
|
|
61
|
-
cls._canonicalizers_by_type = {}
|
|
62
65
|
cls._all_plugins = {}
|
|
63
66
|
|
|
64
67
|
# =========================================================================
|
|
@@ -100,25 +103,6 @@ class PluginRegistry:
|
|
|
100
103
|
f"(priority={plugin.priority}, types={[t.value for t in plugin.supported_entity_types]})"
|
|
101
104
|
)
|
|
102
105
|
|
|
103
|
-
@classmethod
|
|
104
|
-
def register_canonicalizer(cls, plugin: "BaseCanonicalizerPlugin") -> None:
|
|
105
|
-
"""Register a canonicalizer plugin."""
|
|
106
|
-
cls._canonicalizers.append(plugin)
|
|
107
|
-
cls._canonicalizers.sort(key=lambda p: p.priority)
|
|
108
|
-
cls._all_plugins[plugin.name] = plugin
|
|
109
|
-
|
|
110
|
-
# Index by entity type
|
|
111
|
-
for entity_type in plugin.supported_entity_types:
|
|
112
|
-
if entity_type not in cls._canonicalizers_by_type:
|
|
113
|
-
cls._canonicalizers_by_type[entity_type] = []
|
|
114
|
-
cls._canonicalizers_by_type[entity_type].append(plugin)
|
|
115
|
-
cls._canonicalizers_by_type[entity_type].sort(key=lambda p: p.priority)
|
|
116
|
-
|
|
117
|
-
logger.debug(
|
|
118
|
-
f"Registered canonicalizer: {plugin.name} "
|
|
119
|
-
f"(priority={plugin.priority}, types={[t.value for t in plugin.supported_entity_types]})"
|
|
120
|
-
)
|
|
121
|
-
|
|
122
106
|
@classmethod
|
|
123
107
|
def register_labeler(cls, plugin: "BaseLabelerPlugin") -> None:
|
|
124
108
|
"""Register a labeler plugin."""
|
|
@@ -135,6 +119,22 @@ class PluginRegistry:
|
|
|
135
119
|
cls._all_plugins[plugin.name] = plugin
|
|
136
120
|
logger.debug(f"Registered taxonomy: {plugin.name} (priority={plugin.priority})")
|
|
137
121
|
|
|
122
|
+
@classmethod
|
|
123
|
+
def register_scraper(cls, plugin: "BaseScraperPlugin") -> None:
|
|
124
|
+
"""Register a scraper plugin."""
|
|
125
|
+
cls._scrapers.append(plugin)
|
|
126
|
+
cls._scrapers.sort(key=lambda p: p.priority)
|
|
127
|
+
cls._all_plugins[plugin.name] = plugin
|
|
128
|
+
logger.debug(f"Registered scraper: {plugin.name} (priority={plugin.priority})")
|
|
129
|
+
|
|
130
|
+
@classmethod
|
|
131
|
+
def register_pdf_parser(cls, plugin: "BasePDFParserPlugin") -> None:
|
|
132
|
+
"""Register a PDF parser plugin."""
|
|
133
|
+
cls._pdf_parsers.append(plugin)
|
|
134
|
+
cls._pdf_parsers.sort(key=lambda p: p.priority)
|
|
135
|
+
cls._all_plugins[plugin.name] = plugin
|
|
136
|
+
logger.debug(f"Registered PDF parser: {plugin.name} (priority={plugin.priority})")
|
|
137
|
+
|
|
138
138
|
# =========================================================================
|
|
139
139
|
# Decorator registration
|
|
140
140
|
# =========================================================================
|
|
@@ -157,12 +157,6 @@ class PluginRegistry:
|
|
|
157
157
|
cls.register_qualifier(plugin_class())
|
|
158
158
|
return plugin_class
|
|
159
159
|
|
|
160
|
-
@classmethod
|
|
161
|
-
def canonicalizer(cls, plugin_class: Type[T]) -> Type[T]:
|
|
162
|
-
"""Decorator to register a canonicalizer plugin class."""
|
|
163
|
-
cls.register_canonicalizer(plugin_class())
|
|
164
|
-
return plugin_class
|
|
165
|
-
|
|
166
160
|
@classmethod
|
|
167
161
|
def labeler(cls, plugin_class: Type[T]) -> Type[T]:
|
|
168
162
|
"""Decorator to register a labeler plugin class."""
|
|
@@ -175,6 +169,18 @@ class PluginRegistry:
|
|
|
175
169
|
cls.register_taxonomy(plugin_class())
|
|
176
170
|
return plugin_class
|
|
177
171
|
|
|
172
|
+
@classmethod
|
|
173
|
+
def scraper(cls, plugin_class: Type[T]) -> Type[T]:
|
|
174
|
+
"""Decorator to register a scraper plugin class."""
|
|
175
|
+
cls.register_scraper(plugin_class())
|
|
176
|
+
return plugin_class
|
|
177
|
+
|
|
178
|
+
@classmethod
|
|
179
|
+
def pdf_parser(cls, plugin_class: Type[T]) -> Type[T]:
|
|
180
|
+
"""Decorator to register a PDF parser plugin class."""
|
|
181
|
+
cls.register_pdf_parser(plugin_class())
|
|
182
|
+
return plugin_class
|
|
183
|
+
|
|
178
184
|
# =========================================================================
|
|
179
185
|
# Retrieval methods
|
|
180
186
|
# =========================================================================
|
|
@@ -199,16 +205,6 @@ class PluginRegistry:
|
|
|
199
205
|
"""Get qualifier plugins that support a specific entity type."""
|
|
200
206
|
return cls._qualifiers_by_type.get(entity_type, []).copy()
|
|
201
207
|
|
|
202
|
-
@classmethod
|
|
203
|
-
def get_canonicalizers(cls) -> list["BaseCanonicalizerPlugin"]:
|
|
204
|
-
"""Get all registered canonicalizer plugins (sorted by priority)."""
|
|
205
|
-
return cls._canonicalizers.copy()
|
|
206
|
-
|
|
207
|
-
@classmethod
|
|
208
|
-
def get_canonicalizers_for_type(cls, entity_type: "EntityType") -> list["BaseCanonicalizerPlugin"]:
|
|
209
|
-
"""Get canonicalizer plugins that support a specific entity type."""
|
|
210
|
-
return cls._canonicalizers_by_type.get(entity_type, []).copy()
|
|
211
|
-
|
|
212
208
|
@classmethod
|
|
213
209
|
def get_labelers(cls) -> list["BaseLabelerPlugin"]:
|
|
214
210
|
"""Get all registered labeler plugins (sorted by priority)."""
|
|
@@ -219,6 +215,16 @@ class PluginRegistry:
|
|
|
219
215
|
"""Get all registered taxonomy classifier plugins (sorted by priority)."""
|
|
220
216
|
return cls._taxonomy_classifiers.copy()
|
|
221
217
|
|
|
218
|
+
@classmethod
|
|
219
|
+
def get_scrapers(cls) -> list["BaseScraperPlugin"]:
|
|
220
|
+
"""Get all registered scraper plugins (sorted by priority)."""
|
|
221
|
+
return cls._scrapers.copy()
|
|
222
|
+
|
|
223
|
+
@classmethod
|
|
224
|
+
def get_pdf_parsers(cls) -> list["BasePDFParserPlugin"]:
|
|
225
|
+
"""Get all registered PDF parser plugins (sorted by priority)."""
|
|
226
|
+
return cls._pdf_parsers.copy()
|
|
227
|
+
|
|
222
228
|
@classmethod
|
|
223
229
|
def get_plugin(cls, name: str) -> "BasePlugin | None":
|
|
224
230
|
"""Get a plugin by name."""
|
|
@@ -239,10 +245,8 @@ class PluginRegistry:
|
|
|
239
245
|
elif stage == 3:
|
|
240
246
|
return cls._qualifiers.copy()
|
|
241
247
|
elif stage == 4:
|
|
242
|
-
return cls._canonicalizers.copy()
|
|
243
|
-
elif stage == 5:
|
|
244
248
|
return cls._labelers.copy()
|
|
245
|
-
elif stage ==
|
|
249
|
+
elif stage == 5:
|
|
246
250
|
return cls._taxonomy_classifiers.copy()
|
|
247
251
|
return []
|
|
248
252
|
|
|
@@ -267,9 +271,11 @@ class PluginRegistry:
|
|
|
267
271
|
(1, "splitting", cls._splitters),
|
|
268
272
|
(2, "extraction", cls._extractors),
|
|
269
273
|
(3, "qualification", cls._qualifiers),
|
|
270
|
-
(4, "
|
|
271
|
-
(5, "
|
|
272
|
-
(
|
|
274
|
+
(4, "labeling", cls._labelers),
|
|
275
|
+
(5, "taxonomy", cls._taxonomy_classifiers),
|
|
276
|
+
# Content acquisition plugins (stage 0)
|
|
277
|
+
(0, "scraper", cls._scrapers),
|
|
278
|
+
(-1, "pdf_parser", cls._pdf_parsers),
|
|
273
279
|
]
|
|
274
280
|
|
|
275
281
|
for stage_num, stage_name, plugins in plugins_by_stage:
|