corp-extractor 0.5.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +191 -24
  2. corp_extractor-0.9.0.dist-info/RECORD +76 -0
  3. statement_extractor/__init__.py +1 -1
  4. statement_extractor/cli.py +1227 -10
  5. statement_extractor/data/statement_taxonomy.json +6949 -1159
  6. statement_extractor/database/__init__.py +52 -0
  7. statement_extractor/database/embeddings.py +186 -0
  8. statement_extractor/database/hub.py +520 -0
  9. statement_extractor/database/importers/__init__.py +24 -0
  10. statement_extractor/database/importers/companies_house.py +545 -0
  11. statement_extractor/database/importers/gleif.py +538 -0
  12. statement_extractor/database/importers/sec_edgar.py +375 -0
  13. statement_extractor/database/importers/wikidata.py +1012 -0
  14. statement_extractor/database/importers/wikidata_people.py +632 -0
  15. statement_extractor/database/models.py +230 -0
  16. statement_extractor/database/resolver.py +245 -0
  17. statement_extractor/database/store.py +1609 -0
  18. statement_extractor/document/__init__.py +62 -0
  19. statement_extractor/document/chunker.py +410 -0
  20. statement_extractor/document/context.py +171 -0
  21. statement_extractor/document/deduplicator.py +173 -0
  22. statement_extractor/document/html_extractor.py +246 -0
  23. statement_extractor/document/loader.py +303 -0
  24. statement_extractor/document/pipeline.py +388 -0
  25. statement_extractor/document/summarizer.py +195 -0
  26. statement_extractor/models/__init__.py +16 -1
  27. statement_extractor/models/canonical.py +44 -1
  28. statement_extractor/models/document.py +308 -0
  29. statement_extractor/models/labels.py +47 -18
  30. statement_extractor/models/qualifiers.py +51 -3
  31. statement_extractor/models/statement.py +26 -0
  32. statement_extractor/pipeline/config.py +6 -11
  33. statement_extractor/pipeline/orchestrator.py +80 -111
  34. statement_extractor/pipeline/registry.py +52 -46
  35. statement_extractor/plugins/__init__.py +20 -8
  36. statement_extractor/plugins/base.py +334 -64
  37. statement_extractor/plugins/extractors/gliner2.py +10 -0
  38. statement_extractor/plugins/labelers/taxonomy.py +18 -5
  39. statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
  40. statement_extractor/plugins/pdf/__init__.py +10 -0
  41. statement_extractor/plugins/pdf/pypdf.py +291 -0
  42. statement_extractor/plugins/qualifiers/__init__.py +11 -0
  43. statement_extractor/plugins/qualifiers/companies_house.py +14 -3
  44. statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
  45. statement_extractor/plugins/qualifiers/gleif.py +14 -3
  46. statement_extractor/plugins/qualifiers/person.py +578 -14
  47. statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
  48. statement_extractor/plugins/scrapers/__init__.py +10 -0
  49. statement_extractor/plugins/scrapers/http.py +236 -0
  50. statement_extractor/plugins/splitters/t5_gemma.py +158 -53
  51. statement_extractor/plugins/taxonomy/embedding.py +193 -46
  52. statement_extractor/plugins/taxonomy/mnli.py +16 -4
  53. statement_extractor/scoring.py +8 -8
  54. corp_extractor-0.5.0.dist-info/RECORD +0 -55
  55. statement_extractor/plugins/canonicalizers/__init__.py +0 -17
  56. statement_extractor/plugins/canonicalizers/base.py +0 -9
  57. statement_extractor/plugins/canonicalizers/location.py +0 -219
  58. statement_extractor/plugins/canonicalizers/organization.py +0 -230
  59. statement_extractor/plugins/canonicalizers/person.py +0 -242
  60. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
  61. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
@@ -3,15 +3,46 @@ Qualifier models for the extraction pipeline.
3
3
 
4
4
  EntityQualifiers: Semantic qualifiers and external identifiers
5
5
  QualifiedEntity: Entity with qualification information from Stage 3
6
+ ResolvedRole: Canonical role information from database
7
+ ResolvedOrganization: Canonical organization information from database
6
8
  """
7
9
 
8
- from typing import Optional
10
+ from typing import Any, Optional
9
11
 
10
12
  from pydantic import BaseModel, Field
11
13
 
12
14
  from .entity import EntityType
13
15
 
14
16
 
17
+ class ResolvedRole(BaseModel):
18
+ """
19
+ Resolved/canonical role information for a person.
20
+
21
+ Populated when matching a person against the database,
22
+ capturing the canonical role from Wikidata or other sources.
23
+ """
24
+ canonical_name: str = Field(..., description="Canonical role name (e.g., 'Chief Executive Officer')")
25
+ canonical_id: Optional[str] = Field(None, description="Full canonical ID (e.g., 'wikidata:Q484876')")
26
+ source: str = Field(..., description="Source of resolution (e.g., 'wikidata')")
27
+ source_id: Optional[str] = Field(None, description="ID in the source (e.g., 'Q484876' for Wikidata)")
28
+
29
+
30
+ class ResolvedOrganization(BaseModel):
31
+ """
32
+ Resolved/canonical organization information.
33
+
34
+ Populated when resolving an organization mentioned in context
35
+ against the organization database (GLEIF, SEC, Companies House, Wikidata).
36
+ """
37
+ canonical_name: str = Field(..., description="Canonical organization name")
38
+ canonical_id: str = Field(..., description="Full canonical ID (e.g., 'LEI:549300XYZ', 'SEC-CIK:1234567')")
39
+ source: str = Field(..., description="Source of resolution (e.g., 'gleif', 'sec_edgar', 'wikidata')")
40
+ source_id: str = Field(..., description="ID in the source")
41
+ region: Optional[str] = Field(None, description="Organization's region/jurisdiction")
42
+ match_confidence: float = Field(default=1.0, description="Confidence in the match (0-1)")
43
+ match_details: Optional[dict[str, Any]] = Field(None, description="Additional match details")
44
+
45
+
15
46
  class EntityQualifiers(BaseModel):
16
47
  """
17
48
  Qualifiers that provide context and identifiers for an entity.
@@ -22,6 +53,9 @@ class EntityQualifiers(BaseModel):
22
53
  - CompaniesHouseQualifierPlugin: Adds UK company number
23
54
  - SECEdgarQualifierPlugin: Adds SEC CIK, ticker
24
55
  """
56
+ # Canonical name from database (for ORG entities)
57
+ legal_name: Optional[str] = Field(None, description="Canonical legal name from database")
58
+
25
59
  # Semantic qualifiers (for PERSON entities)
26
60
  org: Optional[str] = Field(None, description="Organization/employer name")
27
61
  role: Optional[str] = Field(None, description="Job title/position/role")
@@ -38,11 +72,22 @@ class EntityQualifiers(BaseModel):
38
72
  description="External identifiers: lei, ch_number, sec_cik, ticker, wikidata_qid, etc."
39
73
  )
40
74
 
75
+ # Resolved canonical information (for PERSON entities)
76
+ resolved_role: Optional[ResolvedRole] = Field(
77
+ None,
78
+ description="Canonical role information from database lookup"
79
+ )
80
+ resolved_org: Optional[ResolvedOrganization] = Field(
81
+ None,
82
+ description="Canonical organization information from database lookup"
83
+ )
84
+
41
85
  def has_any_qualifier(self) -> bool:
42
86
  """Check if any qualifier or identifier is set."""
43
87
  return bool(
44
- self.org or self.role or self.region or self.country or
45
- self.city or self.jurisdiction or self.identifiers
88
+ self.legal_name or self.org or self.role or self.region or self.country or
89
+ self.city or self.jurisdiction or self.identifiers or
90
+ self.resolved_role or self.resolved_org
46
91
  )
47
92
 
48
93
  def merge_with(self, other: "EntityQualifiers") -> "EntityQualifiers":
@@ -53,6 +98,7 @@ class EntityQualifiers(BaseModel):
53
98
  """
54
99
  merged_identifiers = {**self.identifiers, **other.identifiers}
55
100
  return EntityQualifiers(
101
+ legal_name=other.legal_name or self.legal_name,
56
102
  org=other.org or self.org,
57
103
  role=other.role or self.role,
58
104
  region=other.region or self.region,
@@ -60,6 +106,8 @@ class EntityQualifiers(BaseModel):
60
106
  city=other.city or self.city,
61
107
  jurisdiction=other.jurisdiction or self.jurisdiction,
62
108
  identifiers=merged_identifiers,
109
+ resolved_role=other.resolved_role or self.resolved_role,
110
+ resolved_org=other.resolved_org or self.resolved_org,
63
111
  )
64
112
 
65
113
 
@@ -29,6 +29,19 @@ class RawTriple(BaseModel):
29
29
  le=1.0,
30
30
  description="Extraction confidence from the splitter"
31
31
  )
32
+ # Document tracking fields
33
+ document_id: Optional[str] = Field(
34
+ None,
35
+ description="ID of the source document (for document pipeline)"
36
+ )
37
+ page_number: Optional[int] = Field(
38
+ None,
39
+ description="Page number where this triple was extracted (1-indexed)"
40
+ )
41
+ chunk_index: Optional[int] = Field(
42
+ None,
43
+ description="Index of the chunk this triple was extracted from (0-indexed)"
44
+ )
32
45
 
33
46
  def __str__(self) -> str:
34
47
  return f"{self.subject_text} --[{self.predicate_text}]--> {self.object_text}"
@@ -63,6 +76,19 @@ class PipelineStatement(BaseModel):
63
76
  None,
64
77
  description="Method used to extract this statement (e.g., 'hybrid', 'gliner', 'model')"
65
78
  )
79
+ # Document tracking fields
80
+ document_id: Optional[str] = Field(
81
+ None,
82
+ description="ID of the source document (for document pipeline)"
83
+ )
84
+ page_number: Optional[int] = Field(
85
+ None,
86
+ description="Page number where this statement was extracted (1-indexed)"
87
+ )
88
+ chunk_index: Optional[int] = Field(
89
+ None,
90
+ description="Index of the chunk this statement was extracted from (0-indexed)"
91
+ )
66
92
 
67
93
  def __str__(self) -> str:
68
94
  return f"{self.subject.text} --[{self.predicate}]--> {self.object.text}"
@@ -16,10 +16,10 @@ class PipelineConfig(BaseModel):
16
16
  Controls which stages are enabled, which plugins to use,
17
17
  and stage-specific options.
18
18
  """
19
- # Stage selection (1=Splitting, 2=Extraction, 3=Qualification, 4=Canonicalization, 5=Labeling, 6=Taxonomy)
19
+ # Stage selection (1=Splitting, 2=Extraction, 3=Qualification, 4=Labeling, 5=Taxonomy)
20
20
  enabled_stages: set[int] = Field(
21
- default={1, 2, 3, 4, 5, 6},
22
- description="Set of enabled stage numbers (1-6)"
21
+ default={1, 2, 3, 4, 5},
22
+ description="Set of enabled stage numbers (1-5)"
23
23
  )
24
24
 
25
25
  # Plugin selection
@@ -45,11 +45,7 @@ class PipelineConfig(BaseModel):
45
45
  )
46
46
  qualifier_options: dict[str, Any] = Field(
47
47
  default_factory=dict,
48
- description="Options passed to qualifier plugins"
49
- )
50
- canonicalizer_options: dict[str, Any] = Field(
51
- default_factory=dict,
52
- description="Options passed to canonicalizer plugins"
48
+ description="Options passed to qualifier plugins (includes canonicalizers)"
53
49
  )
54
50
  labeler_options: dict[str, Any] = Field(
55
51
  default_factory=dict,
@@ -123,9 +119,8 @@ STAGE_NAMES = {
123
119
  1: "splitting",
124
120
  2: "extraction",
125
121
  3: "qualification",
126
- 4: "canonicalization",
127
- 5: "labeling",
128
- 6: "taxonomy",
122
+ 4: "labeling",
123
+ 5: "taxonomy",
129
124
  }
130
125
 
131
126
 
@@ -4,9 +4,9 @@ ExtractionPipeline - Main orchestrator for the 5-stage extraction pipeline.
4
4
  Coordinates the flow of data through all pipeline stages:
5
5
  1. Splitting: Text → RawTriple
6
6
  2. Extraction: RawTriple → PipelineStatement
7
- 3. Qualification: Entity → QualifiedEntity
8
- 4. Canonicalization: QualifiedEntity → CanonicalEntity
9
- 5. Labeling: Statement → LabeledStatement
7
+ 3. Qualification: Entity → CanonicalEntity
8
+ 4. Labeling: Statement → LabeledStatement
9
+ 5. Taxonomy: Statement → TaxonomyResult
10
10
  """
11
11
 
12
12
  import logging
@@ -18,7 +18,6 @@ from .config import PipelineConfig, get_stage_name
18
18
  from .registry import PluginRegistry
19
19
  from ..models import (
20
20
  QualifiedEntity,
21
- EntityQualifiers,
22
21
  CanonicalEntity,
23
22
  LabeledStatement,
24
23
  TaxonomyResult,
@@ -31,8 +30,12 @@ class ExtractionPipeline:
31
30
  """
32
31
  Main pipeline orchestrator.
33
32
 
34
- Coordinates the flow of data through all 5 stages, invoking registered
35
- plugins in priority order and accumulating results in PipelineContext.
33
+ Coordinates the flow of data through all 5 stages:
34
+ 1. Splitting: Text → RawTriple (using splitter plugins)
35
+ 2. Extraction: RawTriple → PipelineStatement (using extractor plugins)
36
+ 3. Qualification: Entity → CanonicalEntity (using qualifier + canonicalizer plugins)
37
+ 4. Labeling: Statement → LabeledStatement (using labeler plugins)
38
+ 5. Taxonomy: Statement → TaxonomyResult (using taxonomy plugins)
36
39
  """
37
40
 
38
41
  def __init__(self, config: Optional[PipelineConfig] = None):
@@ -86,20 +89,16 @@ class ExtractionPipeline:
86
89
  if self.config.is_stage_enabled(2):
87
90
  ctx = self._run_extraction(ctx)
88
91
 
89
- # Stage 3: Qualification
92
+ # Stage 3: Qualification (runs qualifiers + canonicalizers)
90
93
  if self.config.is_stage_enabled(3):
91
94
  ctx = self._run_qualification(ctx)
92
95
 
93
- # Stage 4: Canonicalization
96
+ # Stage 4: Labeling
94
97
  if self.config.is_stage_enabled(4):
95
- ctx = self._run_canonicalization(ctx)
96
-
97
- # Stage 5: Labeling
98
- if self.config.is_stage_enabled(5):
99
98
  ctx = self._run_labeling(ctx)
100
99
 
101
- # Stage 6: Taxonomy classification
102
- if self.config.is_stage_enabled(6):
100
+ # Stage 5: Taxonomy classification
101
+ if self.config.is_stage_enabled(5):
103
102
  ctx = self._run_taxonomy(ctx)
104
103
 
105
104
  except Exception as e:
@@ -211,7 +210,12 @@ class ExtractionPipeline:
211
210
  return schemas
212
211
 
213
212
  def _run_qualification(self, ctx: PipelineContext) -> PipelineContext:
214
- """Stage 3: Add qualifiers to entities."""
213
+ """
214
+ Stage 3: Qualify entities with identifiers, canonical names, and FQNs.
215
+
216
+ Runs qualifier plugins for each entity type. Qualifier plugins now return
217
+ CanonicalEntity directly (with qualifiers, canonical match, and FQN).
218
+ """
215
219
  stage_name = get_stage_name(3)
216
220
  logger.debug(f"Running {stage_name} stage")
217
221
  start_time = time.time()
@@ -227,14 +231,15 @@ class ExtractionPipeline:
227
231
  if entity.entity_ref not in entities_to_qualify:
228
232
  entities_to_qualify[entity.entity_ref] = entity
229
233
 
230
- logger.debug(f"Qualifying {len(entities_to_qualify)} unique entities")
234
+ logger.info(f"Stage 3: Qualifying {len(entities_to_qualify)} unique entities")
231
235
 
232
- # Qualify each entity using applicable plugins
233
- for entity_ref, entity in entities_to_qualify.items():
234
- qualifiers = EntityQualifiers()
235
- sources = []
236
+ # Process each entity through qualifier plugins
237
+ entities_list = list(entities_to_qualify.items())
238
+ for idx, (entity_ref, entity) in enumerate(entities_list, 1):
239
+ logger.info(f" [{idx}/{len(entities_list)}] Qualifying '{entity.text}' ({entity.type.value})")
236
240
 
237
- # Get qualifiers for this entity type
241
+ # Run qualifier plugins - first one to return a result wins
242
+ canonical = None
238
243
  type_qualifiers = PluginRegistry.get_qualifiers_for_type(entity.type)
239
244
 
240
245
  for qualifier_plugin in type_qualifiers:
@@ -242,86 +247,36 @@ class ExtractionPipeline:
242
247
  continue
243
248
 
244
249
  try:
245
- plugin_qualifiers = qualifier_plugin.qualify(entity, ctx)
246
- if plugin_qualifiers and plugin_qualifiers.has_any_qualifier():
247
- qualifiers = qualifiers.merge_with(plugin_qualifiers)
248
- sources.append(qualifier_plugin.name)
250
+ result = qualifier_plugin.qualify(entity, ctx)
251
+ if result is not None:
252
+ canonical = result
253
+ logger.info(f" Qualified by {qualifier_plugin.name}: {canonical.fqn}")
254
+ break # Use first successful match
249
255
  except Exception as e:
250
256
  logger.error(f"Qualifier {qualifier_plugin.name} failed for {entity.text}: {e}")
251
257
  ctx.add_error(f"Qualifier {qualifier_plugin.name} failed: {str(e)}")
252
258
  if self.config.fail_fast:
253
259
  raise
254
260
 
255
- # Create QualifiedEntity
256
- qualified = QualifiedEntity(
257
- entity_ref=entity_ref,
258
- original_text=entity.text,
259
- entity_type=entity.type,
260
- qualifiers=qualifiers,
261
- qualification_sources=sources,
262
- )
263
- ctx.qualified_entities[entity_ref] = qualified
264
-
265
- logger.info(f"Qualified {len(ctx.qualified_entities)} entities")
266
- ctx.record_timing(stage_name, time.time() - start_time)
267
- return ctx
268
-
269
- def _run_canonicalization(self, ctx: PipelineContext) -> PipelineContext:
270
- """Stage 4: Resolve entities to canonical forms."""
271
- stage_name = get_stage_name(4)
272
- logger.debug(f"Running {stage_name} stage")
273
- start_time = time.time()
274
-
275
- if not ctx.qualified_entities:
276
- # Create basic qualified entities if stage 3 was skipped
277
- for stmt in ctx.statements:
278
- for entity in [stmt.subject, stmt.object]:
279
- if entity.entity_ref not in ctx.qualified_entities:
280
- ctx.qualified_entities[entity.entity_ref] = QualifiedEntity(
281
- entity_ref=entity.entity_ref,
282
- original_text=entity.text,
283
- entity_type=entity.type,
284
- )
285
-
286
- # Canonicalize each qualified entity
287
- for entity_ref, qualified in ctx.qualified_entities.items():
288
- canonical_match = None
289
- fqn = None
290
-
291
- # Get canonicalizers for this entity type
292
- type_canonicalizers = PluginRegistry.get_canonicalizers_for_type(qualified.entity_type)
293
-
294
- for canon_plugin in type_canonicalizers:
295
- if not self.config.is_plugin_enabled(canon_plugin.name):
296
- continue
297
-
298
- try:
299
- match = canon_plugin.find_canonical(qualified, ctx)
300
- if match:
301
- canonical_match = match
302
- fqn = canon_plugin.format_fqn(qualified, match)
303
- break # Use first successful match
304
- except Exception as e:
305
- logger.error(f"Canonicalizer {canon_plugin.name} failed for {qualified.original_text}: {e}")
306
- ctx.add_error(f"Canonicalizer {canon_plugin.name} failed: {str(e)}")
307
- if self.config.fail_fast:
308
- raise
261
+ # Create fallback CanonicalEntity if no plugin matched
262
+ if canonical is None:
263
+ qualified = QualifiedEntity(
264
+ entity_ref=entity_ref,
265
+ original_text=entity.text,
266
+ entity_type=entity.type,
267
+ )
268
+ canonical = CanonicalEntity.from_qualified(qualified=qualified)
269
+ logger.debug(f" No qualification found, using original text")
309
270
 
310
- # Create CanonicalEntity
311
- canonical = CanonicalEntity.from_qualified(
312
- qualified=qualified,
313
- canonical_match=canonical_match,
314
- fqn=fqn,
315
- )
316
271
  ctx.canonical_entities[entity_ref] = canonical
317
272
 
318
- logger.info(f"Canonicalized {len(ctx.canonical_entities)} entities")
273
+ logger.info(f"Qualified {len(ctx.canonical_entities)} entities")
319
274
  ctx.record_timing(stage_name, time.time() - start_time)
320
275
  return ctx
321
276
 
322
277
  def _run_labeling(self, ctx: PipelineContext) -> PipelineContext:
323
- """Stage 5: Apply labels to statements."""
324
- stage_name = get_stage_name(5)
278
+ """Stage 4: Apply labels to statements."""
279
+ stage_name = get_stage_name(4)
325
280
  logger.debug(f"Running {stage_name} stage")
326
281
  start_time = time.time()
327
282
 
@@ -329,9 +284,9 @@ class ExtractionPipeline:
329
284
  logger.debug("No statements to label")
330
285
  return ctx
331
286
 
332
- # Ensure canonical entities exist
287
+ # Ensure canonical entities exist (run qualification if skipped)
333
288
  if not ctx.canonical_entities:
334
- self._run_canonicalization(ctx)
289
+ self._run_qualification(ctx)
335
290
 
336
291
  labelers = PluginRegistry.get_labelers()
337
292
 
@@ -393,8 +348,10 @@ class ExtractionPipeline:
393
348
  return ctx
394
349
 
395
350
  def _run_taxonomy(self, ctx: PipelineContext) -> PipelineContext:
396
- """Stage 6: Classify statements against taxonomies."""
397
- stage_name = get_stage_name(6)
351
+ """Stage 5: Classify statements against taxonomies."""
352
+ from ..plugins.base import PluginCapability
353
+
354
+ stage_name = get_stage_name(5)
398
355
  logger.debug(f"Running {stage_name} stage")
399
356
  start_time = time.time()
400
357
 
@@ -408,27 +365,38 @@ class ExtractionPipeline:
408
365
  return ctx
409
366
 
410
367
  total_results = 0
411
- for labeled_stmt in ctx.labeled_statements:
412
- stmt = labeled_stmt.statement
413
- subj_canonical = labeled_stmt.subject_canonical
414
- obj_canonical = labeled_stmt.object_canonical
415
-
416
- # Apply all taxonomy classifiers
417
- for classifier in taxonomy_classifiers:
418
- if not self.config.is_plugin_enabled(classifier.name):
419
- continue
420
368
 
421
- try:
422
- results = classifier.classify(stmt, subj_canonical, obj_canonical, ctx)
369
+ # Prepare batch items: list of (statement, subject_canonical, object_canonical)
370
+ batch_items = [
371
+ (labeled_stmt.statement, labeled_stmt.subject_canonical, labeled_stmt.object_canonical)
372
+ for labeled_stmt in ctx.labeled_statements
373
+ ]
374
+
375
+ # Apply all taxonomy classifiers
376
+ for classifier in taxonomy_classifiers:
377
+ if not self.config.is_plugin_enabled(classifier.name):
378
+ continue
379
+
380
+ try:
381
+ # Require batch processing capability
382
+ if PluginCapability.BATCH_PROCESSING not in classifier.capabilities:
383
+ raise RuntimeError(
384
+ f"Taxonomy classifier '{classifier.name}' does not support batch processing. "
385
+ "Pipeline requires BATCH_PROCESSING capability for efficient GPU utilization."
386
+ )
387
+
388
+ logger.debug(f"Using batch classification for {classifier.name} ({len(batch_items)} items)")
389
+ batch_results = classifier.classify_batch(batch_items, ctx)
390
+
391
+ # Apply results to each labeled statement
392
+ for labeled_stmt, results in zip(ctx.labeled_statements, batch_results):
423
393
  if results:
424
- # Store taxonomy results in context (list of results per key)
394
+ stmt = labeled_stmt.statement
425
395
  key = (stmt.source_text, classifier.taxonomy_name)
426
396
  if key not in ctx.taxonomy_results:
427
397
  ctx.taxonomy_results[key] = []
428
398
  ctx.taxonomy_results[key].extend(results)
429
399
  total_results += len(results)
430
-
431
- # Also add to the labeled statement for easy access
432
400
  labeled_stmt.taxonomy_results.extend(results)
433
401
 
434
402
  for result in results:
@@ -436,11 +404,12 @@ class ExtractionPipeline:
436
404
  f"Taxonomy {classifier.name}: {result.full_label} "
437
405
  f"(confidence={result.confidence:.2f})"
438
406
  )
439
- except Exception as e:
440
- logger.error(f"Taxonomy classifier {classifier.name} failed: {e}")
441
- ctx.add_error(f"Taxonomy classifier {classifier.name} failed: {str(e)}")
442
- if self.config.fail_fast:
443
- raise
407
+
408
+ except Exception as e:
409
+ logger.error(f"Taxonomy classifier {classifier.name} failed: {e}")
410
+ ctx.add_error(f"Taxonomy classifier {classifier.name} failed: {str(e)}")
411
+ if self.config.fail_fast:
412
+ raise
444
413
 
445
414
  logger.info(f"Taxonomy produced {total_results} labels across {len(ctx.taxonomy_results)} statement-taxonomy pairs")
446
415
  ctx.record_timing(stage_name, time.time() - start_time)
@@ -14,9 +14,10 @@ if TYPE_CHECKING:
14
14
  BaseSplitterPlugin,
15
15
  BaseExtractorPlugin,
16
16
  BaseQualifierPlugin,
17
- BaseCanonicalizerPlugin,
18
17
  BaseLabelerPlugin,
19
18
  BaseTaxonomyPlugin,
19
+ BaseScraperPlugin,
20
+ BasePDFParserPlugin,
20
21
  )
21
22
  from ..models import EntityType
22
23
 
@@ -37,13 +38,15 @@ class PluginRegistry:
37
38
  _splitters: list["BaseSplitterPlugin"] = []
38
39
  _extractors: list["BaseExtractorPlugin"] = []
39
40
  _qualifiers: list["BaseQualifierPlugin"] = []
40
- _canonicalizers: list["BaseCanonicalizerPlugin"] = []
41
41
  _labelers: list["BaseLabelerPlugin"] = []
42
42
  _taxonomy_classifiers: list["BaseTaxonomyPlugin"] = []
43
43
 
44
+ # Content acquisition plugins
45
+ _scrapers: list["BaseScraperPlugin"] = []
46
+ _pdf_parsers: list["BasePDFParserPlugin"] = []
47
+
44
48
  # Index by entity type for quick lookup
45
49
  _qualifiers_by_type: dict["EntityType", list["BaseQualifierPlugin"]] = {}
46
- _canonicalizers_by_type: dict["EntityType", list["BaseCanonicalizerPlugin"]] = {}
47
50
 
48
51
  # Index by name for CLI lookup
49
52
  _all_plugins: dict[str, "BasePlugin"] = {}
@@ -54,11 +57,11 @@ class PluginRegistry:
54
57
  cls._splitters = []
55
58
  cls._extractors = []
56
59
  cls._qualifiers = []
57
- cls._canonicalizers = []
58
60
  cls._labelers = []
59
61
  cls._taxonomy_classifiers = []
62
+ cls._scrapers = []
63
+ cls._pdf_parsers = []
60
64
  cls._qualifiers_by_type = {}
61
- cls._canonicalizers_by_type = {}
62
65
  cls._all_plugins = {}
63
66
 
64
67
  # =========================================================================
@@ -100,25 +103,6 @@ class PluginRegistry:
100
103
  f"(priority={plugin.priority}, types={[t.value for t in plugin.supported_entity_types]})"
101
104
  )
102
105
 
103
- @classmethod
104
- def register_canonicalizer(cls, plugin: "BaseCanonicalizerPlugin") -> None:
105
- """Register a canonicalizer plugin."""
106
- cls._canonicalizers.append(plugin)
107
- cls._canonicalizers.sort(key=lambda p: p.priority)
108
- cls._all_plugins[plugin.name] = plugin
109
-
110
- # Index by entity type
111
- for entity_type in plugin.supported_entity_types:
112
- if entity_type not in cls._canonicalizers_by_type:
113
- cls._canonicalizers_by_type[entity_type] = []
114
- cls._canonicalizers_by_type[entity_type].append(plugin)
115
- cls._canonicalizers_by_type[entity_type].sort(key=lambda p: p.priority)
116
-
117
- logger.debug(
118
- f"Registered canonicalizer: {plugin.name} "
119
- f"(priority={plugin.priority}, types={[t.value for t in plugin.supported_entity_types]})"
120
- )
121
-
122
106
  @classmethod
123
107
  def register_labeler(cls, plugin: "BaseLabelerPlugin") -> None:
124
108
  """Register a labeler plugin."""
@@ -135,6 +119,22 @@ class PluginRegistry:
135
119
  cls._all_plugins[plugin.name] = plugin
136
120
  logger.debug(f"Registered taxonomy: {plugin.name} (priority={plugin.priority})")
137
121
 
122
+ @classmethod
123
+ def register_scraper(cls, plugin: "BaseScraperPlugin") -> None:
124
+ """Register a scraper plugin."""
125
+ cls._scrapers.append(plugin)
126
+ cls._scrapers.sort(key=lambda p: p.priority)
127
+ cls._all_plugins[plugin.name] = plugin
128
+ logger.debug(f"Registered scraper: {plugin.name} (priority={plugin.priority})")
129
+
130
+ @classmethod
131
+ def register_pdf_parser(cls, plugin: "BasePDFParserPlugin") -> None:
132
+ """Register a PDF parser plugin."""
133
+ cls._pdf_parsers.append(plugin)
134
+ cls._pdf_parsers.sort(key=lambda p: p.priority)
135
+ cls._all_plugins[plugin.name] = plugin
136
+ logger.debug(f"Registered PDF parser: {plugin.name} (priority={plugin.priority})")
137
+
138
138
  # =========================================================================
139
139
  # Decorator registration
140
140
  # =========================================================================
@@ -157,12 +157,6 @@ class PluginRegistry:
157
157
  cls.register_qualifier(plugin_class())
158
158
  return plugin_class
159
159
 
160
- @classmethod
161
- def canonicalizer(cls, plugin_class: Type[T]) -> Type[T]:
162
- """Decorator to register a canonicalizer plugin class."""
163
- cls.register_canonicalizer(plugin_class())
164
- return plugin_class
165
-
166
160
  @classmethod
167
161
  def labeler(cls, plugin_class: Type[T]) -> Type[T]:
168
162
  """Decorator to register a labeler plugin class."""
@@ -175,6 +169,18 @@ class PluginRegistry:
175
169
  cls.register_taxonomy(plugin_class())
176
170
  return plugin_class
177
171
 
172
+ @classmethod
173
+ def scraper(cls, plugin_class: Type[T]) -> Type[T]:
174
+ """Decorator to register a scraper plugin class."""
175
+ cls.register_scraper(plugin_class())
176
+ return plugin_class
177
+
178
+ @classmethod
179
+ def pdf_parser(cls, plugin_class: Type[T]) -> Type[T]:
180
+ """Decorator to register a PDF parser plugin class."""
181
+ cls.register_pdf_parser(plugin_class())
182
+ return plugin_class
183
+
178
184
  # =========================================================================
179
185
  # Retrieval methods
180
186
  # =========================================================================
@@ -199,16 +205,6 @@ class PluginRegistry:
199
205
  """Get qualifier plugins that support a specific entity type."""
200
206
  return cls._qualifiers_by_type.get(entity_type, []).copy()
201
207
 
202
- @classmethod
203
- def get_canonicalizers(cls) -> list["BaseCanonicalizerPlugin"]:
204
- """Get all registered canonicalizer plugins (sorted by priority)."""
205
- return cls._canonicalizers.copy()
206
-
207
- @classmethod
208
- def get_canonicalizers_for_type(cls, entity_type: "EntityType") -> list["BaseCanonicalizerPlugin"]:
209
- """Get canonicalizer plugins that support a specific entity type."""
210
- return cls._canonicalizers_by_type.get(entity_type, []).copy()
211
-
212
208
  @classmethod
213
209
  def get_labelers(cls) -> list["BaseLabelerPlugin"]:
214
210
  """Get all registered labeler plugins (sorted by priority)."""
@@ -219,6 +215,16 @@ class PluginRegistry:
219
215
  """Get all registered taxonomy classifier plugins (sorted by priority)."""
220
216
  return cls._taxonomy_classifiers.copy()
221
217
 
218
+ @classmethod
219
+ def get_scrapers(cls) -> list["BaseScraperPlugin"]:
220
+ """Get all registered scraper plugins (sorted by priority)."""
221
+ return cls._scrapers.copy()
222
+
223
+ @classmethod
224
+ def get_pdf_parsers(cls) -> list["BasePDFParserPlugin"]:
225
+ """Get all registered PDF parser plugins (sorted by priority)."""
226
+ return cls._pdf_parsers.copy()
227
+
222
228
  @classmethod
223
229
  def get_plugin(cls, name: str) -> "BasePlugin | None":
224
230
  """Get a plugin by name."""
@@ -239,10 +245,8 @@ class PluginRegistry:
239
245
  elif stage == 3:
240
246
  return cls._qualifiers.copy()
241
247
  elif stage == 4:
242
- return cls._canonicalizers.copy()
243
- elif stage == 5:
244
248
  return cls._labelers.copy()
245
- elif stage == 6:
249
+ elif stage == 5:
246
250
  return cls._taxonomy_classifiers.copy()
247
251
  return []
248
252
 
@@ -267,9 +271,11 @@ class PluginRegistry:
267
271
  (1, "splitting", cls._splitters),
268
272
  (2, "extraction", cls._extractors),
269
273
  (3, "qualification", cls._qualifiers),
270
- (4, "canonicalization", cls._canonicalizers),
271
- (5, "labeling", cls._labelers),
272
- (6, "taxonomy", cls._taxonomy_classifiers),
274
+ (4, "labeling", cls._labelers),
275
+ (5, "taxonomy", cls._taxonomy_classifiers),
276
+ # Content acquisition plugins (stage 0)
277
+ (0, "scraper", cls._scrapers),
278
+ (-1, "pdf_parser", cls._pdf_parsers),
273
279
  ]
274
280
 
275
281
  for stage_num, stage_name, plugins in plugins_by_stage: