corp-extractor 0.5.0__py3-none-any.whl → 0.9.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/METADATA +228 -30
  2. corp_extractor-0.9.3.dist-info/RECORD +79 -0
  3. statement_extractor/__init__.py +1 -1
  4. statement_extractor/cli.py +2030 -24
  5. statement_extractor/data/statement_taxonomy.json +6949 -1159
  6. statement_extractor/database/__init__.py +52 -0
  7. statement_extractor/database/embeddings.py +186 -0
  8. statement_extractor/database/hub.py +428 -0
  9. statement_extractor/database/importers/__init__.py +32 -0
  10. statement_extractor/database/importers/companies_house.py +559 -0
  11. statement_extractor/database/importers/companies_house_officers.py +431 -0
  12. statement_extractor/database/importers/gleif.py +561 -0
  13. statement_extractor/database/importers/sec_edgar.py +392 -0
  14. statement_extractor/database/importers/sec_form4.py +512 -0
  15. statement_extractor/database/importers/wikidata.py +1120 -0
  16. statement_extractor/database/importers/wikidata_dump.py +1951 -0
  17. statement_extractor/database/importers/wikidata_people.py +1130 -0
  18. statement_extractor/database/models.py +254 -0
  19. statement_extractor/database/resolver.py +245 -0
  20. statement_extractor/database/store.py +3034 -0
  21. statement_extractor/document/__init__.py +62 -0
  22. statement_extractor/document/chunker.py +410 -0
  23. statement_extractor/document/context.py +171 -0
  24. statement_extractor/document/deduplicator.py +171 -0
  25. statement_extractor/document/html_extractor.py +246 -0
  26. statement_extractor/document/loader.py +303 -0
  27. statement_extractor/document/pipeline.py +388 -0
  28. statement_extractor/document/summarizer.py +195 -0
  29. statement_extractor/extractor.py +1 -1
  30. statement_extractor/models/__init__.py +19 -3
  31. statement_extractor/models/canonical.py +44 -1
  32. statement_extractor/models/document.py +308 -0
  33. statement_extractor/models/labels.py +47 -18
  34. statement_extractor/models/qualifiers.py +51 -3
  35. statement_extractor/models/statement.py +39 -15
  36. statement_extractor/models.py +1 -1
  37. statement_extractor/pipeline/config.py +6 -11
  38. statement_extractor/pipeline/context.py +5 -5
  39. statement_extractor/pipeline/orchestrator.py +90 -121
  40. statement_extractor/pipeline/registry.py +52 -46
  41. statement_extractor/plugins/__init__.py +20 -8
  42. statement_extractor/plugins/base.py +348 -78
  43. statement_extractor/plugins/extractors/gliner2.py +38 -28
  44. statement_extractor/plugins/labelers/taxonomy.py +18 -5
  45. statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
  46. statement_extractor/plugins/pdf/__init__.py +10 -0
  47. statement_extractor/plugins/pdf/pypdf.py +291 -0
  48. statement_extractor/plugins/qualifiers/__init__.py +11 -0
  49. statement_extractor/plugins/qualifiers/companies_house.py +14 -3
  50. statement_extractor/plugins/qualifiers/embedding_company.py +422 -0
  51. statement_extractor/plugins/qualifiers/gleif.py +14 -3
  52. statement_extractor/plugins/qualifiers/person.py +588 -14
  53. statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
  54. statement_extractor/plugins/scrapers/__init__.py +10 -0
  55. statement_extractor/plugins/scrapers/http.py +236 -0
  56. statement_extractor/plugins/splitters/t5_gemma.py +176 -75
  57. statement_extractor/plugins/taxonomy/embedding.py +193 -46
  58. statement_extractor/plugins/taxonomy/mnli.py +16 -4
  59. statement_extractor/scoring.py +8 -8
  60. corp_extractor-0.5.0.dist-info/RECORD +0 -55
  61. statement_extractor/plugins/canonicalizers/__init__.py +0 -17
  62. statement_extractor/plugins/canonicalizers/base.py +0 -9
  63. statement_extractor/plugins/canonicalizers/location.py +0 -219
  64. statement_extractor/plugins/canonicalizers/organization.py +0 -230
  65. statement_extractor/plugins/canonicalizers/person.py +0 -242
  66. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/WHEEL +0 -0
  67. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/entry_points.txt +0 -0
@@ -2,11 +2,11 @@
2
2
  ExtractionPipeline - Main orchestrator for the 5-stage extraction pipeline.
3
3
 
4
4
  Coordinates the flow of data through all pipeline stages:
5
- 1. Splitting: Text → RawTriple
6
- 2. Extraction: RawTriple → PipelineStatement
7
- 3. Qualification: Entity → QualifiedEntity
8
- 4. Canonicalization: QualifiedEntity → CanonicalEntity
9
- 5. Labeling: Statement → LabeledStatement
5
+ 1. Splitting: Text → SplitSentence (atomic sentences)
6
+ 2. Extraction: SplitSentence → PipelineStatement (subject-predicate-object triples)
7
+ 3. Qualification: Entity → CanonicalEntity
8
+ 4. Labeling: Statement → LabeledStatement
9
+ 5. Taxonomy: Statement → TaxonomyResult
10
10
  """
11
11
 
12
12
  import logging
@@ -18,7 +18,6 @@ from .config import PipelineConfig, get_stage_name
18
18
  from .registry import PluginRegistry
19
19
  from ..models import (
20
20
  QualifiedEntity,
21
- EntityQualifiers,
22
21
  CanonicalEntity,
23
22
  LabeledStatement,
24
23
  TaxonomyResult,
@@ -31,8 +30,12 @@ class ExtractionPipeline:
31
30
  """
32
31
  Main pipeline orchestrator.
33
32
 
34
- Coordinates the flow of data through all 5 stages, invoking registered
35
- plugins in priority order and accumulating results in PipelineContext.
33
+ Coordinates the flow of data through all 5 stages:
34
+ 1. Splitting: Text → SplitSentence (using splitter plugins)
35
+ 2. Extraction: SplitSentence → PipelineStatement (using extractor plugins)
36
+ 3. Qualification: Entity → CanonicalEntity (using qualifier + canonicalizer plugins)
37
+ 4. Labeling: Statement → LabeledStatement (using labeler plugins)
38
+ 5. Taxonomy: Statement → TaxonomyResult (using taxonomy plugins)
36
39
  """
37
40
 
38
41
  def __init__(self, config: Optional[PipelineConfig] = None):
@@ -86,20 +89,16 @@ class ExtractionPipeline:
86
89
  if self.config.is_stage_enabled(2):
87
90
  ctx = self._run_extraction(ctx)
88
91
 
89
- # Stage 3: Qualification
92
+ # Stage 3: Qualification (runs qualifiers + canonicalizers)
90
93
  if self.config.is_stage_enabled(3):
91
94
  ctx = self._run_qualification(ctx)
92
95
 
93
- # Stage 4: Canonicalization
96
+ # Stage 4: Labeling
94
97
  if self.config.is_stage_enabled(4):
95
- ctx = self._run_canonicalization(ctx)
96
-
97
- # Stage 5: Labeling
98
- if self.config.is_stage_enabled(5):
99
98
  ctx = self._run_labeling(ctx)
100
99
 
101
- # Stage 6: Taxonomy classification
102
- if self.config.is_stage_enabled(6):
100
+ # Stage 5: Taxonomy classification
101
+ if self.config.is_stage_enabled(5):
103
102
  ctx = self._run_taxonomy(ctx)
104
103
 
105
104
  except Exception as e:
@@ -116,7 +115,7 @@ class ExtractionPipeline:
116
115
  return ctx
117
116
 
118
117
  def _run_splitting(self, ctx: PipelineContext) -> PipelineContext:
119
- """Stage 1: Split text into raw triples."""
118
+ """Stage 1: Split text into atomic sentences."""
120
119
  stage_name = get_stage_name(1)
121
120
  logger.debug(f"Running {stage_name} stage")
122
121
  start_time = time.time()
@@ -133,9 +132,9 @@ class ExtractionPipeline:
133
132
 
134
133
  logger.debug(f"Using splitter: {splitter.name}")
135
134
  try:
136
- raw_triples = splitter.split(ctx.source_text, ctx)
137
- ctx.raw_triples = raw_triples
138
- logger.info(f"Splitting produced {len(raw_triples)} raw triples")
135
+ split_sentences = splitter.split(ctx.source_text, ctx)
136
+ ctx.split_sentences = split_sentences
137
+ logger.info(f"Splitting produced {len(split_sentences)} sentences")
139
138
  break
140
139
  except Exception as e:
141
140
  logger.exception(f"Splitter {splitter.name} failed")
@@ -147,13 +146,13 @@ class ExtractionPipeline:
147
146
  return ctx
148
147
 
149
148
  def _run_extraction(self, ctx: PipelineContext) -> PipelineContext:
150
- """Stage 2: Extract statements with typed entities from raw triples."""
149
+ """Stage 2: Extract subject-predicate-object triples from split sentences."""
151
150
  stage_name = get_stage_name(2)
152
151
  logger.debug(f"Running {stage_name} stage")
153
152
  start_time = time.time()
154
153
 
155
- if not ctx.raw_triples:
156
- logger.debug("No raw triples to extract from")
154
+ if not ctx.split_sentences:
155
+ logger.debug("No split sentences to extract from")
157
156
  return ctx
158
157
 
159
158
  extractors = PluginRegistry.get_extractors()
@@ -178,7 +177,7 @@ class ExtractionPipeline:
178
177
 
179
178
  logger.debug(f"Using extractor: {extractor.name}")
180
179
  try:
181
- statements = extractor.extract(ctx.raw_triples, ctx)
180
+ statements = extractor.extract(ctx.split_sentences, ctx)
182
181
  ctx.statements = statements
183
182
  logger.info(f"Extraction produced {len(statements)} statements")
184
183
  break
@@ -211,7 +210,12 @@ class ExtractionPipeline:
211
210
  return schemas
212
211
 
213
212
  def _run_qualification(self, ctx: PipelineContext) -> PipelineContext:
214
- """Stage 3: Add qualifiers to entities."""
213
+ """
214
+ Stage 3: Qualify entities with identifiers, canonical names, and FQNs.
215
+
216
+ Runs qualifier plugins for each entity type. Qualifier plugins now return
217
+ CanonicalEntity directly (with qualifiers, canonical match, and FQN).
218
+ """
215
219
  stage_name = get_stage_name(3)
216
220
  logger.debug(f"Running {stage_name} stage")
217
221
  start_time = time.time()
@@ -227,14 +231,15 @@ class ExtractionPipeline:
227
231
  if entity.entity_ref not in entities_to_qualify:
228
232
  entities_to_qualify[entity.entity_ref] = entity
229
233
 
230
- logger.debug(f"Qualifying {len(entities_to_qualify)} unique entities")
234
+ logger.info(f"Stage 3: Qualifying {len(entities_to_qualify)} unique entities")
231
235
 
232
- # Qualify each entity using applicable plugins
233
- for entity_ref, entity in entities_to_qualify.items():
234
- qualifiers = EntityQualifiers()
235
- sources = []
236
+ # Process each entity through qualifier plugins
237
+ entities_list = list(entities_to_qualify.items())
238
+ for idx, (entity_ref, entity) in enumerate(entities_list, 1):
239
+ logger.info(f" [{idx}/{len(entities_list)}] Qualifying '{entity.text}' ({entity.type.value})")
236
240
 
237
- # Get qualifiers for this entity type
241
+ # Run qualifier plugins - first one to return a result wins
242
+ canonical = None
238
243
  type_qualifiers = PluginRegistry.get_qualifiers_for_type(entity.type)
239
244
 
240
245
  for qualifier_plugin in type_qualifiers:
@@ -242,86 +247,36 @@ class ExtractionPipeline:
242
247
  continue
243
248
 
244
249
  try:
245
- plugin_qualifiers = qualifier_plugin.qualify(entity, ctx)
246
- if plugin_qualifiers and plugin_qualifiers.has_any_qualifier():
247
- qualifiers = qualifiers.merge_with(plugin_qualifiers)
248
- sources.append(qualifier_plugin.name)
250
+ result = qualifier_plugin.qualify(entity, ctx)
251
+ if result is not None:
252
+ canonical = result
253
+ logger.info(f" Qualified by {qualifier_plugin.name}: {canonical.fqn}")
254
+ break # Use first successful match
249
255
  except Exception as e:
250
256
  logger.error(f"Qualifier {qualifier_plugin.name} failed for {entity.text}: {e}")
251
257
  ctx.add_error(f"Qualifier {qualifier_plugin.name} failed: {str(e)}")
252
258
  if self.config.fail_fast:
253
259
  raise
254
260
 
255
- # Create QualifiedEntity
256
- qualified = QualifiedEntity(
257
- entity_ref=entity_ref,
258
- original_text=entity.text,
259
- entity_type=entity.type,
260
- qualifiers=qualifiers,
261
- qualification_sources=sources,
262
- )
263
- ctx.qualified_entities[entity_ref] = qualified
264
-
265
- logger.info(f"Qualified {len(ctx.qualified_entities)} entities")
266
- ctx.record_timing(stage_name, time.time() - start_time)
267
- return ctx
268
-
269
- def _run_canonicalization(self, ctx: PipelineContext) -> PipelineContext:
270
- """Stage 4: Resolve entities to canonical forms."""
271
- stage_name = get_stage_name(4)
272
- logger.debug(f"Running {stage_name} stage")
273
- start_time = time.time()
274
-
275
- if not ctx.qualified_entities:
276
- # Create basic qualified entities if stage 3 was skipped
277
- for stmt in ctx.statements:
278
- for entity in [stmt.subject, stmt.object]:
279
- if entity.entity_ref not in ctx.qualified_entities:
280
- ctx.qualified_entities[entity.entity_ref] = QualifiedEntity(
281
- entity_ref=entity.entity_ref,
282
- original_text=entity.text,
283
- entity_type=entity.type,
284
- )
285
-
286
- # Canonicalize each qualified entity
287
- for entity_ref, qualified in ctx.qualified_entities.items():
288
- canonical_match = None
289
- fqn = None
290
-
291
- # Get canonicalizers for this entity type
292
- type_canonicalizers = PluginRegistry.get_canonicalizers_for_type(qualified.entity_type)
293
-
294
- for canon_plugin in type_canonicalizers:
295
- if not self.config.is_plugin_enabled(canon_plugin.name):
296
- continue
297
-
298
- try:
299
- match = canon_plugin.find_canonical(qualified, ctx)
300
- if match:
301
- canonical_match = match
302
- fqn = canon_plugin.format_fqn(qualified, match)
303
- break # Use first successful match
304
- except Exception as e:
305
- logger.error(f"Canonicalizer {canon_plugin.name} failed for {qualified.original_text}: {e}")
306
- ctx.add_error(f"Canonicalizer {canon_plugin.name} failed: {str(e)}")
307
- if self.config.fail_fast:
308
- raise
261
+ # Create fallback CanonicalEntity if no plugin matched
262
+ if canonical is None:
263
+ qualified = QualifiedEntity(
264
+ entity_ref=entity_ref,
265
+ original_text=entity.text,
266
+ entity_type=entity.type,
267
+ )
268
+ canonical = CanonicalEntity.from_qualified(qualified=qualified)
269
+ logger.debug(f" No qualification found, using original text")
309
270
 
310
- # Create CanonicalEntity
311
- canonical = CanonicalEntity.from_qualified(
312
- qualified=qualified,
313
- canonical_match=canonical_match,
314
- fqn=fqn,
315
- )
316
271
  ctx.canonical_entities[entity_ref] = canonical
317
272
 
318
- logger.info(f"Canonicalized {len(ctx.canonical_entities)} entities")
273
+ logger.info(f"Qualified {len(ctx.canonical_entities)} entities")
319
274
  ctx.record_timing(stage_name, time.time() - start_time)
320
275
  return ctx
321
276
 
322
277
  def _run_labeling(self, ctx: PipelineContext) -> PipelineContext:
323
- """Stage 5: Apply labels to statements."""
324
- stage_name = get_stage_name(5)
278
+ """Stage 4: Apply labels to statements."""
279
+ stage_name = get_stage_name(4)
325
280
  logger.debug(f"Running {stage_name} stage")
326
281
  start_time = time.time()
327
282
 
@@ -329,9 +284,9 @@ class ExtractionPipeline:
329
284
  logger.debug("No statements to label")
330
285
  return ctx
331
286
 
332
- # Ensure canonical entities exist
287
+ # Ensure canonical entities exist (run qualification if skipped)
333
288
  if not ctx.canonical_entities:
334
- self._run_canonicalization(ctx)
289
+ self._run_qualification(ctx)
335
290
 
336
291
  labelers = PluginRegistry.get_labelers()
337
292
 
@@ -393,8 +348,10 @@ class ExtractionPipeline:
393
348
  return ctx
394
349
 
395
350
  def _run_taxonomy(self, ctx: PipelineContext) -> PipelineContext:
396
- """Stage 6: Classify statements against taxonomies."""
397
- stage_name = get_stage_name(6)
351
+ """Stage 5: Classify statements against taxonomies."""
352
+ from ..plugins.base import PluginCapability
353
+
354
+ stage_name = get_stage_name(5)
398
355
  logger.debug(f"Running {stage_name} stage")
399
356
  start_time = time.time()
400
357
 
@@ -408,27 +365,38 @@ class ExtractionPipeline:
408
365
  return ctx
409
366
 
410
367
  total_results = 0
411
- for labeled_stmt in ctx.labeled_statements:
412
- stmt = labeled_stmt.statement
413
- subj_canonical = labeled_stmt.subject_canonical
414
- obj_canonical = labeled_stmt.object_canonical
415
-
416
- # Apply all taxonomy classifiers
417
- for classifier in taxonomy_classifiers:
418
- if not self.config.is_plugin_enabled(classifier.name):
419
- continue
420
368
 
421
- try:
422
- results = classifier.classify(stmt, subj_canonical, obj_canonical, ctx)
369
+ # Prepare batch items: list of (statement, subject_canonical, object_canonical)
370
+ batch_items = [
371
+ (labeled_stmt.statement, labeled_stmt.subject_canonical, labeled_stmt.object_canonical)
372
+ for labeled_stmt in ctx.labeled_statements
373
+ ]
374
+
375
+ # Apply all taxonomy classifiers
376
+ for classifier in taxonomy_classifiers:
377
+ if not self.config.is_plugin_enabled(classifier.name):
378
+ continue
379
+
380
+ try:
381
+ # Require batch processing capability
382
+ if PluginCapability.BATCH_PROCESSING not in classifier.capabilities:
383
+ raise RuntimeError(
384
+ f"Taxonomy classifier '{classifier.name}' does not support batch processing. "
385
+ "Pipeline requires BATCH_PROCESSING capability for efficient GPU utilization."
386
+ )
387
+
388
+ logger.debug(f"Using batch classification for {classifier.name} ({len(batch_items)} items)")
389
+ batch_results = classifier.classify_batch(batch_items, ctx)
390
+
391
+ # Apply results to each labeled statement
392
+ for labeled_stmt, results in zip(ctx.labeled_statements, batch_results):
423
393
  if results:
424
- # Store taxonomy results in context (list of results per key)
394
+ stmt = labeled_stmt.statement
425
395
  key = (stmt.source_text, classifier.taxonomy_name)
426
396
  if key not in ctx.taxonomy_results:
427
397
  ctx.taxonomy_results[key] = []
428
398
  ctx.taxonomy_results[key].extend(results)
429
399
  total_results += len(results)
430
-
431
- # Also add to the labeled statement for easy access
432
400
  labeled_stmt.taxonomy_results.extend(results)
433
401
 
434
402
  for result in results:
@@ -436,11 +404,12 @@ class ExtractionPipeline:
436
404
  f"Taxonomy {classifier.name}: {result.full_label} "
437
405
  f"(confidence={result.confidence:.2f})"
438
406
  )
439
- except Exception as e:
440
- logger.error(f"Taxonomy classifier {classifier.name} failed: {e}")
441
- ctx.add_error(f"Taxonomy classifier {classifier.name} failed: {str(e)}")
442
- if self.config.fail_fast:
443
- raise
407
+
408
+ except Exception as e:
409
+ logger.error(f"Taxonomy classifier {classifier.name} failed: {e}")
410
+ ctx.add_error(f"Taxonomy classifier {classifier.name} failed: {str(e)}")
411
+ if self.config.fail_fast:
412
+ raise
444
413
 
445
414
  logger.info(f"Taxonomy produced {total_results} labels across {len(ctx.taxonomy_results)} statement-taxonomy pairs")
446
415
  ctx.record_timing(stage_name, time.time() - start_time)
@@ -14,9 +14,10 @@ if TYPE_CHECKING:
14
14
  BaseSplitterPlugin,
15
15
  BaseExtractorPlugin,
16
16
  BaseQualifierPlugin,
17
- BaseCanonicalizerPlugin,
18
17
  BaseLabelerPlugin,
19
18
  BaseTaxonomyPlugin,
19
+ BaseScraperPlugin,
20
+ BasePDFParserPlugin,
20
21
  )
21
22
  from ..models import EntityType
22
23
 
@@ -37,13 +38,15 @@ class PluginRegistry:
37
38
  _splitters: list["BaseSplitterPlugin"] = []
38
39
  _extractors: list["BaseExtractorPlugin"] = []
39
40
  _qualifiers: list["BaseQualifierPlugin"] = []
40
- _canonicalizers: list["BaseCanonicalizerPlugin"] = []
41
41
  _labelers: list["BaseLabelerPlugin"] = []
42
42
  _taxonomy_classifiers: list["BaseTaxonomyPlugin"] = []
43
43
 
44
+ # Content acquisition plugins
45
+ _scrapers: list["BaseScraperPlugin"] = []
46
+ _pdf_parsers: list["BasePDFParserPlugin"] = []
47
+
44
48
  # Index by entity type for quick lookup
45
49
  _qualifiers_by_type: dict["EntityType", list["BaseQualifierPlugin"]] = {}
46
- _canonicalizers_by_type: dict["EntityType", list["BaseCanonicalizerPlugin"]] = {}
47
50
 
48
51
  # Index by name for CLI lookup
49
52
  _all_plugins: dict[str, "BasePlugin"] = {}
@@ -54,11 +57,11 @@ class PluginRegistry:
54
57
  cls._splitters = []
55
58
  cls._extractors = []
56
59
  cls._qualifiers = []
57
- cls._canonicalizers = []
58
60
  cls._labelers = []
59
61
  cls._taxonomy_classifiers = []
62
+ cls._scrapers = []
63
+ cls._pdf_parsers = []
60
64
  cls._qualifiers_by_type = {}
61
- cls._canonicalizers_by_type = {}
62
65
  cls._all_plugins = {}
63
66
 
64
67
  # =========================================================================
@@ -100,25 +103,6 @@ class PluginRegistry:
100
103
  f"(priority={plugin.priority}, types={[t.value for t in plugin.supported_entity_types]})"
101
104
  )
102
105
 
103
- @classmethod
104
- def register_canonicalizer(cls, plugin: "BaseCanonicalizerPlugin") -> None:
105
- """Register a canonicalizer plugin."""
106
- cls._canonicalizers.append(plugin)
107
- cls._canonicalizers.sort(key=lambda p: p.priority)
108
- cls._all_plugins[plugin.name] = plugin
109
-
110
- # Index by entity type
111
- for entity_type in plugin.supported_entity_types:
112
- if entity_type not in cls._canonicalizers_by_type:
113
- cls._canonicalizers_by_type[entity_type] = []
114
- cls._canonicalizers_by_type[entity_type].append(plugin)
115
- cls._canonicalizers_by_type[entity_type].sort(key=lambda p: p.priority)
116
-
117
- logger.debug(
118
- f"Registered canonicalizer: {plugin.name} "
119
- f"(priority={plugin.priority}, types={[t.value for t in plugin.supported_entity_types]})"
120
- )
121
-
122
106
  @classmethod
123
107
  def register_labeler(cls, plugin: "BaseLabelerPlugin") -> None:
124
108
  """Register a labeler plugin."""
@@ -135,6 +119,22 @@ class PluginRegistry:
135
119
  cls._all_plugins[plugin.name] = plugin
136
120
  logger.debug(f"Registered taxonomy: {plugin.name} (priority={plugin.priority})")
137
121
 
122
+ @classmethod
123
+ def register_scraper(cls, plugin: "BaseScraperPlugin") -> None:
124
+ """Register a scraper plugin."""
125
+ cls._scrapers.append(plugin)
126
+ cls._scrapers.sort(key=lambda p: p.priority)
127
+ cls._all_plugins[plugin.name] = plugin
128
+ logger.debug(f"Registered scraper: {plugin.name} (priority={plugin.priority})")
129
+
130
+ @classmethod
131
+ def register_pdf_parser(cls, plugin: "BasePDFParserPlugin") -> None:
132
+ """Register a PDF parser plugin."""
133
+ cls._pdf_parsers.append(plugin)
134
+ cls._pdf_parsers.sort(key=lambda p: p.priority)
135
+ cls._all_plugins[plugin.name] = plugin
136
+ logger.debug(f"Registered PDF parser: {plugin.name} (priority={plugin.priority})")
137
+
138
138
  # =========================================================================
139
139
  # Decorator registration
140
140
  # =========================================================================
@@ -157,12 +157,6 @@ class PluginRegistry:
157
157
  cls.register_qualifier(plugin_class())
158
158
  return plugin_class
159
159
 
160
- @classmethod
161
- def canonicalizer(cls, plugin_class: Type[T]) -> Type[T]:
162
- """Decorator to register a canonicalizer plugin class."""
163
- cls.register_canonicalizer(plugin_class())
164
- return plugin_class
165
-
166
160
  @classmethod
167
161
  def labeler(cls, plugin_class: Type[T]) -> Type[T]:
168
162
  """Decorator to register a labeler plugin class."""
@@ -175,6 +169,18 @@ class PluginRegistry:
175
169
  cls.register_taxonomy(plugin_class())
176
170
  return plugin_class
177
171
 
172
+ @classmethod
173
+ def scraper(cls, plugin_class: Type[T]) -> Type[T]:
174
+ """Decorator to register a scraper plugin class."""
175
+ cls.register_scraper(plugin_class())
176
+ return plugin_class
177
+
178
+ @classmethod
179
+ def pdf_parser(cls, plugin_class: Type[T]) -> Type[T]:
180
+ """Decorator to register a PDF parser plugin class."""
181
+ cls.register_pdf_parser(plugin_class())
182
+ return plugin_class
183
+
178
184
  # =========================================================================
179
185
  # Retrieval methods
180
186
  # =========================================================================
@@ -199,16 +205,6 @@ class PluginRegistry:
199
205
  """Get qualifier plugins that support a specific entity type."""
200
206
  return cls._qualifiers_by_type.get(entity_type, []).copy()
201
207
 
202
- @classmethod
203
- def get_canonicalizers(cls) -> list["BaseCanonicalizerPlugin"]:
204
- """Get all registered canonicalizer plugins (sorted by priority)."""
205
- return cls._canonicalizers.copy()
206
-
207
- @classmethod
208
- def get_canonicalizers_for_type(cls, entity_type: "EntityType") -> list["BaseCanonicalizerPlugin"]:
209
- """Get canonicalizer plugins that support a specific entity type."""
210
- return cls._canonicalizers_by_type.get(entity_type, []).copy()
211
-
212
208
  @classmethod
213
209
  def get_labelers(cls) -> list["BaseLabelerPlugin"]:
214
210
  """Get all registered labeler plugins (sorted by priority)."""
@@ -219,6 +215,16 @@ class PluginRegistry:
219
215
  """Get all registered taxonomy classifier plugins (sorted by priority)."""
220
216
  return cls._taxonomy_classifiers.copy()
221
217
 
218
+ @classmethod
219
+ def get_scrapers(cls) -> list["BaseScraperPlugin"]:
220
+ """Get all registered scraper plugins (sorted by priority)."""
221
+ return cls._scrapers.copy()
222
+
223
+ @classmethod
224
+ def get_pdf_parsers(cls) -> list["BasePDFParserPlugin"]:
225
+ """Get all registered PDF parser plugins (sorted by priority)."""
226
+ return cls._pdf_parsers.copy()
227
+
222
228
  @classmethod
223
229
  def get_plugin(cls, name: str) -> "BasePlugin | None":
224
230
  """Get a plugin by name."""
@@ -239,10 +245,8 @@ class PluginRegistry:
239
245
  elif stage == 3:
240
246
  return cls._qualifiers.copy()
241
247
  elif stage == 4:
242
- return cls._canonicalizers.copy()
243
- elif stage == 5:
244
248
  return cls._labelers.copy()
245
- elif stage == 6:
249
+ elif stage == 5:
246
250
  return cls._taxonomy_classifiers.copy()
247
251
  return []
248
252
 
@@ -267,9 +271,11 @@ class PluginRegistry:
267
271
  (1, "splitting", cls._splitters),
268
272
  (2, "extraction", cls._extractors),
269
273
  (3, "qualification", cls._qualifiers),
270
- (4, "canonicalization", cls._canonicalizers),
271
- (5, "labeling", cls._labelers),
272
- (6, "taxonomy", cls._taxonomy_classifiers),
274
+ (4, "labeling", cls._labelers),
275
+ (5, "taxonomy", cls._taxonomy_classifiers),
276
+ # Content acquisition plugins (stage 0)
277
+ (0, "scraper", cls._scrapers),
278
+ (-1, "pdf_parser", cls._pdf_parsers),
273
279
  ]
274
280
 
275
281
  for stage_num, stage_name, plugins in plugins_by_stage:
@@ -4,10 +4,9 @@ Plugins module for the extraction pipeline.
4
4
  Contains all plugin implementations organized by stage:
5
5
  - splitters/: Stage 1 - Text to atomic triples
6
6
  - extractors/: Stage 2 - Refine entities and relations
7
- - qualifiers/: Stage 3 - Add qualifiers and identifiers
8
- - canonicalizers/: Stage 4 - Resolve canonical forms
9
- - labelers/: Stage 5 - Classify statements
10
- - taxonomy/: Stage 6 - Taxonomy classification
7
+ - qualifiers/: Stage 3 - Qualify entities (add identifiers, canonical names, FQN)
8
+ - labelers/: Stage 4 - Classify statements
9
+ - taxonomy/: Stage 5 - Taxonomy classification
11
10
  """
12
11
 
13
12
  from .base import (
@@ -16,13 +15,20 @@ from .base import (
16
15
  BaseSplitterPlugin,
17
16
  BaseExtractorPlugin,
18
17
  BaseQualifierPlugin,
19
- BaseCanonicalizerPlugin,
20
18
  BaseLabelerPlugin,
21
19
  BaseTaxonomyPlugin,
20
+ # Content acquisition plugins
21
+ ContentType,
22
+ ScraperResult,
23
+ PDFParseResult,
24
+ BaseScraperPlugin,
25
+ BasePDFParserPlugin,
22
26
  )
23
27
 
24
28
  # Import plugin modules for auto-registration
25
- from . import splitters, extractors, qualifiers, canonicalizers, labelers, taxonomy
29
+ from . import splitters, extractors, qualifiers, labelers, taxonomy
30
+ # Content acquisition plugins
31
+ from . import scrapers, pdf
26
32
 
27
33
  __all__ = [
28
34
  "PluginCapability",
@@ -30,14 +36,20 @@ __all__ = [
30
36
  "BaseSplitterPlugin",
31
37
  "BaseExtractorPlugin",
32
38
  "BaseQualifierPlugin",
33
- "BaseCanonicalizerPlugin",
34
39
  "BaseLabelerPlugin",
35
40
  "BaseTaxonomyPlugin",
41
+ # Content acquisition plugins
42
+ "ContentType",
43
+ "ScraperResult",
44
+ "PDFParseResult",
45
+ "BaseScraperPlugin",
46
+ "BasePDFParserPlugin",
36
47
  # Plugin modules
37
48
  "splitters",
38
49
  "extractors",
39
50
  "qualifiers",
40
- "canonicalizers",
41
51
  "labelers",
42
52
  "taxonomy",
53
+ "scrapers",
54
+ "pdf",
43
55
  ]