corp-extractor 0.4.0-py3-none-any.whl → 0.9.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +348 -64
  2. corp_extractor-0.9.0.dist-info/RECORD +76 -0
  3. statement_extractor/__init__.py +10 -1
  4. statement_extractor/cli.py +1663 -17
  5. statement_extractor/data/default_predicates.json +368 -0
  6. statement_extractor/data/statement_taxonomy.json +6972 -0
  7. statement_extractor/database/__init__.py +52 -0
  8. statement_extractor/database/embeddings.py +186 -0
  9. statement_extractor/database/hub.py +520 -0
  10. statement_extractor/database/importers/__init__.py +24 -0
  11. statement_extractor/database/importers/companies_house.py +545 -0
  12. statement_extractor/database/importers/gleif.py +538 -0
  13. statement_extractor/database/importers/sec_edgar.py +375 -0
  14. statement_extractor/database/importers/wikidata.py +1012 -0
  15. statement_extractor/database/importers/wikidata_people.py +632 -0
  16. statement_extractor/database/models.py +230 -0
  17. statement_extractor/database/resolver.py +245 -0
  18. statement_extractor/database/store.py +1609 -0
  19. statement_extractor/document/__init__.py +62 -0
  20. statement_extractor/document/chunker.py +410 -0
  21. statement_extractor/document/context.py +171 -0
  22. statement_extractor/document/deduplicator.py +173 -0
  23. statement_extractor/document/html_extractor.py +246 -0
  24. statement_extractor/document/loader.py +303 -0
  25. statement_extractor/document/pipeline.py +388 -0
  26. statement_extractor/document/summarizer.py +195 -0
  27. statement_extractor/extractor.py +1 -23
  28. statement_extractor/gliner_extraction.py +4 -74
  29. statement_extractor/llm.py +255 -0
  30. statement_extractor/models/__init__.py +89 -0
  31. statement_extractor/models/canonical.py +182 -0
  32. statement_extractor/models/document.py +308 -0
  33. statement_extractor/models/entity.py +102 -0
  34. statement_extractor/models/labels.py +220 -0
  35. statement_extractor/models/qualifiers.py +139 -0
  36. statement_extractor/models/statement.py +101 -0
  37. statement_extractor/models.py +4 -1
  38. statement_extractor/pipeline/__init__.py +39 -0
  39. statement_extractor/pipeline/config.py +129 -0
  40. statement_extractor/pipeline/context.py +177 -0
  41. statement_extractor/pipeline/orchestrator.py +416 -0
  42. statement_extractor/pipeline/registry.py +303 -0
  43. statement_extractor/plugins/__init__.py +55 -0
  44. statement_extractor/plugins/base.py +716 -0
  45. statement_extractor/plugins/extractors/__init__.py +13 -0
  46. statement_extractor/plugins/extractors/base.py +9 -0
  47. statement_extractor/plugins/extractors/gliner2.py +546 -0
  48. statement_extractor/plugins/labelers/__init__.py +29 -0
  49. statement_extractor/plugins/labelers/base.py +9 -0
  50. statement_extractor/plugins/labelers/confidence.py +138 -0
  51. statement_extractor/plugins/labelers/relation_type.py +87 -0
  52. statement_extractor/plugins/labelers/sentiment.py +159 -0
  53. statement_extractor/plugins/labelers/taxonomy.py +386 -0
  54. statement_extractor/plugins/labelers/taxonomy_embedding.py +477 -0
  55. statement_extractor/plugins/pdf/__init__.py +10 -0
  56. statement_extractor/plugins/pdf/pypdf.py +291 -0
  57. statement_extractor/plugins/qualifiers/__init__.py +30 -0
  58. statement_extractor/plugins/qualifiers/base.py +9 -0
  59. statement_extractor/plugins/qualifiers/companies_house.py +185 -0
  60. statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
  61. statement_extractor/plugins/qualifiers/gleif.py +197 -0
  62. statement_extractor/plugins/qualifiers/person.py +785 -0
  63. statement_extractor/plugins/qualifiers/sec_edgar.py +209 -0
  64. statement_extractor/plugins/scrapers/__init__.py +10 -0
  65. statement_extractor/plugins/scrapers/http.py +236 -0
  66. statement_extractor/plugins/splitters/__init__.py +13 -0
  67. statement_extractor/plugins/splitters/base.py +9 -0
  68. statement_extractor/plugins/splitters/t5_gemma.py +293 -0
  69. statement_extractor/plugins/taxonomy/__init__.py +13 -0
  70. statement_extractor/plugins/taxonomy/embedding.py +484 -0
  71. statement_extractor/plugins/taxonomy/mnli.py +291 -0
  72. statement_extractor/scoring.py +8 -8
  73. corp_extractor-0.4.0.dist-info/RECORD +0 -12
  74. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
  75. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
statement_extractor/pipeline/context.py
@@ -0,0 +1,177 @@
+"""
+PipelineContext - Data container that flows through all pipeline stages.
+
+The context accumulates outputs from each stage:
+- Stage 1 (Splitting): raw_triples
+- Stage 2 (Extraction): statements
+- Stage 3 (Qualification): qualified_entities and canonical_entities
+- Stage 4 (Labeling): labeled_statements
+- Stage 5 (Taxonomy): taxonomy_results
+"""
+
+from typing import Any, Optional
+
+from pydantic import BaseModel, Field
+
+from ..models import (
+    RawTriple,
+    PipelineStatement,
+    QualifiedEntity,
+    CanonicalEntity,
+    LabeledStatement,
+    TaxonomyResult,
+)
+
+
+class PipelineContext(BaseModel):
+    """
+    Context object that flows through all pipeline stages.
+
+    Accumulates outputs from each stage and provides access to
+    source text, metadata, and intermediate results.
+    """
+    # Input
+    source_text: str = Field(..., description="Original input text")
+    source_metadata: dict[str, Any] = Field(
+        default_factory=dict,
+        description="Metadata about the source (e.g., document ID, URL, timestamp)"
+    )
+
+    # Stage 1 output: Raw triples from splitting
+    raw_triples: list[RawTriple] = Field(
+        default_factory=list,
+        description="Raw triples from Stage 1 (Splitting)"
+    )
+
+    # Stage 2 output: Statements with extracted entities
+    statements: list[PipelineStatement] = Field(
+        default_factory=list,
+        description="Statements from Stage 2 (Extraction)"
+    )
+
+    # Stage 3 output: Qualified entities (keyed by entity_ref)
+    qualified_entities: dict[str, QualifiedEntity] = Field(
+        default_factory=dict,
+        description="Qualified entities from Stage 3 (Qualification)"
+    )
+
+    # Stage 3 output: Canonical entities (keyed by entity_ref)
+    canonical_entities: dict[str, CanonicalEntity] = Field(
+        default_factory=dict,
+        description="Canonical entities from Stage 3 (Qualification)"
+    )
+
+    # Stage 4 output: Final labeled statements
+    labeled_statements: list[LabeledStatement] = Field(
+        default_factory=list,
+        description="Final labeled statements from Stage 4 (Labeling)"
+    )
+
+    # Classification results from extractor (populated by GLiNER2 or similar)
+    # Keyed by source_text -> label_type -> (label_value, confidence)
+    classification_results: dict[str, dict[str, tuple[str, float]]] = Field(
+        default_factory=dict,
+        description="Pre-computed classification results from Stage 2 extractor"
+    )
+
+    # Stage 5 output: Taxonomy classifications
+    # Keyed by (source_text, taxonomy_name) -> list of TaxonomyResult
+    # Multiple labels may match a single statement above threshold
+    taxonomy_results: dict[tuple[str, str], list[TaxonomyResult]] = Field(
+        default_factory=dict,
+        description="Taxonomy classifications from Stage 5 (multiple labels per statement)"
+    )
+
+    # Processing metadata
+    processing_errors: list[str] = Field(
+        default_factory=list,
+        description="Errors encountered during processing"
+    )
+    processing_warnings: list[str] = Field(
+        default_factory=list,
+        description="Warnings generated during processing"
+    )
+    stage_timings: dict[str, float] = Field(
+        default_factory=dict,
+        description="Timing information for each stage (stage_name -> seconds)"
+    )
+
+    def add_error(self, error: str) -> None:
+        """Add a processing error."""
+        self.processing_errors.append(error)
+
+    def add_warning(self, warning: str) -> None:
+        """Add a processing warning."""
+        self.processing_warnings.append(warning)
+
+    def record_timing(self, stage: str, duration: float) -> None:
+        """Record timing for a stage."""
+        self.stage_timings[stage] = duration
+
+    def get_entity_refs(self) -> set[str]:
+        """Get all unique entity refs from statements."""
+        refs = set()
+        for stmt in self.statements:
+            refs.add(stmt.subject.entity_ref)
+            refs.add(stmt.object.entity_ref)
+        return refs
+
+    def get_qualified_entity(self, entity_ref: str) -> Optional[QualifiedEntity]:
+        """Get qualified entity by ref, or None if not found."""
+        return self.qualified_entities.get(entity_ref)
+
+    def get_canonical_entity(self, entity_ref: str) -> Optional[CanonicalEntity]:
+        """Get canonical entity by ref, or None if not found."""
+        return self.canonical_entities.get(entity_ref)
+
+    def get_classification(
+        self,
+        source_text: str,
+        label_type: str,
+    ) -> Optional[tuple[str, float]]:
+        """
+        Get pre-computed classification result for a source text.
+
+        Args:
+            source_text: The source text that was classified
+            label_type: The type of label (e.g., "sentiment")
+
+        Returns:
+            Tuple of (label_value, confidence) or None if not found
+        """
+        if source_text in self.classification_results:
+            return self.classification_results[source_text].get(label_type)
+        return None
+
+    def set_classification(
+        self,
+        source_text: str,
+        label_type: str,
+        label_value: str,
+        confidence: float,
+    ) -> None:
+        """
+        Store a classification result for a source text.
+
+        Args:
+            source_text: The source text that was classified
+            label_type: The type of label (e.g., "sentiment")
+            label_value: The classification result (e.g., "positive")
+            confidence: Confidence score (0.0 to 1.0)
+        """
+        if source_text not in self.classification_results:
+            self.classification_results[source_text] = {}
+        self.classification_results[source_text][label_type] = (label_value, confidence)

+    @property
+    def has_errors(self) -> bool:
+        """Check if any errors occurred during processing."""
+        return len(self.processing_errors) > 0
+
+    @property
+    def statement_count(self) -> int:
+        """Get the number of statements in the final output."""
+        return len(self.labeled_statements) if self.labeled_statements else len(self.statements)
+
+    class Config:
+        arbitrary_types_allowed = True
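
For orientation, here is a minimal usage sketch of the PipelineContext added above. It is illustrative only: it exercises just the fields and methods visible in this diff, and the import path is inferred from the file layout in the change list.

    from statement_extractor.pipeline.context import PipelineContext

    ctx = PipelineContext(
        source_text="Acme Corp acquired Widget Ltd in 2023.",
        source_metadata={"document_id": "doc-1"},  # hypothetical metadata key
    )

    # Stage plugins write into the context as they run; for example, a labeler
    # can read back a classification the Stage 2 extractor already computed.
    ctx.set_classification(ctx.source_text, "sentiment", "positive", 0.91)
    assert ctx.get_classification(ctx.source_text, "sentiment") == ("positive", 0.91)

    ctx.record_timing("splitting", 0.42)
    print(ctx.has_errors)       # False - nothing has called add_error()
    print(ctx.statement_count)  # 0 - falls back to len(ctx.statements)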
statement_extractor/pipeline/orchestrator.py
@@ -0,0 +1,416 @@
+"""
+ExtractionPipeline - Main orchestrator for the 5-stage extraction pipeline.
+
+Coordinates the flow of data through all pipeline stages:
+1. Splitting: Text → RawTriple
+2. Extraction: RawTriple → PipelineStatement
+3. Qualification: Entity → CanonicalEntity
+4. Labeling: Statement → LabeledStatement
+5. Taxonomy: Statement → TaxonomyResult
+"""
+
+import logging
+import time
+from typing import Any, Optional
+
+from .context import PipelineContext
+from .config import PipelineConfig, get_stage_name
+from .registry import PluginRegistry
+from ..models import (
+    QualifiedEntity,
+    CanonicalEntity,
+    LabeledStatement,
+    TaxonomyResult,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class ExtractionPipeline:
+    """
+    Main pipeline orchestrator.
+
+    Coordinates the flow of data through all 5 stages:
+    1. Splitting: Text → RawTriple (using splitter plugins)
+    2. Extraction: RawTriple → PipelineStatement (using extractor plugins)
+    3. Qualification: Entity → CanonicalEntity (using qualifier + canonicalizer plugins)
+    4. Labeling: Statement → LabeledStatement (using labeler plugins)
+    5. Taxonomy: Statement → TaxonomyResult (using taxonomy plugins)
+    """
+
+    def __init__(self, config: Optional[PipelineConfig] = None):
+        """
+        Initialize the pipeline.
+
+        Args:
+            config: Pipeline configuration (uses defaults if not provided)
+        """
+        self.config = config or PipelineConfig.default()
+
+    def process(
+        self,
+        text: str,
+        metadata: Optional[dict[str, Any]] = None,
+    ) -> PipelineContext:
+        """
+        Process text through the extraction pipeline.
+
+        Args:
+            text: Input text to process
+            metadata: Optional metadata about the source
+
+        Returns:
+            PipelineContext with accumulated results from all stages
+        """
+        # Merge config options into metadata for plugins
+        combined_metadata = metadata.copy() if metadata else {}
+
+        # Pass extractor options from config to context
+        if self.config.extractor_options:
+            existing_extractor_opts = combined_metadata.get("extractor_options", {})
+            combined_metadata["extractor_options"] = {
+                **self.config.extractor_options,
+                **existing_extractor_opts,  # Allow explicit metadata to override config
+            }
+
+        ctx = PipelineContext(
+            source_text=text,
+            source_metadata=combined_metadata,
+        )
+
+        logger.info(f"Starting pipeline processing: {len(text)} chars")
+
+        try:
+            # Stage 1: Splitting
+            if self.config.is_stage_enabled(1):
+                ctx = self._run_splitting(ctx)
+
+            # Stage 2: Extraction
+            if self.config.is_stage_enabled(2):
+                ctx = self._run_extraction(ctx)
+
+            # Stage 3: Qualification (runs qualifiers + canonicalizers)
+            if self.config.is_stage_enabled(3):
+                ctx = self._run_qualification(ctx)
+
+            # Stage 4: Labeling
+            if self.config.is_stage_enabled(4):
+                ctx = self._run_labeling(ctx)
+
+            # Stage 5: Taxonomy classification
+            if self.config.is_stage_enabled(5):
+                ctx = self._run_taxonomy(ctx)
+
+        except Exception as e:
+            logger.exception("Pipeline processing failed")
+            ctx.add_error(f"Pipeline error: {str(e)}")
+            if self.config.fail_fast:
+                raise
+
+        logger.info(
+            f"Pipeline complete: {ctx.statement_count} statements, "
+            f"{len(ctx.processing_errors)} errors"
+        )
+
+        return ctx
+
+    def _run_splitting(self, ctx: PipelineContext) -> PipelineContext:
+        """Stage 1: Split text into raw triples."""
+        stage_name = get_stage_name(1)
+        logger.debug(f"Running {stage_name} stage")
+        start_time = time.time()
+
+        splitters = PluginRegistry.get_splitters()
+        if not splitters:
+            ctx.add_warning("No splitter plugins registered")
+            return ctx
+
+        # Use first enabled splitter (highest priority)
+        for splitter in splitters:
+            if not self.config.is_plugin_enabled(splitter.name):
+                continue
+
+            logger.debug(f"Using splitter: {splitter.name}")
+            try:
+                raw_triples = splitter.split(ctx.source_text, ctx)
+                ctx.raw_triples = raw_triples
+                logger.info(f"Splitting produced {len(raw_triples)} raw triples")
+                break
+            except Exception as e:
+                logger.exception(f"Splitter {splitter.name} failed")
+                ctx.add_error(f"Splitter {splitter.name} failed: {str(e)}")
+                if self.config.fail_fast:
+                    raise
+
+        ctx.record_timing(stage_name, time.time() - start_time)
+        return ctx
+
+    def _run_extraction(self, ctx: PipelineContext) -> PipelineContext:
+        """Stage 2: Extract statements with typed entities from raw triples."""
+        stage_name = get_stage_name(2)
+        logger.debug(f"Running {stage_name} stage")
+        start_time = time.time()
+
+        if not ctx.raw_triples:
+            logger.debug("No raw triples to extract from")
+            return ctx
+
+        extractors = PluginRegistry.get_extractors()
+        if not extractors:
+            ctx.add_warning("No extractor plugins registered")
+            return ctx
+
+        # Collect classification schemas from labelers for the extractor
+        classification_schemas = self._collect_classification_schemas()
+        if classification_schemas:
+            logger.debug(f"Collected {len(classification_schemas)} classification schemas from labelers")
+
+        # Use first enabled extractor (highest priority)
+        for extractor in extractors:
+            if not self.config.is_plugin_enabled(extractor.name):
+                continue
+
+            # Pass classification schemas to extractor if it supports them
+            if classification_schemas and hasattr(extractor, 'add_classification_schema'):
+                for schema in classification_schemas:
+                    extractor.add_classification_schema(schema)
+
+            logger.debug(f"Using extractor: {extractor.name}")
+            try:
+                statements = extractor.extract(ctx.raw_triples, ctx)
+                ctx.statements = statements
+                logger.info(f"Extraction produced {len(statements)} statements")
+                break
+            except Exception as e:
+                logger.exception(f"Extractor {extractor.name} failed")
+                ctx.add_error(f"Extractor {extractor.name} failed: {str(e)}")
+                if self.config.fail_fast:
+                    raise
+
+        ctx.record_timing(stage_name, time.time() - start_time)
+        return ctx
+
+    def _collect_classification_schemas(self) -> list:
+        """Collect classification schemas from enabled labelers."""
+        schemas = []
+        labelers = PluginRegistry.get_labelers()
+
+        for labeler in labelers:
+            if not self.config.is_plugin_enabled(labeler.name):
+                continue
+
+            # Check for classification schema (simple multi-choice)
+            if hasattr(labeler, 'classification_schema') and labeler.classification_schema:
+                schemas.append(labeler.classification_schema)
+                logger.debug(
+                    f"Labeler {labeler.name} provides classification schema: "
+                    f"{labeler.classification_schema}"
+                )
+
+        return schemas
+
+    def _run_qualification(self, ctx: PipelineContext) -> PipelineContext:
+        """
+        Stage 3: Qualify entities with identifiers, canonical names, and FQNs.
+
+        Runs qualifier plugins for each entity type. Qualifier plugins return
+        CanonicalEntity directly (with qualifiers, canonical match, and FQN).
+        """
+        stage_name = get_stage_name(3)
+        logger.debug(f"Running {stage_name} stage")
+        start_time = time.time()
+
+        if not ctx.statements:
+            logger.debug("No statements to qualify")
+            return ctx
+
+        # Collect all unique entities from statements
+        entities_to_qualify = {}
+        for stmt in ctx.statements:
+            for entity in [stmt.subject, stmt.object]:
+                if entity.entity_ref not in entities_to_qualify:
+                    entities_to_qualify[entity.entity_ref] = entity
+
+        logger.info(f"Stage 3: Qualifying {len(entities_to_qualify)} unique entities")
+
+        # Process each entity through qualifier plugins
+        entities_list = list(entities_to_qualify.items())
+        for idx, (entity_ref, entity) in enumerate(entities_list, 1):
+            logger.info(f"  [{idx}/{len(entities_list)}] Qualifying '{entity.text}' ({entity.type.value})")
+
+            # Run qualifier plugins - first one to return a result wins
+            canonical = None
+            type_qualifiers = PluginRegistry.get_qualifiers_for_type(entity.type)
+
+            for qualifier_plugin in type_qualifiers:
+                if not self.config.is_plugin_enabled(qualifier_plugin.name):
+                    continue
+
+                try:
+                    result = qualifier_plugin.qualify(entity, ctx)
+                    if result is not None:
+                        canonical = result
+                        logger.info(f"    Qualified by {qualifier_plugin.name}: {canonical.fqn}")
+                        break  # Use first successful match
+                except Exception as e:
+                    logger.error(f"Qualifier {qualifier_plugin.name} failed for {entity.text}: {e}")
+                    ctx.add_error(f"Qualifier {qualifier_plugin.name} failed: {str(e)}")
+                    if self.config.fail_fast:
+                        raise
+
+            # Create fallback CanonicalEntity if no plugin matched
+            if canonical is None:
+                qualified = QualifiedEntity(
+                    entity_ref=entity_ref,
+                    original_text=entity.text,
+                    entity_type=entity.type,
+                )
+                canonical = CanonicalEntity.from_qualified(qualified=qualified)
+                logger.debug("    No qualification found, using original text")
+
+            ctx.canonical_entities[entity_ref] = canonical
+
+        logger.info(f"Qualified {len(ctx.canonical_entities)} entities")
+        ctx.record_timing(stage_name, time.time() - start_time)
+        return ctx
+
+    def _run_labeling(self, ctx: PipelineContext) -> PipelineContext:
+        """Stage 4: Apply labels to statements."""
+        stage_name = get_stage_name(4)
+        logger.debug(f"Running {stage_name} stage")
+        start_time = time.time()
+
+        if not ctx.statements:
+            logger.debug("No statements to label")
+            return ctx
+
+        # Ensure canonical entities exist (run qualification if skipped)
+        if not ctx.canonical_entities:
+            self._run_qualification(ctx)
+
+        labelers = PluginRegistry.get_labelers()
+
+        for stmt in ctx.statements:
+            # Get canonical entities
+            subj_canonical = ctx.canonical_entities.get(stmt.subject.entity_ref)
+            obj_canonical = ctx.canonical_entities.get(stmt.object.entity_ref)
+
+            if not subj_canonical or not obj_canonical:
+                # Create fallback canonical entities
+                if not subj_canonical:
+                    subj_qualified = ctx.qualified_entities.get(
+                        stmt.subject.entity_ref,
+                        QualifiedEntity(
+                            entity_ref=stmt.subject.entity_ref,
+                            original_text=stmt.subject.text,
+                            entity_type=stmt.subject.type,
+                        )
+                    )
+                    subj_canonical = CanonicalEntity.from_qualified(subj_qualified)
+
+                if not obj_canonical:
+                    obj_qualified = ctx.qualified_entities.get(
+                        stmt.object.entity_ref,
+                        QualifiedEntity(
+                            entity_ref=stmt.object.entity_ref,
+                            original_text=stmt.object.text,
+                            entity_type=stmt.object.type,
+                        )
+                    )
+                    obj_canonical = CanonicalEntity.from_qualified(obj_qualified)
+
+            # Create labeled statement
+            labeled = LabeledStatement(
+                statement=stmt,
+                subject_canonical=subj_canonical,
+                object_canonical=obj_canonical,
+            )
+
+            # Apply all labelers
+            for labeler in labelers:
+                if not self.config.is_plugin_enabled(labeler.name):
+                    continue
+
+                try:
+                    label = labeler.label(stmt, subj_canonical, obj_canonical, ctx)
+                    if label:
+                        labeled.add_label(label)
+                except Exception as e:
+                    logger.error(f"Labeler {labeler.name} failed: {e}")
+                    ctx.add_error(f"Labeler {labeler.name} failed: {str(e)}")
+                    if self.config.fail_fast:
+                        raise
+
+            ctx.labeled_statements.append(labeled)
+
+        logger.info(f"Labeled {len(ctx.labeled_statements)} statements")
+        ctx.record_timing(stage_name, time.time() - start_time)
+        return ctx
+
+    def _run_taxonomy(self, ctx: PipelineContext) -> PipelineContext:
+        """Stage 5: Classify statements against taxonomies."""
+        from ..plugins.base import PluginCapability
+
+        stage_name = get_stage_name(5)
+        logger.debug(f"Running {stage_name} stage")
+        start_time = time.time()
+
+        if not ctx.labeled_statements:
+            logger.debug("No labeled statements to classify")
+            return ctx
+
+        taxonomy_classifiers = PluginRegistry.get_taxonomy_classifiers()
+        if not taxonomy_classifiers:
+            logger.debug("No taxonomy classifiers registered")
+            return ctx
+
+        total_results = 0
+
+        # Prepare batch items: list of (statement, subject_canonical, object_canonical)
+        batch_items = [
+            (labeled_stmt.statement, labeled_stmt.subject_canonical, labeled_stmt.object_canonical)
+            for labeled_stmt in ctx.labeled_statements
+        ]
+
+        # Apply all taxonomy classifiers
+        for classifier in taxonomy_classifiers:
+            if not self.config.is_plugin_enabled(classifier.name):
+                continue
+
+            try:
+                # Require batch processing capability
+                if PluginCapability.BATCH_PROCESSING not in classifier.capabilities:
+                    raise RuntimeError(
+                        f"Taxonomy classifier '{classifier.name}' does not support batch processing. "
+                        "Pipeline requires BATCH_PROCESSING capability for efficient GPU utilization."
+                    )
+
+                logger.debug(f"Using batch classification for {classifier.name} ({len(batch_items)} items)")
+                batch_results = classifier.classify_batch(batch_items, ctx)
+
+                # Apply results to each labeled statement
+                for labeled_stmt, results in zip(ctx.labeled_statements, batch_results):
+                    if results:
+                        stmt = labeled_stmt.statement
+                        key = (stmt.source_text, classifier.taxonomy_name)
+                        if key not in ctx.taxonomy_results:
+                            ctx.taxonomy_results[key] = []
+                        ctx.taxonomy_results[key].extend(results)
+                        total_results += len(results)
+                        labeled_stmt.taxonomy_results.extend(results)
+
+                        for result in results:
+                            logger.debug(
+                                f"Taxonomy {classifier.name}: {result.full_label} "
+                                f"(confidence={result.confidence:.2f})"
+                            )
+
+            except Exception as e:
+                logger.error(f"Taxonomy classifier {classifier.name} failed: {e}")
+                ctx.add_error(f"Taxonomy classifier {classifier.name} failed: {str(e)}")
+                if self.config.fail_fast:
+                    raise
+
+        logger.info(f"Taxonomy produced {total_results} labels across {len(ctx.taxonomy_results)} statement-taxonomy pairs")
+        ctx.record_timing(stage_name, time.time() - start_time)
+        return ctx
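
A hedged end-to-end sketch of driving the orchestrator above. It assumes the built-in plugins register themselves with PluginRegistry at import time (registry.py is listed in this diff but not shown) and infers the import paths from the file layout; the sample text and metadata are illustrative only.

    from statement_extractor.pipeline.config import PipelineConfig
    from statement_extractor.pipeline.orchestrator import ExtractionPipeline

    # PipelineConfig.default() supplies the default configuration; stage and
    # plugin toggles live in pipeline/config.py (listed above, not shown).
    pipeline = ExtractionPipeline(config=PipelineConfig.default())

    ctx = pipeline.process(
        "Acme Corp acquired Widget Ltd in 2023.",
        metadata={"url": "https://example.com/news"},  # hypothetical metadata
    )

    # process() returns the accumulated PipelineContext rather than a bare list.
    for labeled in ctx.labeled_statements:
        print(labeled.statement, labeled.subject_canonical.fqn)
    if ctx.has_errors:
        print(ctx.processing_errors)

Note that errors are accumulated on the context (with fail_fast as the opt-in escape hatch), so a single failing plugin degrades the result rather than aborting the whole run.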