corp-extractor 0.4.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +348 -64
  2. corp_extractor-0.9.0.dist-info/RECORD +76 -0
  3. statement_extractor/__init__.py +10 -1
  4. statement_extractor/cli.py +1663 -17
  5. statement_extractor/data/default_predicates.json +368 -0
  6. statement_extractor/data/statement_taxonomy.json +6972 -0
  7. statement_extractor/database/__init__.py +52 -0
  8. statement_extractor/database/embeddings.py +186 -0
  9. statement_extractor/database/hub.py +520 -0
  10. statement_extractor/database/importers/__init__.py +24 -0
  11. statement_extractor/database/importers/companies_house.py +545 -0
  12. statement_extractor/database/importers/gleif.py +538 -0
  13. statement_extractor/database/importers/sec_edgar.py +375 -0
  14. statement_extractor/database/importers/wikidata.py +1012 -0
  15. statement_extractor/database/importers/wikidata_people.py +632 -0
  16. statement_extractor/database/models.py +230 -0
  17. statement_extractor/database/resolver.py +245 -0
  18. statement_extractor/database/store.py +1609 -0
  19. statement_extractor/document/__init__.py +62 -0
  20. statement_extractor/document/chunker.py +410 -0
  21. statement_extractor/document/context.py +171 -0
  22. statement_extractor/document/deduplicator.py +173 -0
  23. statement_extractor/document/html_extractor.py +246 -0
  24. statement_extractor/document/loader.py +303 -0
  25. statement_extractor/document/pipeline.py +388 -0
  26. statement_extractor/document/summarizer.py +195 -0
  27. statement_extractor/extractor.py +1 -23
  28. statement_extractor/gliner_extraction.py +4 -74
  29. statement_extractor/llm.py +255 -0
  30. statement_extractor/models/__init__.py +89 -0
  31. statement_extractor/models/canonical.py +182 -0
  32. statement_extractor/models/document.py +308 -0
  33. statement_extractor/models/entity.py +102 -0
  34. statement_extractor/models/labels.py +220 -0
  35. statement_extractor/models/qualifiers.py +139 -0
  36. statement_extractor/models/statement.py +101 -0
  37. statement_extractor/models.py +4 -1
  38. statement_extractor/pipeline/__init__.py +39 -0
  39. statement_extractor/pipeline/config.py +129 -0
  40. statement_extractor/pipeline/context.py +177 -0
  41. statement_extractor/pipeline/orchestrator.py +416 -0
  42. statement_extractor/pipeline/registry.py +303 -0
  43. statement_extractor/plugins/__init__.py +55 -0
  44. statement_extractor/plugins/base.py +716 -0
  45. statement_extractor/plugins/extractors/__init__.py +13 -0
  46. statement_extractor/plugins/extractors/base.py +9 -0
  47. statement_extractor/plugins/extractors/gliner2.py +546 -0
  48. statement_extractor/plugins/labelers/__init__.py +29 -0
  49. statement_extractor/plugins/labelers/base.py +9 -0
  50. statement_extractor/plugins/labelers/confidence.py +138 -0
  51. statement_extractor/plugins/labelers/relation_type.py +87 -0
  52. statement_extractor/plugins/labelers/sentiment.py +159 -0
  53. statement_extractor/plugins/labelers/taxonomy.py +386 -0
  54. statement_extractor/plugins/labelers/taxonomy_embedding.py +477 -0
  55. statement_extractor/plugins/pdf/__init__.py +10 -0
  56. statement_extractor/plugins/pdf/pypdf.py +291 -0
  57. statement_extractor/plugins/qualifiers/__init__.py +30 -0
  58. statement_extractor/plugins/qualifiers/base.py +9 -0
  59. statement_extractor/plugins/qualifiers/companies_house.py +185 -0
  60. statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
  61. statement_extractor/plugins/qualifiers/gleif.py +197 -0
  62. statement_extractor/plugins/qualifiers/person.py +785 -0
  63. statement_extractor/plugins/qualifiers/sec_edgar.py +209 -0
  64. statement_extractor/plugins/scrapers/__init__.py +10 -0
  65. statement_extractor/plugins/scrapers/http.py +236 -0
  66. statement_extractor/plugins/splitters/__init__.py +13 -0
  67. statement_extractor/plugins/splitters/base.py +9 -0
  68. statement_extractor/plugins/splitters/t5_gemma.py +293 -0
  69. statement_extractor/plugins/taxonomy/__init__.py +13 -0
  70. statement_extractor/plugins/taxonomy/embedding.py +484 -0
  71. statement_extractor/plugins/taxonomy/mnli.py +291 -0
  72. statement_extractor/scoring.py +8 -8
  73. corp_extractor-0.4.0.dist-info/RECORD +0 -12
  74. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
  75. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,220 @@
1
+ """
2
+ Label models for the extraction pipeline.
3
+
4
+ StatementLabel: A label applied to a statement
5
+ LabeledStatement: Final output from Stage 5 with all labels
6
+ TaxonomyResult: Taxonomy classification from Stage 6
7
+ """
8
+
9
+ from typing import Any, Optional, Union
10
+
11
+ from pydantic import BaseModel, Field
12
+
13
+ from .statement import PipelineStatement
14
+ from .canonical import CanonicalEntity
15
+
16
+
17
class StatementLabel(BaseModel):
    """
    A label applied to a statement in Stage 5 (Labeling).

    Labels can represent sentiment, relation type, confidence, or
    any other classification applied by labeler plugins.
    """
    # What kind of label this is (sentiment, relation_type, confidence, ...).
    label_type: str = Field(..., description="Type of label: 'sentiment', 'relation_type', 'confidence', etc.")
    # The actual value; strings for classifications, floats for scores.
    label_value: Union[str, float, bool] = Field(..., description="The label value (string for classification, float for scores)")
    # Confidence in this label, constrained to [0, 1].
    confidence: float = Field(default=1.0, ge=0.0, le=1.0, description="Confidence in this label")
    # Provenance: which labeler plugin emitted this label.
    labeler: Optional[str] = Field(None, description="Name of the labeler plugin that produced this label")

    def is_high_confidence(self, threshold: float = 0.8) -> bool:
        """Return True when this label's confidence meets or exceeds *threshold*."""
        return self.confidence >= threshold
46
+
47
+
48
class LabeledStatement(BaseModel):
    """
    Final output from Stage 5 (Labeling) with taxonomy from Stage 6.

    Contains the original statement, canonicalized subject and object,
    all labels applied by labeler plugins, and taxonomy classifications.
    """
    statement: PipelineStatement = Field(..., description="The original statement from Stage 2")
    subject_canonical: CanonicalEntity = Field(..., description="Canonicalized subject entity")
    object_canonical: CanonicalEntity = Field(..., description="Canonicalized object entity")
    labels: list[StatementLabel] = Field(default_factory=list, description="Labels applied to this statement")
    taxonomy_results: list["TaxonomyResult"] = Field(default_factory=list, description="Taxonomy classifications from Stage 6")
    # Document tracking fields
    document_id: Optional[str] = Field(None, description="ID of the source document (for document pipeline)")
    page_number: Optional[int] = Field(None, description="Page number where this statement was extracted (1-indexed)")
    citation: Optional[str] = Field(None, description="Formatted citation string (e.g., 'Title - Author, 2024, p. 5')")

    def get_label(self, label_type: str) -> Optional[StatementLabel]:
        """Get a label by type, or None if not found."""
        return next((lbl for lbl in self.labels if lbl.label_type == label_type), None)

    def get_labels_by_type(self, label_type: str) -> list[StatementLabel]:
        """Get all labels of a specific type."""
        matches: list[StatementLabel] = []
        for lbl in self.labels:
            if lbl.label_type == label_type:
                matches.append(lbl)
        return matches

    def add_label(self, label: StatementLabel) -> None:
        """Add a label to this statement."""
        self.labels.append(label)

    @property
    def subject_fqn(self) -> str:
        """Get the subject's fully qualified name."""
        return self.subject_canonical.fqn

    @property
    def object_fqn(self) -> str:
        """Get the object's fully qualified name."""
        return self.object_canonical.fqn

    def __str__(self) -> str:
        """Format as FQN triple."""
        return f"{self.subject_fqn} --[{self.statement.predicate}]--> {self.object_fqn}"

    def _build_entity_dict(self, canonical: CanonicalEntity, entity_type: str) -> dict:
        """Build entity dict for serialization."""
        if entity_type == "subject":
            statement_entity = self.statement.subject
            fqn = self.subject_fqn
        else:
            statement_entity = self.statement.object
            fqn = self.object_fqn

        # Prefer a canonical_id recorded in the qualifier identifiers;
        # otherwise fall back to the canonical match, when present.
        canonical_id = canonical.qualified_entity.qualifiers.identifiers.get("canonical_id")
        if not canonical_id and canonical.canonical_match:
            canonical_id = canonical.canonical_match.canonical_id

        entity_dict = {
            "text": statement_entity.text,
            "type": statement_entity.type.value,
            "fqn": fqn,
            "canonical_id": canonical_id,
        }

        # Optional enrichments: only emitted when non-empty.
        if canonical.name:
            entity_dict["name"] = canonical.name
        qualifiers = canonical.qualifiers_dict
        if qualifiers:
            entity_dict["qualifiers"] = qualifiers

        return entity_dict

    def as_dict(self) -> dict:
        """Convert to a simplified dictionary representation."""
        label_map = {lbl.label_type: lbl.label_value for lbl in self.labels}
        taxonomy = [
            {"category": t.category, "label": t.label, "confidence": t.confidence}
            for t in self.taxonomy_results
        ]
        return {
            "subject": self._build_entity_dict(self.subject_canonical, "subject"),
            "predicate": self.statement.predicate,
            "object": self._build_entity_dict(self.object_canonical, "object"),
            "source_text": self.statement.source_text,
            "labels": label_map,
            "taxonomy": taxonomy,
            "document_id": self.document_id,
            "page_number": self.page_number,
            "citation": self.citation,
        }

    class Config:
        frozen = False  # Allow modification during pipeline stages
173
+
174
+
175
class TaxonomyResult(BaseModel):
    """
    Result of taxonomy classification from Stage 6.

    Represents a classification of a statement against a taxonomy,
    typically with a category (top-level) and label (specific topic).
    """
    taxonomy_name: str = Field(..., description="Name of the taxonomy (e.g., 'esg_topics', 'industry_codes')")
    category: str = Field(..., description="Top-level category (e.g., 'environment', 'governance')")
    label: str = Field(..., description="Specific label within the category (e.g., 'carbon emissions')")
    label_id: Optional[int] = Field(None, description="Numeric ID for reproducibility")
    # Classification confidence, constrained to [0, 1].
    confidence: float = Field(default=1.0, ge=0.0, le=1.0, description="Classification confidence")
    # Provenance: which taxonomy plugin produced this result.
    classifier: Optional[str] = Field(None, description="Name of the taxonomy plugin that produced this result")
    metadata: dict[str, Any] = Field(default_factory=dict, description="Additional metadata (e.g., runner-up labels, scores)")

    @property
    def full_label(self) -> str:
        """Get the full label in category:label format."""
        return f"{self.category}:{self.label}"

    def is_high_confidence(self, threshold: float = 0.7) -> bool:
        """Return True when the classification confidence meets or exceeds *threshold*."""
        return self.confidence >= threshold
@@ -0,0 +1,139 @@
1
+ """
2
+ Qualifier models for the extraction pipeline.
3
+
4
+ EntityQualifiers: Semantic qualifiers and external identifiers
5
+ QualifiedEntity: Entity with qualification information from Stage 3
6
+ ResolvedRole: Canonical role information from database
7
+ ResolvedOrganization: Canonical organization information from database
8
+ """
9
+
10
+ from typing import Any, Optional
11
+
12
+ from pydantic import BaseModel, Field
13
+
14
+ from .entity import EntityType
15
+
16
+
17
class ResolvedRole(BaseModel):
    """
    Resolved/canonical role information for a person.

    Populated when matching a person against the database,
    capturing the canonical role from Wikidata or other sources.
    """
    # e.g. 'Chief Executive Officer'
    canonical_name: str = Field(
        ...,
        description="Canonical role name (e.g., 'Chief Executive Officer')",
    )
    # Namespaced identifier, e.g. 'wikidata:Q484876'.
    canonical_id: Optional[str] = Field(
        None,
        description="Full canonical ID (e.g., 'wikidata:Q484876')",
    )
    source: str = Field(
        ...,
        description="Source of resolution (e.g., 'wikidata')",
    )
    # Raw identifier within the source system.
    source_id: Optional[str] = Field(
        None,
        description="ID in the source (e.g., 'Q484876' for Wikidata)",
    )
28
+
29
+
30
class ResolvedOrganization(BaseModel):
    """
    Resolved/canonical organization information.

    Populated when resolving an organization mentioned in context
    against the organization database (GLEIF, SEC, Companies House, Wikidata).
    """
    canonical_name: str = Field(..., description="Canonical organization name")
    canonical_id: str = Field(..., description="Full canonical ID (e.g., 'LEI:549300XYZ', 'SEC-CIK:1234567')")
    source: str = Field(..., description="Source of resolution (e.g., 'gleif', 'sec_edgar', 'wikidata')")
    source_id: str = Field(..., description="ID in the source")
    region: Optional[str] = Field(None, description="Organization's region/jurisdiction")
    # Bounded to [0, 1] for consistency with the other confidence fields
    # in this package (StatementLabel.confidence, TaxonomyResult.confidence).
    match_confidence: float = Field(default=1.0, ge=0.0, le=1.0, description="Confidence in the match (0-1)")
    match_details: Optional[dict[str, Any]] = Field(None, description="Additional match details")
44
+
45
+
46
class EntityQualifiers(BaseModel):
    """
    Qualifiers that provide context and identifiers for an entity.

    Populated by Stage 3 (Qualification) plugins such as:
    - PersonQualifierPlugin: Adds role, org for PERSON entities
    - GLEIFQualifierPlugin: Adds LEI for ORG entities
    - CompaniesHouseQualifierPlugin: Adds UK company number
    - SECEdgarQualifierPlugin: Adds SEC CIK, ticker
    """
    # Canonical name from database (for ORG entities)
    legal_name: Optional[str] = Field(None, description="Canonical legal name from database")

    # Semantic qualifiers (for PERSON entities)
    org: Optional[str] = Field(None, description="Organization/employer name")
    role: Optional[str] = Field(None, description="Job title/position/role")

    # Location qualifiers
    region: Optional[str] = Field(None, description="State/province/region")
    country: Optional[str] = Field(None, description="Country name or ISO code")
    city: Optional[str] = Field(None, description="City name")
    jurisdiction: Optional[str] = Field(None, description="Legal jurisdiction (e.g., 'UK', 'US-DE')")

    # External identifiers (keyed by identifier type)
    identifiers: dict[str, str] = Field(
        default_factory=dict,
        description="External identifiers: lei, ch_number, sec_cik, ticker, wikidata_qid, etc.",
    )

    # Resolved canonical information (for PERSON entities)
    resolved_role: Optional[ResolvedRole] = Field(None, description="Canonical role information from database lookup")
    resolved_org: Optional[ResolvedOrganization] = Field(None, description="Canonical organization information from database lookup")

    def has_any_qualifier(self) -> bool:
        """Check if any qualifier or identifier is set."""
        return any((
            self.legal_name, self.org, self.role, self.region, self.country,
            self.city, self.jurisdiction, self.identifiers,
            self.resolved_role, self.resolved_org,
        ))

    def merge_with(self, other: "EntityQualifiers") -> "EntityQualifiers":
        """
        Merge qualifiers from another instance, preferring non-None values.

        Returns a new EntityQualifiers with merged values.
        """
        # On key collisions, *other*'s identifiers win.
        combined_ids = dict(self.identifiers)
        combined_ids.update(other.identifiers)
        return EntityQualifiers(
            legal_name=other.legal_name or self.legal_name,
            org=other.org or self.org,
            role=other.role or self.role,
            region=other.region or self.region,
            country=other.country or self.country,
            city=other.city or self.city,
            jurisdiction=other.jurisdiction or self.jurisdiction,
            identifiers=combined_ids,
            resolved_role=other.resolved_role or self.resolved_role,
            resolved_org=other.resolved_org or self.resolved_org,
        )
112
+
113
+
114
class QualifiedEntity(BaseModel):
    """
    An entity with qualification information from Stage 3.

    Links back to the original ExtractedEntity via entity_ref and
    adds qualifiers from various qualification plugins.
    """
    entity_ref: str = Field(..., description="Reference to the original ExtractedEntity")
    original_text: str = Field(..., description="Original entity text")
    entity_type: EntityType = Field(..., description="Entity type")
    qualifiers: EntityQualifiers = Field(
        default_factory=EntityQualifiers,
        description="Qualifiers and identifiers for this entity",
    )
    qualification_sources: list[str] = Field(
        default_factory=list,
        description="List of plugins that contributed qualifiers",
    )

    def add_qualifier_source(self, source: str) -> None:
        """Add a qualification source to the list."""
        # Keep the list duplicate-free while preserving insertion order.
        if source in self.qualification_sources:
            return
        self.qualification_sources.append(source)

    class Config:
        frozen = False  # Allow modification during pipeline stages
@@ -0,0 +1,101 @@
1
+ """
2
+ Statement models for the extraction pipeline.
3
+
4
+ RawTriple: Output of Stage 1 (Splitting)
5
+ PipelineStatement: Output of Stage 2 (Extraction) with refined entities
6
+ """
7
+
8
+ from typing import Optional
9
+
10
+ from pydantic import BaseModel, Field
11
+
12
+ from .entity import ExtractedEntity
13
+
14
+
15
class RawTriple(BaseModel):
    """
    A raw triple from Stage 1 (Splitting).

    Contains the basic text components before entity refinement.
    Generated by T5-Gemma or other splitting plugins.
    """
    subject_text: str = Field(..., description="Raw subject text")
    predicate_text: str = Field(..., description="Raw predicate text")
    object_text: str = Field(..., description="Raw object text")
    source_sentence: str = Field(..., description="The source sentence this triple was extracted from")
    # Splitter confidence, constrained to [0, 1].
    confidence: float = Field(default=1.0, ge=0.0, le=1.0, description="Extraction confidence from the splitter")
    # Document tracking fields
    document_id: Optional[str] = Field(None, description="ID of the source document (for document pipeline)")
    page_number: Optional[int] = Field(None, description="Page number where this triple was extracted (1-indexed)")
    chunk_index: Optional[int] = Field(None, description="Index of the chunk this triple was extracted from (0-indexed)")

    def __str__(self) -> str:
        return f"{self.subject_text} --[{self.predicate_text}]--> {self.object_text}"

    def as_tuple(self) -> tuple[str, str, str]:
        """Return as a simple (subject, predicate, object) tuple."""
        return (self.subject_text, self.predicate_text, self.object_text)
52
+
53
+
54
class PipelineStatement(BaseModel):
    """
    A statement with extracted entities from Stage 2 (Extraction).

    Contains refined subject/object entities with types, spans, and confidence.
    This is the main statement type that flows through stages 2-5.
    """
    subject: ExtractedEntity = Field(..., description="The subject entity")
    predicate: str = Field(..., description="The relationship/predicate text")
    predicate_category: Optional[str] = Field(
        None,
        description="Category/domain of the predicate (e.g., 'ownership_control', 'employment_leadership')",
    )
    object: ExtractedEntity = Field(..., description="The object entity")
    source_text: str = Field(..., description="The source text this statement was extracted from")
    # Overall statement confidence, constrained to [0, 1].
    confidence_score: float = Field(default=1.0, ge=0.0, le=1.0, description="Overall confidence score for this statement")
    extraction_method: Optional[str] = Field(
        None,
        description="Method used to extract this statement (e.g., 'hybrid', 'gliner', 'model')",
    )
    # Document tracking fields
    document_id: Optional[str] = Field(None, description="ID of the source document (for document pipeline)")
    page_number: Optional[int] = Field(None, description="Page number where this statement was extracted (1-indexed)")
    chunk_index: Optional[int] = Field(None, description="Index of the chunk this statement was extracted from (0-indexed)")

    def __str__(self) -> str:
        return f"{self.subject.text} --[{self.predicate}]--> {self.object.text}"

    def as_triple(self) -> tuple[str, str, str]:
        """Return as a simple (subject, predicate, object) tuple."""
        return (self.subject.text, self.predicate, self.object.text)

    class Config:
        frozen = False  # Allow modification during pipeline stages
@@ -28,7 +28,6 @@ class ExtractionMethod(str, Enum):
28
28
  """Method used to extract the triple components."""
29
29
  HYBRID = "hybrid" # Model subject/object + GLiNER2 predicate
30
30
  GLINER = "gliner" # All components from GLiNER2 extraction
31
- SPLIT = "split" # Subject/object from splitting source text around predicate
32
31
  MODEL = "model" # All components from T5-Gemma model (when GLiNER2 disabled)
33
32
 
34
33
 
@@ -305,6 +304,10 @@ class ExtractionOptions(BaseModel):
305
304
  default=None,
306
305
  description="Optional list of predefined predicate types for GLiNER2 relation extraction (e.g., ['works_for', 'founded'])"
307
306
  )
307
+ use_default_predicates: bool = Field(
308
+ default=True,
309
+ description="Use default predicate taxonomy when no custom predicates provided (enables GLiNER2 relation extraction)"
310
+ )
308
311
 
309
312
  # Verbose logging
310
313
  verbose: bool = Field(
@@ -0,0 +1,39 @@
1
+ """
2
+ Pipeline module for the extraction pipeline.
3
+
4
+ This module provides the core pipeline infrastructure:
5
+ - PipelineContext: Data container that flows through all stages
6
+ - PipelineConfig: Configuration for stage/plugin selection
7
+ - PluginRegistry: Registration and discovery of plugins
8
+ - ExtractionPipeline: Main orchestrator class
9
+
10
+ Plugins are auto-loaded when this module is imported.
11
+ """
12
+
13
+ from .context import PipelineContext
14
+ from .config import PipelineConfig
15
+ from .registry import PluginRegistry
16
+ from .orchestrator import ExtractionPipeline
17
+
18
+
19
+ def _load_plugins():
20
+ """Load all plugins by importing their modules."""
21
+ import logging
22
+
23
+ try:
24
+ from ..plugins import splitters, extractors, qualifiers, canonicalizers, labelers, taxonomy
25
+ # The @PluginRegistry decorators register plugins on import
26
+ except ImportError as e:
27
+ logging.debug(f"Some plugins failed to load: {e}")
28
+
29
+
30
+ # Auto-load plugins on module import
31
+ _load_plugins()
32
+
33
+
34
+ __all__ = [
35
+ "PipelineContext",
36
+ "PipelineConfig",
37
+ "PluginRegistry",
38
+ "ExtractionPipeline",
39
+ ]
@@ -0,0 +1,129 @@
1
+ """
2
+ PipelineConfig - Configuration for stage/plugin selection.
3
+
4
+ Controls which stages are enabled and which plugins to use.
5
+ """
6
+
7
+ from typing import Any, Optional
8
+
9
+ from pydantic import BaseModel, Field
10
+
11
+
12
class PipelineConfig(BaseModel):
    """
    Configuration for the extraction pipeline.

    Controls which stages are enabled, which plugins to use,
    and stage-specific options.
    """
    # Stage selection (1=Splitting, 2=Extraction, 3=Qualification, 4=Labeling, 5=Taxonomy)
    enabled_stages: set[int] = Field(
        default={1, 2, 3, 4, 5},
        description="Set of enabled stage numbers (1-5)",
    )

    # Plugin selection
    enabled_plugins: Optional[set[str]] = Field(
        None,
        description="Set of enabled plugin names (None = all enabled)",
    )
    disabled_plugins: set[str] = Field(
        default_factory=lambda: {
            # Disabled by default - use embedding_taxonomy_classifier instead (faster)
            "mnli_taxonomy_classifier",
        },
        description="Set of disabled plugin names",
    )

    # Stage-specific options
    splitter_options: dict[str, Any] = Field(default_factory=dict, description="Options passed to splitter plugins")
    extractor_options: dict[str, Any] = Field(default_factory=dict, description="Options passed to extractor plugins")
    qualifier_options: dict[str, Any] = Field(default_factory=dict, description="Options passed to qualifier plugins (includes canonicalizers)")
    labeler_options: dict[str, Any] = Field(default_factory=dict, description="Options passed to labeler plugins")
    taxonomy_options: dict[str, Any] = Field(default_factory=dict, description="Options passed to taxonomy plugins")

    # General options
    fail_fast: bool = Field(
        default=True,
        description="Stop processing on first error (otherwise continue and collect errors)",
    )
    parallel_processing: bool = Field(
        default=False,
        description="Enable parallel processing where possible",
    )
    max_statements: Optional[int] = Field(
        None,
        description="Maximum number of statements to process (None = unlimited)",
    )

    def is_stage_enabled(self, stage: int) -> bool:
        """Check if a stage is enabled."""
        return stage in self.enabled_stages

    def is_plugin_enabled(self, plugin_name: str) -> bool:
        """Check if a plugin is enabled."""
        # An explicit disable always wins; otherwise a missing allow-list
        # means everything is enabled.
        if plugin_name in self.disabled_plugins:
            return False
        return self.enabled_plugins is None or plugin_name in self.enabled_plugins

    @classmethod
    def from_stage_string(cls, stages: str, **kwargs) -> "PipelineConfig":
        """
        Create config from a stage string.

        Examples:
            "1,2,3" -> stages 1, 2, 3
            "1-3" -> stages 1, 2, 3
            "1-5" -> all stages
        """
        enabled: set[int] = set()
        for token in stages.split(","):
            token = token.strip()
            if "-" in token:
                lo, hi = token.split("-", 1)
                enabled.update(range(int(lo), int(hi) + 1))
            else:
                enabled.add(int(token))
        return cls(enabled_stages=enabled, **kwargs)

    @classmethod
    def default(cls) -> "PipelineConfig":
        """Create a default configuration with all stages enabled."""
        return cls()

    @classmethod
    def minimal(cls) -> "PipelineConfig":
        """Create a minimal configuration with only splitting and extraction."""
        return cls(enabled_stages={1, 2})
115
+
116
+
117
# Stage name mapping: human-readable name for each pipeline stage number.
STAGE_NAMES = {
    1: "splitting",
    2: "extraction",
    3: "qualification",
    4: "labeling",
    5: "taxonomy",
}


def get_stage_name(stage: int) -> str:
    """Get the human-readable name for a stage."""
    try:
        return STAGE_NAMES[stage]
    except KeyError:
        # Unknown stages get a synthesized placeholder name.
        return f"stage_{stage}"