corp-extractor 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.3.0.dist-info → corp_extractor-0.5.0.dist-info}/METADATA +235 -96
- corp_extractor-0.5.0.dist-info/RECORD +55 -0
- statement_extractor/__init__.py +9 -0
- statement_extractor/cli.py +460 -21
- statement_extractor/data/default_predicates.json +368 -0
- statement_extractor/data/statement_taxonomy.json +1182 -0
- statement_extractor/extractor.py +32 -47
- statement_extractor/gliner_extraction.py +218 -0
- statement_extractor/llm.py +255 -0
- statement_extractor/models/__init__.py +74 -0
- statement_extractor/models/canonical.py +139 -0
- statement_extractor/models/entity.py +102 -0
- statement_extractor/models/labels.py +191 -0
- statement_extractor/models/qualifiers.py +91 -0
- statement_extractor/models/statement.py +75 -0
- statement_extractor/models.py +15 -6
- statement_extractor/pipeline/__init__.py +39 -0
- statement_extractor/pipeline/config.py +134 -0
- statement_extractor/pipeline/context.py +177 -0
- statement_extractor/pipeline/orchestrator.py +447 -0
- statement_extractor/pipeline/registry.py +297 -0
- statement_extractor/plugins/__init__.py +43 -0
- statement_extractor/plugins/base.py +446 -0
- statement_extractor/plugins/canonicalizers/__init__.py +17 -0
- statement_extractor/plugins/canonicalizers/base.py +9 -0
- statement_extractor/plugins/canonicalizers/location.py +219 -0
- statement_extractor/plugins/canonicalizers/organization.py +230 -0
- statement_extractor/plugins/canonicalizers/person.py +242 -0
- statement_extractor/plugins/extractors/__init__.py +13 -0
- statement_extractor/plugins/extractors/base.py +9 -0
- statement_extractor/plugins/extractors/gliner2.py +536 -0
- statement_extractor/plugins/labelers/__init__.py +29 -0
- statement_extractor/plugins/labelers/base.py +9 -0
- statement_extractor/plugins/labelers/confidence.py +138 -0
- statement_extractor/plugins/labelers/relation_type.py +87 -0
- statement_extractor/plugins/labelers/sentiment.py +159 -0
- statement_extractor/plugins/labelers/taxonomy.py +373 -0
- statement_extractor/plugins/labelers/taxonomy_embedding.py +466 -0
- statement_extractor/plugins/qualifiers/__init__.py +19 -0
- statement_extractor/plugins/qualifiers/base.py +9 -0
- statement_extractor/plugins/qualifiers/companies_house.py +174 -0
- statement_extractor/plugins/qualifiers/gleif.py +186 -0
- statement_extractor/plugins/qualifiers/person.py +221 -0
- statement_extractor/plugins/qualifiers/sec_edgar.py +198 -0
- statement_extractor/plugins/splitters/__init__.py +13 -0
- statement_extractor/plugins/splitters/base.py +9 -0
- statement_extractor/plugins/splitters/t5_gemma.py +188 -0
- statement_extractor/plugins/taxonomy/__init__.py +13 -0
- statement_extractor/plugins/taxonomy/embedding.py +337 -0
- statement_extractor/plugins/taxonomy/mnli.py +279 -0
- statement_extractor/scoring.py +17 -69
- corp_extractor-0.3.0.dist-info/RECORD +0 -12
- statement_extractor/spacy_extraction.py +0 -386
- {corp_extractor-0.3.0.dist-info → corp_extractor-0.5.0.dist-info}/WHEEL +0 -0
- {corp_extractor-0.3.0.dist-info → corp_extractor-0.5.0.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Canonical models for the extraction pipeline.
|
|
3
|
+
|
|
4
|
+
CanonicalMatch: Result of matching to a canonical form
|
|
5
|
+
CanonicalEntity: Entity with canonical form from Stage 4
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from typing import Optional
|
|
9
|
+
|
|
10
|
+
from pydantic import BaseModel, Field
|
|
11
|
+
|
|
12
|
+
from .qualifiers import QualifiedEntity
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class CanonicalMatch(BaseModel):
    """
    Outcome of matching an entity against a canonical database in Stage 4.

    Records which method produced the match, how confident it is, and any
    extra detail the matcher chose to attach.
    """
    canonical_id: Optional[str] = Field(
        default=None,
        description="ID in canonical database (e.g., LEI, Wikidata QID)",
    )
    canonical_name: Optional[str] = Field(
        default=None,
        description="Canonical name/label",
    )
    match_method: str = Field(
        ...,
        description="How the match was made: 'identifier', 'name_exact', 'name_fuzzy', 'llm_verified'",
    )
    match_confidence: float = Field(
        default=1.0,
        ge=0.0,
        le=1.0,
        description="Confidence in the canonical match",
    )
    match_details: Optional[dict] = Field(
        default=None,
        description="Additional details about the match (e.g., fuzzy score, LLM reasoning)",
    )

    def is_high_confidence(self, threshold: float = 0.85) -> bool:
        """Return True when match_confidence is at least *threshold*."""
        return threshold <= self.match_confidence
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class CanonicalEntity(BaseModel):
    """
    An entity in its Stage 4 (Canonicalization) form.

    Wraps the Stage 3 qualified entity together with its canonical match
    (when one was found) and a display-ready fully qualified name (FQN).
    """
    entity_ref: str = Field(..., description="Reference to the original ExtractedEntity")
    qualified_entity: QualifiedEntity = Field(
        ...,
        description="The qualified entity from Stage 3",
    )
    canonical_match: Optional[CanonicalMatch] = Field(
        default=None,
        description="Canonical match if found",
    )
    fqn: str = Field(
        ...,
        description="Fully qualified name, e.g., 'Tim Cook (CEO, Apple Inc)'",
    )

    @classmethod
    def from_qualified(
        cls,
        qualified: QualifiedEntity,
        canonical_match: Optional[CanonicalMatch] = None,
        fqn: Optional[str] = None,
    ) -> "CanonicalEntity":
        """Build a CanonicalEntity, deriving a default FQN when none is given."""
        resolved_fqn = fqn if fqn is not None else cls._generate_fqn(qualified, canonical_match)
        return cls(
            entity_ref=qualified.entity_ref,
            qualified_entity=qualified,
            canonical_match=canonical_match,
            fqn=resolved_fqn,
        )

    @staticmethod
    def _generate_fqn(
        qualified: QualifiedEntity,
        canonical_match: Optional[CanonicalMatch] = None
    ) -> str:
        """
        Generate a fully qualified name from qualifiers.

        Examples:
        - PERSON with role+org: "Tim Cook (CEO, Apple Inc)"
        - ORG with canonical: "Apple Inc (AAPL)"
        - PERSON with no qualifiers: "Tim Cook"
        """
        # Prefer the canonical name; fall back to the text as extracted.
        base_name = (
            canonical_match.canonical_name
            if canonical_match and canonical_match.canonical_name
            else qualified.original_text
        )

        quals = qualified.qualifiers
        suffix_parts: list[str] = []
        seen_lower: set[str] = set()

        def _append_unique(candidate: Optional[str]) -> None:
            # Skip empty values; de-duplicate case-insensitively.
            if candidate and candidate.lower() not in seen_lower:
                suffix_parts.append(candidate)
                seen_lower.add(candidate.lower())

        _append_unique(quals.role)                        # PERSON: job title
        _append_unique(quals.org)                         # PERSON: employer
        _append_unique(quals.identifiers.get("ticker"))   # ORG: stock ticker
        # Jurisdiction only when no org already disambiguates the entity.
        if quals.jurisdiction and not quals.org:
            _append_unique(quals.jurisdiction)

        if suffix_parts:
            return f"{base_name} ({', '.join(suffix_parts)})"
        return base_name

    class Config:
        frozen = False  # Allow modification during pipeline stages
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Entity models for the extraction pipeline.
|
|
3
|
+
|
|
4
|
+
ExtractedEntity represents entities identified during extraction with
|
|
5
|
+
confidence scores and span information.
|
|
6
|
+
|
|
7
|
+
Note: EntityType is imported from the original models.py for consistency.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from typing import Optional, TYPE_CHECKING
import uuid

from pydantic import BaseModel, Field

# Import EntityType from parent module to avoid duplication
# This will be populated by __init__.py which loads from old models.py
if TYPE_CHECKING:
    # Static analysis only: declare the enum shape inline so type checkers
    # can resolve EntityType without running the file-loading logic below.
    from enum import Enum

    class EntityType(str, Enum):
        """Supported entity types for subjects and objects."""
        ORG = "ORG"
        PERSON = "PERSON"
        GPE = "GPE"
        LOC = "LOC"
        PRODUCT = "PRODUCT"
        EVENT = "EVENT"
        WORK_OF_ART = "WORK_OF_ART"
        LAW = "LAW"
        DATE = "DATE"
        MONEY = "MONEY"
        PERCENT = "PERCENT"
        QUANTITY = "QUANTITY"
        UNKNOWN = "UNKNOWN"
else:
    # At runtime, we need to import it from somewhere
    # Try the old models.py location first
    try:
        import importlib.util
        from pathlib import Path
        # Load models.py directly by file path instead of a package import;
        # presumably this avoids a circular import through the package
        # __init__ — TODO confirm against statement_extractor/__init__.py.
        # NOTE(review): the module is executed under the throwaway name
        # "_old_models", so this EntityType is a distinct class object from
        # one imported via `statement_extractor.models` — isinstance checks
        # across the two paths would fail; verify callers compare by value.
        _models_py_path = Path(__file__).parent.parent / "models.py"
        _spec = importlib.util.spec_from_file_location("_old_models", _models_py_path)
        _old_models = importlib.util.module_from_spec(_spec)
        _spec.loader.exec_module(_old_models)
        EntityType = _old_models.EntityType
    except Exception:
        # Fallback to defining it here
        # Duplicate of the TYPE_CHECKING declaration above; keep both in sync.
        from enum import Enum

        class EntityType(str, Enum):
            """Supported entity types for subjects and objects."""
            ORG = "ORG"
            PERSON = "PERSON"
            GPE = "GPE"
            LOC = "LOC"
            PRODUCT = "PRODUCT"
            EVENT = "EVENT"
            WORK_OF_ART = "WORK_OF_ART"
            LAW = "LAW"
            DATE = "DATE"
            MONEY = "MONEY"
            PERCENT = "PERCENT"
            QUANTITY = "QUANTITY"
            UNKNOWN = "UNKNOWN"
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class ExtractedEntity(BaseModel):
    """
    An entity pulled from text, carrying its type, span, and confidence.

    Produced in Stage 2 (Extraction) and carried through later stages.
    """
    text: str = Field(..., description="The entity text as extracted")
    type: EntityType = Field(default=EntityType.UNKNOWN, description="The entity type")
    span: Optional[tuple[int, int]] = Field(
        default=None,
        description="Character offsets (start, end) in source text",
    )
    confidence: float = Field(
        default=1.0,
        ge=0.0,
        le=1.0,
        description="Confidence score for this entity extraction",
    )
    entity_ref: str = Field(
        default_factory=lambda: str(uuid.uuid4()),
        description="Unique reference ID for tracking this entity through the pipeline",
    )

    def __str__(self) -> str:
        return f"{self.text} ({self.type.value})"

    def __hash__(self) -> int:
        # Identity follows the stable entity_ref, not the mutable fields.
        return hash(self.entity_ref)

    def __eq__(self, other: object) -> bool:
        # Two entities are equal exactly when they share an entity_ref.
        return isinstance(other, ExtractedEntity) and self.entity_ref == other.entity_ref

    class Config:
        frozen = False  # Allow modification during pipeline stages
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Label models for the extraction pipeline.
|
|
3
|
+
|
|
4
|
+
StatementLabel: A label applied to a statement
|
|
5
|
+
LabeledStatement: Final output from Stage 5 with all labels
|
|
6
|
+
TaxonomyResult: Taxonomy classification from Stage 6
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from typing import Any, Optional, Union
|
|
10
|
+
|
|
11
|
+
from pydantic import BaseModel, Field
|
|
12
|
+
|
|
13
|
+
from .statement import PipelineStatement
|
|
14
|
+
from .canonical import CanonicalEntity
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class StatementLabel(BaseModel):
    """
    A single classification attached to a statement during Stage 5 (Labeling).

    A label may encode sentiment, relation type, confidence, or any other
    signal a labeler plugin chooses to emit.
    """
    label_type: str = Field(
        ...,
        description="Type of label: 'sentiment', 'relation_type', 'confidence', etc.",
    )
    label_value: Union[str, float, bool] = Field(
        ...,
        description="The label value (string for classification, float for scores)",
    )
    confidence: float = Field(
        default=1.0,
        ge=0.0,
        le=1.0,
        description="Confidence in this label",
    )
    labeler: Optional[str] = Field(
        default=None,
        description="Name of the labeler plugin that produced this label",
    )

    def is_high_confidence(self, threshold: float = 0.8) -> bool:
        """Return True when this label's confidence is at least *threshold*."""
        return threshold <= self.confidence
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class LabeledStatement(BaseModel):
    """
    The pipeline's final statement form: Stage 5 labels plus Stage 6 taxonomy.

    Bundles the Stage 2 statement with its canonicalized subject and object,
    every label emitted by labeler plugins, and taxonomy classifications.
    """
    statement: PipelineStatement = Field(
        ...,
        description="The original statement from Stage 2",
    )
    subject_canonical: CanonicalEntity = Field(
        ...,
        description="Canonicalized subject entity",
    )
    object_canonical: CanonicalEntity = Field(
        ...,
        description="Canonicalized object entity",
    )
    labels: list[StatementLabel] = Field(
        default_factory=list,
        description="Labels applied to this statement",
    )
    taxonomy_results: list["TaxonomyResult"] = Field(
        default_factory=list,
        description="Taxonomy classifications from Stage 6",
    )

    def get_label(self, label_type: str) -> Optional[StatementLabel]:
        """Get a label by type, or None if not found."""
        return next(
            (candidate for candidate in self.labels if candidate.label_type == label_type),
            None,
        )

    def get_labels_by_type(self, label_type: str) -> list[StatementLabel]:
        """Get all labels of a specific type."""
        return [candidate for candidate in self.labels if candidate.label_type == label_type]

    def add_label(self, label: StatementLabel) -> None:
        """Add a label to this statement."""
        self.labels.append(label)

    @property
    def subject_fqn(self) -> str:
        """Get the subject's fully qualified name."""
        return self.subject_canonical.fqn

    @property
    def object_fqn(self) -> str:
        """Get the object's fully qualified name."""
        return self.object_canonical.fqn

    def __str__(self) -> str:
        """Format as FQN triple."""
        return f"{self.subject_fqn} --[{self.statement.predicate}]--> {self.object_fqn}"

    def as_dict(self) -> dict:
        """Convert to a simplified dictionary representation."""

        def _entity_view(extracted, canonical, fqn: str) -> dict:
            # Shared shape for the subject and object entries.
            match = canonical.canonical_match
            return {
                "text": extracted.text,
                "type": extracted.type.value,
                "fqn": fqn,
                "canonical_id": match.canonical_id if match else None,
            }

        return {
            "subject": _entity_view(self.statement.subject, self.subject_canonical, self.subject_fqn),
            "predicate": self.statement.predicate,
            "object": _entity_view(self.statement.object, self.object_canonical, self.object_fqn),
            "source_text": self.statement.source_text,
            "labels": {item.label_type: item.label_value for item in self.labels},
            "taxonomy": [
                {"category": t.category, "label": t.label, "confidence": t.confidence}
                for t in self.taxonomy_results
            ],
        }

    class Config:
        frozen = False  # Allow modification during pipeline stages
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
class TaxonomyResult(BaseModel):
    """
    A taxonomy classification produced in Stage 6.

    Pairs a top-level category with a specific label inside it, together
    with the confidence and provenance of the classification.
    """
    taxonomy_name: str = Field(
        ...,
        description="Name of the taxonomy (e.g., 'esg_topics', 'industry_codes')",
    )
    category: str = Field(
        ...,
        description="Top-level category (e.g., 'environment', 'governance')",
    )
    label: str = Field(
        ...,
        description="Specific label within the category (e.g., 'carbon emissions')",
    )
    label_id: Optional[int] = Field(
        default=None,
        description="Numeric ID for reproducibility",
    )
    confidence: float = Field(
        default=1.0,
        ge=0.0,
        le=1.0,
        description="Classification confidence",
    )
    classifier: Optional[str] = Field(
        default=None,
        description="Name of the taxonomy plugin that produced this result",
    )
    metadata: dict[str, Any] = Field(
        default_factory=dict,
        description="Additional metadata (e.g., runner-up labels, scores)",
    )

    @property
    def full_label(self) -> str:
        """The combined 'category:label' string."""
        return f"{self.category}:{self.label}"

    def is_high_confidence(self, threshold: float = 0.7) -> bool:
        """Return True when confidence is at least *threshold*."""
        return threshold <= self.confidence
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Qualifier models for the extraction pipeline.
|
|
3
|
+
|
|
4
|
+
EntityQualifiers: Semantic qualifiers and external identifiers
|
|
5
|
+
QualifiedEntity: Entity with qualification information from Stage 3
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from typing import Optional
|
|
9
|
+
|
|
10
|
+
from pydantic import BaseModel, Field
|
|
11
|
+
|
|
12
|
+
from .entity import EntityType
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class EntityQualifiers(BaseModel):
    """
    Context qualifiers and external identifiers attached to an entity.

    Filled in by Stage 3 (Qualification) plugins, for example:
    - PersonQualifierPlugin: adds role, org for PERSON entities
    - GLEIFQualifierPlugin: adds LEI for ORG entities
    - CompaniesHouseQualifierPlugin: adds UK company number
    - SECEdgarQualifierPlugin: adds SEC CIK, ticker
    """
    # Semantic qualifiers (for PERSON entities)
    org: Optional[str] = Field(default=None, description="Organization/employer name")
    role: Optional[str] = Field(default=None, description="Job title/position/role")

    # Location qualifiers
    region: Optional[str] = Field(default=None, description="State/province/region")
    country: Optional[str] = Field(default=None, description="Country name or ISO code")
    city: Optional[str] = Field(default=None, description="City name")
    jurisdiction: Optional[str] = Field(default=None, description="Legal jurisdiction (e.g., 'UK', 'US-DE')")

    # External identifiers (keyed by identifier type)
    identifiers: dict[str, str] = Field(
        default_factory=dict,
        description="External identifiers: lei, ch_number, sec_cik, ticker, wikidata_qid, etc.",
    )

    def has_any_qualifier(self) -> bool:
        """Check if any qualifier or identifier is set."""
        return any((
            self.org, self.role, self.region, self.country,
            self.city, self.jurisdiction, self.identifiers,
        ))

    def merge_with(self, other: "EntityQualifiers") -> "EntityQualifiers":
        """
        Merge qualifiers from another instance, preferring non-None values.

        Returns a new EntityQualifiers with merged values; *other* wins on
        conflicts and neither operand is modified.
        """
        return EntityQualifiers(
            org=other.org or self.org,
            role=other.role or self.role,
            region=other.region or self.region,
            country=other.country or self.country,
            city=other.city or self.city,
            jurisdiction=other.jurisdiction or self.jurisdiction,
            identifiers={**self.identifiers, **other.identifiers},
        )
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class QualifiedEntity(BaseModel):
    """
    An entity enriched with Stage 3 qualification data.

    entity_ref ties this back to the originating ExtractedEntity; the
    qualifiers accumulate contributions from the qualification plugins.
    """
    entity_ref: str = Field(..., description="Reference to the original ExtractedEntity")
    original_text: str = Field(..., description="Original entity text")
    entity_type: EntityType = Field(..., description="Entity type")
    qualifiers: EntityQualifiers = Field(
        default_factory=EntityQualifiers,
        description="Qualifiers and identifiers for this entity",
    )
    qualification_sources: list[str] = Field(
        default_factory=list,
        description="List of plugins that contributed qualifiers",
    )

    def add_qualifier_source(self, source: str) -> None:
        """Record *source* as a contributing plugin, ignoring duplicates."""
        if source in self.qualification_sources:
            return
        self.qualification_sources.append(source)

    class Config:
        frozen = False  # Allow modification during pipeline stages
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Statement models for the extraction pipeline.
|
|
3
|
+
|
|
4
|
+
RawTriple: Output of Stage 1 (Splitting)
|
|
5
|
+
PipelineStatement: Output of Stage 2 (Extraction) with refined entities
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from typing import Optional
|
|
9
|
+
|
|
10
|
+
from pydantic import BaseModel, Field
|
|
11
|
+
|
|
12
|
+
from .entity import ExtractedEntity
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class RawTriple(BaseModel):
    """
    A raw (subject, predicate, object) triple from Stage 1 (Splitting).

    Holds plain-text components prior to entity refinement; produced by
    T5-Gemma or other splitting plugins.
    """
    subject_text: str = Field(..., description="Raw subject text")
    predicate_text: str = Field(..., description="Raw predicate text")
    object_text: str = Field(..., description="Raw object text")
    source_sentence: str = Field(..., description="The source sentence this triple was extracted from")
    confidence: float = Field(
        default=1.0, ge=0.0, le=1.0,
        description="Extraction confidence from the splitter",
    )

    def __str__(self) -> str:
        return f"{self.subject_text} --[{self.predicate_text}]--> {self.object_text}"

    def as_tuple(self) -> tuple[str, str, str]:
        """Return as a simple (subject, predicate, object) tuple."""
        return (self.subject_text, self.predicate_text, self.object_text)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class PipelineStatement(BaseModel):
    """
    A statement whose entities were refined in Stage 2 (Extraction).

    Carries typed subject/object entities with spans and confidence; this is
    the statement shape that flows through pipeline stages 2-5.
    """
    subject: ExtractedEntity = Field(..., description="The subject entity")
    predicate: str = Field(..., description="The relationship/predicate text")
    predicate_category: Optional[str] = Field(
        default=None,
        description="Category/domain of the predicate (e.g., 'ownership_control', 'employment_leadership')",
    )
    object: ExtractedEntity = Field(..., description="The object entity")
    source_text: str = Field(..., description="The source text this statement was extracted from")
    confidence_score: float = Field(
        default=1.0, ge=0.0, le=1.0,
        description="Overall confidence score for this statement",
    )
    extraction_method: Optional[str] = Field(
        default=None,
        description="Method used to extract this statement (e.g., 'hybrid', 'gliner', 'model')",
    )

    def __str__(self) -> str:
        return f"{self.subject.text} --[{self.predicate}]--> {self.object.text}"

    def as_triple(self) -> tuple[str, str, str]:
        """Return as a simple (subject, predicate, object) tuple."""
        return (self.subject.text, self.predicate, self.object.text)

    class Config:
        frozen = False  # Allow modification during pipeline stages
|
statement_extractor/models.py
CHANGED
|
@@ -26,10 +26,9 @@ class EntityType(str, Enum):
|
|
|
26
26
|
|
|
27
27
|
class ExtractionMethod(str, Enum):
|
|
28
28
|
"""Method used to extract the triple components."""
|
|
29
|
-
HYBRID = "hybrid" # Model subject/object +
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
MODEL = "model" # All components from T5-Gemma model (when spaCy disabled)
|
|
29
|
+
HYBRID = "hybrid" # Model subject/object + GLiNER2 predicate
|
|
30
|
+
GLINER = "gliner" # All components from GLiNER2 extraction
|
|
31
|
+
MODEL = "model" # All components from T5-Gemma model (when GLiNER2 disabled)
|
|
33
32
|
|
|
34
33
|
|
|
35
34
|
class Entity(BaseModel):
|
|
@@ -295,9 +294,19 @@ class ExtractionOptions(BaseModel):
|
|
|
295
294
|
default=True,
|
|
296
295
|
description="Use embedding similarity for predicate deduplication"
|
|
297
296
|
)
|
|
298
|
-
|
|
297
|
+
use_gliner_extraction: bool = Field(
|
|
299
298
|
default=True,
|
|
300
|
-
description="Use
|
|
299
|
+
description="Use GLiNER2 for predicate/subject/object extraction (model provides structure + coreference)"
|
|
300
|
+
)
|
|
301
|
+
|
|
302
|
+
# GLiNER2 predicate configuration
|
|
303
|
+
predicates: Optional[list[str]] = Field(
|
|
304
|
+
default=None,
|
|
305
|
+
description="Optional list of predefined predicate types for GLiNER2 relation extraction (e.g., ['works_for', 'founded'])"
|
|
306
|
+
)
|
|
307
|
+
use_default_predicates: bool = Field(
|
|
308
|
+
default=True,
|
|
309
|
+
description="Use default predicate taxonomy when no custom predicates provided (enables GLiNER2 relation extraction)"
|
|
301
310
|
)
|
|
302
311
|
|
|
303
312
|
# Verbose logging
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Pipeline module for the extraction pipeline.
|
|
3
|
+
|
|
4
|
+
This module provides the core pipeline infrastructure:
|
|
5
|
+
- PipelineContext: Data container that flows through all stages
|
|
6
|
+
- PipelineConfig: Configuration for stage/plugin selection
|
|
7
|
+
- PluginRegistry: Registration and discovery of plugins
|
|
8
|
+
- ExtractionPipeline: Main orchestrator class
|
|
9
|
+
|
|
10
|
+
Plugins are auto-loaded when this module is imported.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from .context import PipelineContext
|
|
14
|
+
from .config import PipelineConfig
|
|
15
|
+
from .registry import PluginRegistry
|
|
16
|
+
from .orchestrator import ExtractionPipeline
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _load_plugins():
    """Import every plugin subpackage so their registration decorators run."""
    import logging

    try:
        # Importing these modules registers their plugins via the
        # @PluginRegistry decorators; the names themselves are unused.
        # NOTE: import order is preserved deliberately in case registration
        # order matters downstream.
        from ..plugins import (
            splitters,
            extractors,
            qualifiers,
            canonicalizers,
            labelers,
            taxonomy,
        )
    except ImportError as e:
        # Best-effort: optional plugin dependencies may be absent.
        logging.debug(f"Some plugins failed to load: {e}")


# Auto-load plugins on module import
_load_plugins()
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
__all__ = [
|
|
35
|
+
"PipelineContext",
|
|
36
|
+
"PipelineConfig",
|
|
37
|
+
"PluginRegistry",
|
|
38
|
+
"ExtractionPipeline",
|
|
39
|
+
]
|