corp_extractor-0.2.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- corp_extractor-0.2.7.dist-info/METADATA +377 -0
- corp_extractor-0.2.7.dist-info/RECORD +11 -0
- corp_extractor-0.2.7.dist-info/WHEEL +4 -0
- corp_extractor-0.2.7.dist-info/entry_points.txt +3 -0
- statement_extractor/__init__.py +110 -0
- statement_extractor/canonicalization.py +196 -0
- statement_extractor/cli.py +215 -0
- statement_extractor/extractor.py +649 -0
- statement_extractor/models.py +284 -0
- statement_extractor/predicate_comparer.py +611 -0
- statement_extractor/scoring.py +419 -0
statement_extractor/models.py
@@ -0,0 +1,284 @@
"""Pydantic models for statement extraction results."""

from enum import Enum
from pathlib import Path
from typing import Callable, Optional

from pydantic import BaseModel, Field


class EntityType(str, Enum):
    """Supported entity types for subjects and objects."""
    ORG = "ORG"
    PERSON = "PERSON"
    GPE = "GPE"  # Geopolitical entity (countries, cities, states)
    LOC = "LOC"  # Non-GPE locations
    PRODUCT = "PRODUCT"
    EVENT = "EVENT"
    WORK_OF_ART = "WORK_OF_ART"
    LAW = "LAW"
    DATE = "DATE"
    MONEY = "MONEY"
    PERCENT = "PERCENT"
    QUANTITY = "QUANTITY"
    UNKNOWN = "UNKNOWN"


class Entity(BaseModel):
    """An entity (subject or object) with its text and type."""
    text: str = Field(..., description="The entity text")
    type: EntityType = Field(default=EntityType.UNKNOWN, description="The entity type")

    def __str__(self) -> str:
        return f"{self.text} ({self.type.value})"

    def merge_type_from(self, other: "Entity") -> "Entity":
        """
        Return a new Entity with the more specific type.

        If this entity has UNKNOWN type and other has a specific type,
        returns a new entity with this text but other's type.
        Otherwise returns self unchanged.
        """
        if self.type == EntityType.UNKNOWN and other.type != EntityType.UNKNOWN:
            return Entity(text=self.text, type=other.type)
        return self


class Statement(BaseModel):
    """A single extracted statement (subject-predicate-object triple)."""
    subject: Entity = Field(..., description="The subject entity")
    predicate: str = Field(..., description="The relationship/predicate")
    object: Entity = Field(..., description="The object entity")
    source_text: Optional[str] = Field(None, description="The original text this statement was extracted from")

    # Quality scoring fields
    confidence_score: Optional[float] = Field(
        None,
        ge=0.0,
        le=1.0,
        description="Groundedness score (0-1) indicating how well the triple is supported by source text"
    )
    evidence_span: Optional[tuple[int, int]] = Field(
        None,
        description="Character offsets (start, end) in source text where this triple is grounded"
    )
    canonical_predicate: Optional[str] = Field(
        None,
        description="Canonical form of the predicate if taxonomy matching was used"
    )
    was_reversed: bool = Field(
        default=False,
        description="True if subject/object were swapped during reversal detection"
    )

    def __str__(self) -> str:
        return f"{self.subject.text} -- {self.predicate} --> {self.object.text}"

    def as_triple(self) -> tuple[str, str, str]:
        """Return as a simple (subject, predicate, object) tuple."""
        return (self.subject.text, self.predicate, self.object.text)

    def merge_entity_types_from(self, other: "Statement") -> "Statement":
        """
        Return a new Statement with more specific entity types merged from other.

        If this statement has UNKNOWN entity types and other has specific types,
        the returned statement will use the specific types from other.
        All other fields come from self.
        """
        merged_subject = self.subject.merge_type_from(other.subject)
        merged_object = self.object.merge_type_from(other.object)

        # Only create new statement if something changed
        if merged_subject is self.subject and merged_object is self.object:
            return self

        return Statement(
            subject=merged_subject,
            object=merged_object,
            predicate=self.predicate,
            source_text=self.source_text,
            confidence_score=self.confidence_score,
            evidence_span=self.evidence_span,
            canonical_predicate=self.canonical_predicate,
            was_reversed=self.was_reversed,
        )

    def reversed(self) -> "Statement":
        """
        Return a new Statement with subject and object swapped.

        Sets was_reversed=True to indicate the swap occurred.
        """
        return Statement(
            subject=self.object,
            object=self.subject,
            predicate=self.predicate,
            source_text=self.source_text,
            confidence_score=self.confidence_score,
            evidence_span=self.evidence_span,
            canonical_predicate=self.canonical_predicate,
            was_reversed=True,
        )


class ExtractionResult(BaseModel):
    """The result of statement extraction from text."""
    statements: list[Statement] = Field(default_factory=list, description="List of extracted statements")
    source_text: Optional[str] = Field(None, description="The original input text")

    def __len__(self) -> int:
        return len(self.statements)

    def __iter__(self):
        return iter(self.statements)

    def to_triples(self) -> list[tuple[str, str, str]]:
        """Return all statements as simple (subject, predicate, object) tuples."""
        return [stmt.as_triple() for stmt in self.statements]


# =============================================================================
# Predicate Taxonomy & Comparison Configuration
# =============================================================================

class PredicateMatch(BaseModel):
    """Result of matching a predicate to a canonical form."""
    original: str = Field(..., description="The original extracted predicate")
    canonical: Optional[str] = Field(None, description="Matched canonical predicate, if any")
    similarity: float = Field(default=0.0, ge=0.0, le=1.0, description="Cosine similarity score")
    matched: bool = Field(default=False, description="Whether a canonical match was found above threshold")


class PredicateTaxonomy(BaseModel):
    """A taxonomy of canonical predicates for normalization."""
    predicates: list[str] = Field(..., description="List of canonical predicate forms")
    name: Optional[str] = Field(None, description="Optional taxonomy name for identification")

    @classmethod
    def from_file(cls, path: str | Path) -> "PredicateTaxonomy":
        """Load taxonomy from a file (one predicate per line)."""
        with open(path, "r") as f:
            predicates = [line.strip() for line in f if line.strip() and not line.startswith("#")]
        return cls(predicates=predicates)

    @classmethod
    def from_list(cls, predicates: list[str], name: Optional[str] = None) -> "PredicateTaxonomy":
        """Create taxonomy from a list of predicates."""
        return cls(predicates=predicates, name=name)


class PredicateComparisonConfig(BaseModel):
    """Configuration for embedding-based predicate comparison."""
    embedding_model: str = Field(
        default="sentence-transformers/paraphrase-MiniLM-L6-v2",
        description="Sentence-transformers model ID for computing embeddings"
    )
    similarity_threshold: float = Field(
        default=0.5,
        ge=0.0,
        le=1.0,
        description="Minimum cosine similarity to consider a taxonomy match"
    )
    dedup_threshold: float = Field(
        default=0.65,
        ge=0.0,
        le=1.0,
        description="Minimum similarity to consider predicates duplicates"
    )
    normalize_text: bool = Field(
        default=True,
        description="Lowercase and strip predicates before embedding"
    )


# =============================================================================
# Scoring Configuration
# =============================================================================

class ScoringConfig(BaseModel):
    """Configuration for beam scoring and triple quality assessment."""
    quality_weight: float = Field(
        default=1.0,
        ge=0.0,
        description="Weight for groundedness/quality scores in beam selection"
    )
    coverage_weight: float = Field(
        default=0.5,
        ge=0.0,
        description="Weight (β) for coverage bonus - how much source text is explained"
    )
    redundancy_penalty: float = Field(
        default=0.3,
        ge=0.0,
        description="Penalty (γ) for redundant/near-duplicate triples"
    )
    length_penalty: float = Field(
        default=0.1,
        ge=0.0,
        description="Penalty (δ) for verbosity - discourages overly long outputs"
    )
    min_confidence: float = Field(
        default=0.0,
        ge=0.0,
        le=1.0,
        description="Minimum confidence score to keep a triple (0=recall, 0.5=balanced, 0.8=precision)"
    )
    merge_top_n: int = Field(
        default=3,
        ge=1,
        le=10,
        description="Number of top beams to merge when merge_beams=True"
    )


# =============================================================================
# Extraction Options
# =============================================================================

class ExtractionOptions(BaseModel):
    """Options for controlling the extraction process."""

    # Beam search parameters
    num_beams: int = Field(default=4, ge=1, le=16, description="Number of beams for diverse beam search")
    diversity_penalty: float = Field(default=1.0, ge=0.0, description="Penalty for beam diversity")
    max_new_tokens: int = Field(default=2048, ge=128, le=8192, description="Maximum tokens to generate")
    min_statement_ratio: float = Field(default=1.0, ge=0.0, description="Minimum statements per sentence ratio")
    max_attempts: int = Field(default=3, ge=1, le=10, description="Maximum extraction retry attempts")
    deduplicate: bool = Field(default=True, description="Remove duplicate statements")

    # Predicate taxonomy & comparison
    predicate_taxonomy: Optional[PredicateTaxonomy] = Field(
        None,
        description="Optional canonical predicate taxonomy for normalization"
    )
    predicate_config: Optional[PredicateComparisonConfig] = Field(
        None,
        description="Configuration for predicate comparison (uses defaults if not provided)"
    )

    # Scoring configuration
    scoring_config: Optional[ScoringConfig] = Field(
        None,
        description="Configuration for quality scoring and beam selection"
    )

    # Pluggable canonicalization function
    entity_canonicalizer: Optional[Callable[[str], str]] = Field(
        None,
        description="Custom function to canonicalize entity text for deduplication"
    )

    # Mode flags (defaults favor quality)
    merge_beams: bool = Field(
        default=True,
        description="Merge top-N beams instead of selecting single best beam"
    )
    embedding_dedup: bool = Field(
        default=True,
        description="Use embedding similarity for predicate deduplication"
    )

    class Config:
        arbitrary_types_allowed = True  # Allow Callable type