corp-extractor 0.2.11__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported public registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in that registry.
- {corp_extractor-0.2.11.dist-info → corp_extractor-0.4.0.dist-info}/METADATA +140 -33
- corp_extractor-0.4.0.dist-info/RECORD +12 -0
- statement_extractor/__init__.py +3 -1
- statement_extractor/cli.py +20 -0
- statement_extractor/extractor.py +312 -22
- statement_extractor/gliner_extraction.py +288 -0
- statement_extractor/models.py +33 -1
- statement_extractor/scoring.py +108 -90
- corp_extractor-0.2.11.dist-info/RECORD +0 -11
- {corp_extractor-0.2.11.dist-info → corp_extractor-0.4.0.dist-info}/WHEEL +0 -0
- {corp_extractor-0.2.11.dist-info → corp_extractor-0.4.0.dist-info}/entry_points.txt +0 -0
statement_extractor/gliner_extraction.py
ADDED
@@ -0,0 +1,288 @@
+"""
+GLiNER2-based triple extraction.
+
+Uses GLiNER2 for relation extraction and entity recognition to extract
+subject, predicate, and object from source text. T5-Gemma model provides
+triple structure and coreference resolution, while GLiNER2 handles
+linguistic analysis.
+
+The GLiNER2 model is loaded automatically on first use.
+"""
+
+import logging
+from typing import Optional
+
+logger = logging.getLogger(__name__)
+
+# Lazy-loaded GLiNER2 model
+_model = None
+
+
+def _get_model():
+    """
+    Lazy-load the GLiNER2 model.
+
+    Uses the base model (205M parameters) which is CPU-optimized.
+    """
+    global _model
+    if _model is None:
+        from gliner2 import GLiNER2
+
+        logger.info("Loading GLiNER2 model 'fastino/gliner2-base-v1'...")
+        _model = GLiNER2.from_pretrained("fastino/gliner2-base-v1")
+        logger.debug("GLiNER2 model loaded")
+    return _model
+
+
+def extract_triple_from_text(
+    source_text: str,
+    model_subject: str,
+    model_object: str,
+    model_predicate: str,
+    predicates: Optional[list[str]] = None,
+) -> tuple[str, str, str] | None:
+    """
+    Extract subject, predicate, object from source text using GLiNER2.
+
+    Returns a GLiNER2-based triple that can be added to the candidate pool
+    alongside the model's triple. The existing scoring/dedup logic will
+    pick the best one.
+
+    Args:
+        source_text: The source sentence to analyze
+        model_subject: Subject from T5-Gemma (used for matching and fallback)
+        model_object: Object from T5-Gemma (used for matching and fallback)
+        model_predicate: Predicate from T5-Gemma (used when no predicates provided)
+        predicates: Optional list of predefined relation types to extract
+
+    Returns:
+        Tuple of (subject, predicate, object) from GLiNER2, or None if extraction fails
+    """
+    if not source_text:
+        return None
+
+    try:
+        model = _get_model()
+
+        if predicates:
+            # Use relation extraction with predefined predicates
+            result = model.extract_relations(source_text, predicates)
+
+            # Find best matching relation
+            relation_data = result.get("relation_extraction", {})
+            best_match = None
+            best_confidence = 0.0
+
+            for rel_type, relations in relation_data.items():
+                for rel in relations:
+                    # Handle both tuple format and dict format
+                    if isinstance(rel, tuple):
+                        head, tail = rel
+                        confidence = 1.0
+                    else:
+                        head = rel.get("head", {}).get("text", "")
+                        tail = rel.get("tail", {}).get("text", "")
+                        confidence = min(
+                            rel.get("head", {}).get("confidence", 0.5),
+                            rel.get("tail", {}).get("confidence", 0.5)
+                        )
+
+                    # Score based on match with model hints
+                    score = confidence
+                    if model_subject.lower() in head.lower() or head.lower() in model_subject.lower():
+                        score += 0.2
+                    if model_object.lower() in tail.lower() or tail.lower() in model_object.lower():
+                        score += 0.2
+
+                    if score > best_confidence:
+                        best_confidence = score
+                        best_match = (head, rel_type, tail)
+
+            if best_match:
+                logger.debug(
+                    f"GLiNER2 extracted (relation): subj='{best_match[0]}', pred='{best_match[1]}', obj='{best_match[2]}'"
+                )
+                return best_match
+
+        else:
+            # No predicate list provided - use GLiNER2 for entity extraction
+            # and extract predicate from source text using the model's hint
+
+            # Extract entities to refine subject/object boundaries
+            entity_types = [
+                "person", "organization", "company", "location", "city", "country",
+                "product", "event", "date", "money", "quantity"
+            ]
+            result = model.extract_entities(source_text, entity_types)
+            entities = result.get("entities", {})
+
+            # Find entities that match model subject/object
+            refined_subject = model_subject
+            refined_object = model_object
+
+            for entity_type, entity_list in entities.items():
+                for entity in entity_list:
+                    entity_lower = entity.lower()
+                    # Check if this entity matches or contains the model's subject/object
+                    if model_subject.lower() in entity_lower or entity_lower in model_subject.lower():
+                        # Use the entity text if it's more complete
+                        if len(entity) >= len(refined_subject):
+                            refined_subject = entity
+                    if model_object.lower() in entity_lower or entity_lower in model_object.lower():
+                        if len(entity) >= len(refined_object):
+                            refined_object = entity
+
+            # Extract predicate from source text using predicate split
+            predicate_result = extract_triple_by_predicate_split(source_text, model_predicate)
+            if predicate_result:
+                _, extracted_predicate, _ = predicate_result
+            else:
+                extracted_predicate = model_predicate
+
+            if extracted_predicate:
+                logger.debug(
+                    f"GLiNER2 extracted (entity-refined): subj='{refined_subject}', pred='{extracted_predicate}', obj='{refined_object}'"
+                )
+                return (refined_subject, extracted_predicate, refined_object)
+
+        return None
+
+    except ImportError as e:
+        logger.warning(f"GLiNER2 not installed: {e}")
+        return None
+    except Exception as e:
+        logger.debug(f"GLiNER2 extraction failed: {e}")
+        return None
+
+
+def extract_triple_by_predicate_split(
+    source_text: str,
+    predicate: str,
+) -> tuple[str, str, str] | None:
+    """
+    Extract subject and object by splitting the source text around the predicate.
+
+    This is useful when the predicate is known but subject/object boundaries
+    are uncertain. Uses the predicate as an anchor point.
+
+    Args:
+        source_text: The source sentence
+        predicate: The predicate (verb phrase) to split on
+
+    Returns:
+        Tuple of (subject, predicate, object) or None if split fails
+    """
+    if not source_text or not predicate:
+        return None
+
+    # Find the predicate in the source text (case-insensitive)
+    source_lower = source_text.lower()
+    pred_lower = predicate.lower()
+
+    pred_pos = source_lower.find(pred_lower)
+    if pred_pos < 0:
+        # Try finding just the main verb (first word of predicate)
+        main_verb = pred_lower.split()[0] if pred_lower.split() else ""
+        if main_verb and len(main_verb) > 2:
+            pred_pos = source_lower.find(main_verb)
+            if pred_pos >= 0:
+                # Adjust to use the actual predicate length for splitting
+                predicate = main_verb
+
+    if pred_pos < 0:
+        return None
+
+    # Extract subject (text before predicate, trimmed)
+    subject = source_text[:pred_pos].strip()
+
+    # Extract object (text after predicate, trimmed)
+    pred_end = pred_pos + len(predicate)
+    obj = source_text[pred_end:].strip()
+
+    # Clean up: remove trailing punctuation from object
+    obj = obj.rstrip('.,;:!?')
+
+    # Clean up: remove leading articles/prepositions from object if very short
+    obj_words = obj.split()
+    if obj_words and obj_words[0].lower() in ('a', 'an', 'the', 'to', 'of', 'for'):
+        if len(obj_words) > 1:
+            obj = ' '.join(obj_words[1:])
+
+    # Validate: both subject and object should have meaningful content
+    if len(subject) < 2 or len(obj) < 2:
+        return None
+
+    logger.debug(
+        f"Predicate-split extracted: subj='{subject}', pred='{predicate}', obj='{obj}'"
+    )
+
+    return (subject, predicate, obj)
+
+
+def score_entity_content(text: str) -> float:
+    """
+    Score how entity-like a text is using GLiNER2 entity recognition.
+
+    Returns:
+        1.0 - Recognized as a named entity with high confidence
+        0.8 - Recognized as an entity with moderate confidence
+        0.6 - Partially recognized or contains entity-like content
+        0.2 - Not recognized as any entity type
+    """
+    if not text or not text.strip():
+        return 0.2
+
+    try:
+        model = _get_model()
+
+        # Check if text is recognized as common entity types
+        entity_types = [
+            "person", "organization", "company", "location", "city", "country",
+            "product", "event", "date", "money", "quantity"
+        ]
+
+        result = model.extract_entities(
+            text,
+            entity_types,
+            include_confidence=True
+        )
+
+        # Result format: {'entities': {'person': [{'text': '...', 'confidence': 0.99}], ...}}
+        entities_dict = result.get("entities", {})
+
+        # Find best matching entity across all types
+        best_confidence = 0.0
+        text_lower = text.lower().strip()
+
+        for entity_type, entity_list in entities_dict.items():
+            for entity in entity_list:
+                if isinstance(entity, dict):
+                    entity_text = entity.get("text", "").lower().strip()
+                    confidence = entity.get("confidence", 0.5)
+                else:
+                    # Fallback for string format
+                    entity_text = str(entity).lower().strip()
+                    confidence = 0.8
+
+                # Check if entity covers most of the input text
+                if entity_text == text_lower:
+                    # Exact match
+                    best_confidence = max(best_confidence, confidence)
+                elif entity_text in text_lower or text_lower in entity_text:
+                    # Partial match - reduce confidence
+                    best_confidence = max(best_confidence, confidence * 0.8)
+
+        if best_confidence >= 0.9:
+            return 1.0
+        elif best_confidence >= 0.7:
+            return 0.8
+        elif best_confidence >= 0.5:
+            return 0.6
+        elif best_confidence > 0:
+            return 0.4
+        else:
+            return 0.2
+
+    except Exception as e:
+        logger.debug(f"Entity scoring failed for '{text}': {e}")
+        return 0.5  # Neutral score on error
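For orientation, a minimal sketch of how the functions added in this new module could be called; the example sentence, the T5-Gemma subject/object/predicate hints, and the predicate list are illustrative assumptions rather than values from the package. extract_triple_from_text returns None when the gliner2 package is not installed or extraction fails, and score_entity_content falls back to a neutral 0.5 on error.

from statement_extractor.gliner_extraction import (
    extract_triple_from_text,
    extract_triple_by_predicate_split,
    score_entity_content,
)

sentence = "Acme Corp acquired Widget Inc in 2021."

# Relation-extraction path: GLiNER2 proposes (head, relation, tail) candidates and the
# one that best matches the model hints is returned
triple = extract_triple_from_text(
    sentence,
    model_subject="Acme Corp",
    model_object="Widget Inc",
    model_predicate="acquired",
    predicates=["acquired", "founded"],
)

# Fallback path: split the sentence around a known predicate
split_triple = extract_triple_by_predicate_split(sentence, "acquired")

# Entity-likeness score (0.2-1.0) used by the updated TripleScorer
subject_score = score_entity_content("Acme Corp")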
statement_extractor/models.py
CHANGED
@@ -24,6 +24,14 @@ class EntityType(str, Enum):
     UNKNOWN = "UNKNOWN"
 
 
+class ExtractionMethod(str, Enum):
+    """Method used to extract the triple components."""
+    HYBRID = "hybrid"  # Model subject/object + GLiNER2 predicate
+    GLINER = "gliner"  # All components from GLiNER2 extraction
+    SPLIT = "split"  # Subject/object from splitting source text around predicate
+    MODEL = "model"  # All components from T5-Gemma model (when GLiNER2 disabled)
+
+
 class Entity(BaseModel):
     """An entity (subject or object) with its text and type."""
     text: str = Field(..., description="The entity text")
@@ -52,12 +60,18 @@ class Statement(BaseModel):
     object: Entity = Field(..., description="The object entity")
     source_text: Optional[str] = Field(None, description="The original text this statement was extracted from")
 
+    # Extraction method tracking
+    extraction_method: ExtractionMethod = Field(
+        default=ExtractionMethod.MODEL,
+        description="Method used to extract this triple (hybrid, spacy, split, or model)"
+    )
+
     # Quality scoring fields
     confidence_score: Optional[float] = Field(
         None,
         ge=0.0,
         le=1.0,
-        description="
+        description="Semantic similarity score (0-1) between source text and reassembled triple"
     )
     evidence_span: Optional[tuple[int, int]] = Field(
         None,
@@ -99,6 +113,7 @@ class Statement(BaseModel):
             object=merged_object,
             predicate=self.predicate,
             source_text=self.source_text,
+            extraction_method=self.extraction_method,
             confidence_score=self.confidence_score,
             evidence_span=self.evidence_span,
             canonical_predicate=self.canonical_predicate,
@@ -116,6 +131,7 @@ class Statement(BaseModel):
             object=self.subject,
             predicate=self.predicate,
             source_text=self.source_text,
+            extraction_method=self.extraction_method,
             confidence_score=self.confidence_score,
             evidence_span=self.evidence_span,
             canonical_predicate=self.canonical_predicate,
@@ -279,6 +295,16 @@ class ExtractionOptions(BaseModel):
         default=True,
         description="Use embedding similarity for predicate deduplication"
     )
+    use_gliner_extraction: bool = Field(
+        default=True,
+        description="Use GLiNER2 for predicate/subject/object extraction (model provides structure + coreference)"
+    )
+
+    # GLiNER2 predicate configuration
+    predicates: Optional[list[str]] = Field(
+        default=None,
+        description="Optional list of predefined predicate types for GLiNER2 relation extraction (e.g., ['works_for', 'founded'])"
+    )
 
     # Verbose logging
     verbose: bool = Field(
@@ -286,5 +312,11 @@ class ExtractionOptions(BaseModel):
         description="Enable verbose logging for debugging"
     )
 
+    # Triple selection
+    all_triples: bool = Field(
+        default=False,
+        description="Keep all candidate triples instead of selecting the highest-scoring one per source"
+    )
+
     class Config:
         arbitrary_types_allowed = True  # Allow Callable type
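As a rough sketch of how the options added above might be set (field names are taken from this diff; other ExtractionOptions fields and the way the options object is passed to the extractor are not shown here, so treat this as illustrative):

from statement_extractor.models import ExtractionOptions

options = ExtractionOptions(
    use_gliner_extraction=True,           # let GLiNER2 refine subject/predicate/object
    predicates=["works_for", "founded"],  # optional predefined relation types for GLiNER2
    all_triples=False,                    # keep only the highest-scoring triple per source
)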
statement_extractor/scoring.py
CHANGED
@@ -2,13 +2,15 @@
 Scoring module for statement extraction quality assessment.
 
 Provides:
-- TripleScorer: Score individual triples
+- TripleScorer: Score individual triples combining semantic similarity and grammatical accuracy
 - BeamScorer: Score and select/merge beams based on quality metrics
 """
 
 import logging
 from typing import Optional
 
+import numpy as np
+
 from .models import ScoringConfig, Statement
 
 logger = logging.getLogger(__name__)
@@ -16,62 +18,126 @@ logger = logging.getLogger(__name__)
 
 class TripleScorer:
     """
-    Score individual triples
-
-
-    -
-    -
-    -
-
+    Score individual triples combining semantic similarity and entity recognition.
+
+    The score is a weighted combination of:
+    - Semantic similarity (50%): Cosine similarity between source text and reassembled triple
+    - Subject entity score (25%): How entity-like the subject is (via GLiNER2)
+    - Object entity score (25%): How entity-like the object is (via GLiNER2)
+
+    Entity scoring (via GLiNER2):
+    - Recognized entity with high confidence: 1.0
+    - Recognized entity with moderate confidence: 0.8
+    - Partially recognized: 0.6
+    - Not recognized: 0.2
     """
 
-    def __init__(
+    def __init__(
+        self,
+        config: Optional[ScoringConfig] = None,
+        device: Optional[str] = None,
+    ):
         self.config = config or ScoringConfig()
 
+        # Auto-detect device
+        if device is None:
+            import torch
+            if torch.cuda.is_available():
+                self.device = "cuda"
+            elif torch.backends.mps.is_available():
+                self.device = "mps"
+            else:
+                self.device = "cpu"
+        else:
+            self.device = device
+
+        # Lazy-loaded embedding model
+        self._model = None
+        self._embedding_model_name = "all-MiniLM-L6-v2"
+
+    def _load_model(self):
+        """Load sentence-transformers model lazily."""
+        if self._model is not None:
+            return
+
+        from sentence_transformers import SentenceTransformer
+
+        logger.debug(f"Loading embedding model: {self._embedding_model_name} on {self.device}")
+        self._model = SentenceTransformer(self._embedding_model_name, device=self.device)
+        logger.debug(f"Embedding model loaded on {self.device}")
+
+    def _compute_embeddings(self, texts: list[str]) -> np.ndarray:
+        """Compute embeddings for a list of texts."""
+        self._load_model()
+        return self._model.encode(texts, convert_to_numpy=True)
+
+    def _cosine_similarity(self, vec1: np.ndarray, vec2: np.ndarray) -> float:
+        """Compute cosine similarity between two vectors."""
+        dot = np.dot(vec1, vec2)
+        norm1 = np.linalg.norm(vec1)
+        norm2 = np.linalg.norm(vec2)
+        if norm1 == 0 or norm2 == 0:
+            return 0.0
+        return float(dot / (norm1 * norm2))
+
+    def _score_noun_content(self, text: str) -> float:
+        """
+        Score how entity-like a text is using GLiNER2 entity recognition.
+
+        Returns:
+            1.0 - Recognized as a named entity with high confidence
+            0.8 - Recognized as an entity with moderate confidence
+            0.6 - Partially recognized or contains entity-like content
+            0.2 - Not recognized as any entity type
+        """
+        if not text or not text.strip():
+            return 0.2
+
+        try:
+            from .gliner_extraction import score_entity_content
+            return score_entity_content(text)
+        except Exception as e:
+            logger.debug(f"Entity scoring failed for '{text}': {e}")
+            return 0.5  # Neutral score on error
+
     def score_triple(self, statement: Statement, source_text: str) -> float:
         """
-        Score a triple's
+        Score a triple's quality (0-1) combining semantic similarity and grammatical accuracy.
+
+        The score is a weighted combination of:
+        - Semantic similarity (50%): How well the triple captures the source meaning
+        - Subject noun score (25%): Grammatical quality of subject
+        - Object noun score (25%): Grammatical quality of object
 
-        Higher scores indicate better
+        Higher scores indicate better overall quality.
         """
-        if
+        # Use statement's source_text if available, otherwise use provided source_text
+        reference_text = statement.source_text or source_text
+        if not reference_text:
             logger.debug(f"  No source text, returning neutral score 0.5")
             return 0.5  # Neutral score if no source text
 
-
-
-
-        #
-
-
-
-
-
-
-        score += 0.3 * (1.0 if object_found else 0.0)
-        weights_sum += 0.3
-
-        # Check predicate has lexical trigger (weight: 0.2)
-        predicate_grounded = self._predicate_has_trigger(statement.predicate, source_text)
-        score += 0.2 * (1.0 if predicate_grounded else 0.0)
-        weights_sum += 0.2
-
-        # Check proximity - subject and object in same/nearby region (weight: 0.2)
-        proximity_score = 0.0
-        if subject_found and object_found:
-            proximity_score = self._compute_proximity(
-                statement.subject.text,
-                statement.object.text,
-                source_text
-            )
-        score += 0.2 * proximity_score
-        weights_sum += 0.2
+        # Reassemble the triple
+        reassembled = f"{statement.subject.text} {statement.predicate} {statement.object.text}"
+
+        # Compute semantic similarity
+        embeddings = self._compute_embeddings([reference_text, reassembled])
+        semantic_similarity = self._cosine_similarity(embeddings[0], embeddings[1])
+
+        # Compute grammatical scores for subject and object
+        subject_noun_score = self._score_noun_content(statement.subject.text)
+        object_noun_score = self._score_noun_content(statement.object.text)
 
-
+        # Weighted combination: 50% semantic, 25% subject, 25% object
+        final_score = (
+            semantic_similarity * 0.5 +
+            subject_noun_score * 0.25 +
+            object_noun_score * 0.25
+        )
 
         logger.debug(
             f"  Score for '{statement.subject.text}' --[{statement.predicate}]--> '{statement.object.text}': "
-            f"{final_score:.
+            f"{final_score:.3f} (semantic={semantic_similarity:.2f}, subj_noun={subject_noun_score:.2f}, obj_noun={object_noun_score:.2f})"
         )
 
         return final_score
@@ -115,54 +181,6 @@ class TripleScorer:
 
         return None
 
-    def _text_appears_in(self, text: str, source: str) -> bool:
-        """Check if text appears in source (case-insensitive)."""
-        return text.lower() in source.lower()
-
-    def _predicate_has_trigger(self, predicate: str, source: str) -> bool:
-        """Check if predicate has a lexical trigger in source."""
-        # Extract main verb/word from predicate
-        words = predicate.lower().split()
-        source_lower = source.lower()
-
-        # Check if any predicate word appears in source
-        for word in words:
-            if len(word) > 2 and word in source_lower:
-                return True
-        return False
-
-    def _compute_proximity(
-        self,
-        subject_text: str,
-        object_text: str,
-        source: str
-    ) -> float:
-        """
-        Compute proximity score (0-1) based on distance between subject and object.
-
-        Returns 1.0 if same sentence, decreasing with distance.
-        """
-        source_lower = source.lower()
-        subj_pos = source_lower.find(subject_text.lower())
-        obj_pos = source_lower.find(object_text.lower())
-
-        if subj_pos < 0 or obj_pos < 0:
-            return 0.0
-
-        # Check if in same sentence
-        start = min(subj_pos, obj_pos)
-        end = max(subj_pos, obj_pos)
-        region = source[start:end]
-
-        # If no sentence boundary between them, high proximity
-        if '.' not in region and '!' not in region and '?' not in region:
-            return 1.0
-
-        # Otherwise, score decreases with distance
-        # Assume ~100 chars per sentence on average
-        sentence_distance = region.count('.') + region.count('!') + region.count('?')
-        return max(0.0, 1.0 - (sentence_distance * 0.2))
-
     def _extend_to_sentence(
         self,
         source: str,
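To make the new weighting concrete with illustrative numbers: a triple whose reassembled text has a semantic similarity of 0.90 to the source, with a subject entity score of 0.8 and an object entity score of 0.6, would score 0.90 × 0.5 + 0.8 × 0.25 + 0.6 × 0.25 = 0.80; if the subject were not recognized as an entity at all (0.2), the same triple would drop to 0.90 × 0.5 + 0.2 × 0.25 + 0.6 × 0.25 = 0.65.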
corp_extractor-0.2.11.dist-info/RECORD
DELETED
@@ -1,11 +0,0 @@
-statement_extractor/__init__.py,sha256=MIZgn-lD9-XGJapzdyYxMhEJFRrTzftbRklrhwA4e8w,2967
-statement_extractor/canonicalization.py,sha256=ZMLs6RLWJa_rOJ8XZ7PoHFU13-zeJkOMDnvK-ZaFa5s,5991
-statement_extractor/cli.py,sha256=NIGCpqcnzF42B16RCiSu4kN0RlnVne2ZAT8341Znt1g,8558
-statement_extractor/extractor.py,sha256=r2gcCfZT43Q8STPuzaXmhbjWXTAs4JwMeAtCjQxlsIQ,25870
-statement_extractor/models.py,sha256=IE3TyIiOl2CINPMroQnGT12rSeQFR0bV3y4BJ79wLmI,10877
-statement_extractor/predicate_comparer.py,sha256=jcuaBi5BYqD3TKoyj3pR9dxtX5ihfDJvjdhEd2LHCwc,26184
-statement_extractor/scoring.py,sha256=xs0SxrV42QNBULQguU1-HhcCc-HnS-ekbcdx7FqWGVk,15663
-corp_extractor-0.2.11.dist-info/METADATA,sha256=D-fs9i9kn4v5bRAHCHxI3cq_6vosNgDCN7uuYwVZztM,13775
-corp_extractor-0.2.11.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-corp_extractor-0.2.11.dist-info/entry_points.txt,sha256=i0iKFqPIusvb-QTQ1zNnFgAqatgVah-jIhahbs5TToQ,115
-corp_extractor-0.2.11.dist-info/RECORD,,
{corp_extractor-0.2.11.dist-info → corp_extractor-0.4.0.dist-info}/WHEEL
File without changes
{corp_extractor-0.2.11.dist-info → corp_extractor-0.4.0.dist-info}/entry_points.txt
File without changes