corp-extractor 0.2.5-py3-none-any.whl → 0.3.0-py3-none-any.whl
- {corp_extractor-0.2.5.dist-info → corp_extractor-0.3.0.dist-info}/METADATA +115 -22
- corp_extractor-0.3.0.dist-info/RECORD +12 -0
- statement_extractor/__init__.py +3 -1
- statement_extractor/cli.py +41 -1
- statement_extractor/extractor.py +381 -26
- statement_extractor/models.py +33 -1
- statement_extractor/predicate_comparer.py +23 -1
- statement_extractor/scoring.py +189 -97
- statement_extractor/spacy_extraction.py +386 -0
- corp_extractor-0.2.5.dist-info/RECORD +0 -11
- {corp_extractor-0.2.5.dist-info → corp_extractor-0.3.0.dist-info}/WHEEL +0 -0
- {corp_extractor-0.2.5.dist-info → corp_extractor-0.3.0.dist-info}/entry_points.txt +0 -0
statement_extractor/predicate_comparer.py
CHANGED

@@ -83,7 +83,12 @@ class PredicateComparer:
         # Auto-detect device
         if device is None:
             import torch
-
+            if torch.cuda.is_available():
+                self.device = "cuda"
+            elif torch.backends.mps.is_available():
+                self.device = "mps"
+            else:
+                self.device = "cpu"
         else:
             self.device = device

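Note: the change above replaces the previous device choice with an explicit CUDA → MPS → CPU fallback chain. A minimal standalone sketch of the same PyTorch detection pattern (the helper name detect_device is illustrative, not part of the package):

import torch

def detect_device(preferred=None):
    # An explicit override wins; otherwise probe accelerators in order of preference.
    if preferred is not None:
        return preferred
    if torch.cuda.is_available():
        return "cuda"   # NVIDIA GPU
    if torch.backends.mps.is_available():
        return "mps"    # Apple Silicon GPU
    return "cpu"

print(detect_device())  # e.g. "cpu" on a machine with no accelerator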
@@ -289,6 +294,8 @@ class PredicateComparer:
         Returns:
             Deduplicated list of statements (keeps best contextualized match)
         """
+        logger.debug(f"Embedding deduplication: {len(statements)} statements, detect_reversals={detect_reversals}")
+
         if len(statements) <= 1:
             return statements

@@ -297,27 +304,33 @@ class PredicateComparer:
                 return entity_canonicalizer(text)
             return text.lower().strip()

+        logger.debug(" Computing predicate embeddings...")
         # Compute all predicate embeddings at once for efficiency
         predicates = [s.predicate for s in statements]
         pred_embeddings = self._compute_embeddings(predicates)
+        logger.debug(f" Computed {len(pred_embeddings)} predicate embeddings")

+        logger.debug(" Computing contextualized embeddings (S P O)...")
         # Compute contextualized embeddings: "Subject Predicate Object" for each statement
         contextualized_texts = [
             f"{s.subject.text} {s.predicate} {s.object.text}" for s in statements
         ]
         contextualized_embeddings = self._compute_embeddings(contextualized_texts)

+        logger.debug(" Computing reversed embeddings (O P S)...")
         # Compute reversed contextualized embeddings: "Object Predicate Subject"
         reversed_texts = [
             f"{s.object.text} {s.predicate} {s.subject.text}" for s in statements
         ]
         reversed_embeddings = self._compute_embeddings(reversed_texts)

+        logger.debug(" Computing source text embeddings...")
         # Compute source text embeddings for scoring which duplicate to keep
         source_embeddings = []
         for stmt in statements:
             source_text = stmt.source_text or f"{stmt.subject.text} {stmt.predicate} {stmt.object.text}"
             source_embeddings.append(self._compute_embeddings([source_text])[0])
+        logger.debug(" All embeddings computed, starting comparison loop...")

         unique_statements: list[Statement] = []
         unique_pred_embeddings: list[np.ndarray] = []
@@ -358,9 +371,17 @@ class PredicateComparer:
                 if similarity >= self.config.dedup_threshold:
                     duplicate_idx = j
                     is_reversed_match = reversed_match and not direct_match
+                    match_type = "reversed" if is_reversed_match else "direct"
+                    logger.debug(
+                        f" [{i}] DUPLICATE of [{unique_indices[j]}] ({match_type}, sim={similarity:.3f}): "
+                        f"'{stmt.subject.text}' --[{stmt.predicate}]--> '{stmt.object.text}'"
+                    )
                     break

             if duplicate_idx is None:
+                logger.debug(
+                    f" [{i}] UNIQUE: '{stmt.subject.text}' --[{stmt.predicate}]--> '{stmt.object.text}'"
+                )
                 # Not a duplicate - add to unique list
                 unique_statements.append(stmt)
                 unique_pred_embeddings.append(pred_embeddings[i])
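Note: the new debug output distinguishes "direct" matches (both statements embedded as Subject Predicate Object) from "reversed" matches (one statement compared against the other's Object Predicate Subject rendering). A minimal sketch of that reversal check using sentence-transformers directly (the model name, sample triples, and any threshold you pick are illustrative assumptions, not the package's config):

from sentence_transformers import SentenceTransformer
import numpy as np

model = SentenceTransformer("all-MiniLM-L6-v2")

def cosine(a, b):
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

s1_spo = "Beats was acquired by Apple"  # statement 1 as extracted (S P O)
s2_spo = "Apple acquired Beats"         # statement 2 as extracted (S P O)
s2_ops = "Beats acquired Apple"         # statement 2 with subject/object swapped (O P S)

emb = model.encode([s1_spo, s2_spo, s2_ops], convert_to_numpy=True)
print(f"direct sim:   {cosine(emb[0], emb[1]):.3f}")
print(f"reversed sim: {cosine(emb[0], emb[2]):.3f}")
# Whichever comparison clears the dedup threshold determines whether the pair
# is logged as a DUPLICATE with match_type "direct" or "reversed".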
@@ -451,6 +472,7 @@ class PredicateComparer:
             merged_stmt = existing_stmt.merge_entity_types_from(stmt)
             unique_statements[duplicate_idx] = merged_stmt

+        logger.debug(f" Deduplication complete: {len(statements)} -> {len(unique_statements)} statements")
         return unique_statements

     def normalize_predicates(
statement_extractor/scoring.py
CHANGED
@@ -2,67 +2,197 @@
 Scoring module for statement extraction quality assessment.

 Provides:
-- TripleScorer: Score individual triples
+- TripleScorer: Score individual triples combining semantic similarity and grammatical accuracy
 - BeamScorer: Score and select/merge beams based on quality metrics
 """

+import logging
 from typing import Optional

+import numpy as np
+
 from .models import ScoringConfig, Statement

+logger = logging.getLogger(__name__)
+
+# Lazy-loaded spaCy model for grammatical analysis
+_nlp = None
+
+
+def _get_nlp():
+    """Lazy-load spaCy model for POS tagging."""
+    global _nlp
+    if _nlp is None:
+        import spacy
+        try:
+            _nlp = spacy.load("en_core_web_sm", disable=["ner", "lemmatizer"])
+        except OSError:
+            # Model not found, try to download
+            from .spacy_extraction import _download_model
+            if _download_model():
+                _nlp = spacy.load("en_core_web_sm", disable=["ner", "lemmatizer"])
+            else:
+                raise
+    return _nlp
+

 class TripleScorer:
     """
-    Score individual triples
-
-
-    -
-    -
-    -
-
+    Score individual triples combining semantic similarity and grammatical accuracy.
+
+    The score is a weighted combination of:
+    - Semantic similarity (50%): Cosine similarity between source text and reassembled triple
+    - Subject noun score (25%): How noun-like the subject is
+    - Object noun score (25%): How noun-like the object is
+
+    Noun scoring:
+    - Proper noun only (PROPN): 1.0
+    - Common noun only (NOUN): 0.8
+    - Contains noun + other words: 0.6
+    - No noun: 0.2
     """

-    def __init__(
+    def __init__(
+        self,
+        config: Optional[ScoringConfig] = None,
+        device: Optional[str] = None,
+    ):
         self.config = config or ScoringConfig()

+        # Auto-detect device
+        if device is None:
+            import torch
+            if torch.cuda.is_available():
+                self.device = "cuda"
+            elif torch.backends.mps.is_available():
+                self.device = "mps"
+            else:
+                self.device = "cpu"
+        else:
+            self.device = device
+
+        # Lazy-loaded embedding model
+        self._model = None
+        self._embedding_model_name = "all-MiniLM-L6-v2"
+
+    def _load_model(self):
+        """Load sentence-transformers model lazily."""
+        if self._model is not None:
+            return
+
+        from sentence_transformers import SentenceTransformer
+
+        logger.debug(f"Loading embedding model: {self._embedding_model_name} on {self.device}")
+        self._model = SentenceTransformer(self._embedding_model_name, device=self.device)
+        logger.debug(f"Embedding model loaded on {self.device}")
+
+    def _compute_embeddings(self, texts: list[str]) -> np.ndarray:
+        """Compute embeddings for a list of texts."""
+        self._load_model()
+        return self._model.encode(texts, convert_to_numpy=True)
+
+    def _cosine_similarity(self, vec1: np.ndarray, vec2: np.ndarray) -> float:
+        """Compute cosine similarity between two vectors."""
+        dot = np.dot(vec1, vec2)
+        norm1 = np.linalg.norm(vec1)
+        norm2 = np.linalg.norm(vec2)
+        if norm1 == 0 or norm2 == 0:
+            return 0.0
+        return float(dot / (norm1 * norm2))
+
+    def _score_noun_content(self, text: str) -> float:
+        """
+        Score how noun-like a text is.
+
+        Returns:
+            1.0 - Entirely proper noun(s)
+            0.8 - Entirely common noun(s)
+            0.6 - Contains noun(s) but also other words
+            0.2 - No nouns found
+        """
+        if not text or not text.strip():
+            return 0.2
+
+        try:
+            nlp = _get_nlp()
+            doc = nlp(text)
+
+            # Count token types (excluding punctuation and spaces)
+            tokens = [t for t in doc if not t.is_punct and not t.is_space]
+            if not tokens:
+                return 0.2
+
+            proper_nouns = sum(1 for t in tokens if t.pos_ == "PROPN")
+            common_nouns = sum(1 for t in tokens if t.pos_ == "NOUN")
+            total_nouns = proper_nouns + common_nouns
+            total_tokens = len(tokens)
+
+            if total_nouns == 0:
+                # No nouns at all
+                return 0.2
+
+            if total_nouns == total_tokens:
+                # Entirely nouns
+                if proper_nouns == total_tokens:
+                    # All proper nouns
+                    return 1.0
+                elif common_nouns == total_tokens:
+                    # All common nouns
+                    return 0.8
+                else:
+                    # Mix of proper and common nouns
+                    return 0.9
+
+            # Contains nouns but also other words
+            # Score based on noun ratio
+            noun_ratio = total_nouns / total_tokens
+            return 0.4 + (noun_ratio * 0.4)  # Range: 0.4 to 0.8
+
+        except Exception as e:
+            logger.debug(f"Noun scoring failed for '{text}': {e}")
+            return 0.5  # Neutral score on error
+
     def score_triple(self, statement: Statement, source_text: str) -> float:
         """
-        Score a triple's
+        Score a triple's quality (0-1) combining semantic similarity and grammatical accuracy.
+
+        The score is a weighted combination of:
+        - Semantic similarity (50%): How well the triple captures the source meaning
+        - Subject noun score (25%): Grammatical quality of subject
+        - Object noun score (25%): Grammatical quality of object

-        Higher scores indicate better
+        Higher scores indicate better overall quality.
         """
-        if
+        # Use statement's source_text if available, otherwise use provided source_text
+        reference_text = statement.source_text or source_text
+        if not reference_text:
+            logger.debug(f" No source text, returning neutral score 0.5")
             return 0.5  # Neutral score if no source text

-
-
-
-        #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            statement.object.text,
-            source_text
-        )
-        score += 0.2 * proximity_score
-        weights_sum += 0.2
+        # Reassemble the triple
+        reassembled = f"{statement.subject.text} {statement.predicate} {statement.object.text}"
+
+        # Compute semantic similarity
+        embeddings = self._compute_embeddings([reference_text, reassembled])
+        semantic_similarity = self._cosine_similarity(embeddings[0], embeddings[1])
+
+        # Compute grammatical scores for subject and object
+        subject_noun_score = self._score_noun_content(statement.subject.text)
+        object_noun_score = self._score_noun_content(statement.object.text)
+
+        # Weighted combination: 50% semantic, 25% subject, 25% object
+        final_score = (
+            semantic_similarity * 0.5 +
+            subject_noun_score * 0.25 +
+            object_noun_score * 0.25
+        )
+
+        logger.debug(
+            f" Score for '{statement.subject.text}' --[{statement.predicate}]--> '{statement.object.text}': "
+            f"{final_score:.3f} (semantic={semantic_similarity:.2f}, subj_noun={subject_noun_score:.2f}, obj_noun={object_noun_score:.2f})"
+        )

-        return
+        return final_score

     def find_evidence_span(
         self,
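Note: a worked example of the new weighting: a triple whose reassembled "Subject Predicate Object" text has cosine similarity 0.90 to the source, an all-proper-noun subject (1.0), and a mixed-wording object (0.6) scores 0.90*0.5 + 1.0*0.25 + 0.6*0.25 = 0.85. The POS rubric can be previewed with spaCy directly; a small sketch assuming en_core_web_sm is installed (the phrases are illustrative):

import spacy

nlp = spacy.load("en_core_web_sm", disable=["ner", "lemmatizer"])

for phrase in ["Tim Cook", "quarterly earnings", "rose sharply"]:
    tokens = [t for t in nlp(phrase) if not t.is_punct and not t.is_space]
    nouns = sum(1 for t in tokens if t.pos_ in ("PROPN", "NOUN"))
    # Mirrors _score_noun_content: all PROPN -> 1.0, all NOUN -> 0.8,
    # mixed noun/non-noun -> 0.4 + 0.4 * noun_ratio, no nouns -> 0.2
    print(phrase, [t.pos_ for t in tokens], f"{nouns}/{len(tokens)} nouns")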
@@ -103,54 +233,6 @@ class TripleScorer:

         return None

-    def _text_appears_in(self, text: str, source: str) -> bool:
-        """Check if text appears in source (case-insensitive)."""
-        return text.lower() in source.lower()
-
-    def _predicate_has_trigger(self, predicate: str, source: str) -> bool:
-        """Check if predicate has a lexical trigger in source."""
-        # Extract main verb/word from predicate
-        words = predicate.lower().split()
-        source_lower = source.lower()
-
-        # Check if any predicate word appears in source
-        for word in words:
-            if len(word) > 2 and word in source_lower:
-                return True
-        return False
-
-    def _compute_proximity(
-        self,
-        subject_text: str,
-        object_text: str,
-        source: str
-    ) -> float:
-        """
-        Compute proximity score (0-1) based on distance between subject and object.
-
-        Returns 1.0 if same sentence, decreasing with distance.
-        """
-        source_lower = source.lower()
-        subj_pos = source_lower.find(subject_text.lower())
-        obj_pos = source_lower.find(object_text.lower())
-
-        if subj_pos < 0 or obj_pos < 0:
-            return 0.0
-
-        # Check if in same sentence
-        start = min(subj_pos, obj_pos)
-        end = max(subj_pos, obj_pos)
-        region = source[start:end]
-
-        # If no sentence boundary between them, high proximity
-        if '.' not in region and '!' not in region and '?' not in region:
-            return 1.0
-
-        # Otherwise, score decreases with distance
-        # Assume ~100 chars per sentence on average
-        sentence_distance = region.count('.') + region.count('!') + region.count('?')
-        return max(0.0, 1.0 - (sentence_distance * 0.2))
-
     def _extend_to_sentence(
         self,
         source: str,
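Note: the removed _compute_proximity heuristic scored subject/object distance by counting sentence boundaries ('.', '!', '?') between their first occurrences in the source: 1.0 when no boundary separates them, minus 0.2 per boundary, floored at 0.0. For example, mentions two sentence boundaries apart scored max(0.0, 1.0 - 2 * 0.2) = 0.6. Together with _text_appears_in and _predicate_has_trigger, these lexical checks are superseded by the embedding-based scoring added to TripleScorer above.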
@@ -347,10 +429,12 @@ class BeamScorer:
             return []

         top_n = top_n or self.config.merge_top_n
+        logger.debug(f"Merging beams: {len(candidates)} candidates, selecting top {top_n}")

         # Score each beam
         scored_beams = []
-        for beam in candidates:
+        for i, beam in enumerate(candidates):
+            logger.debug(f" Scoring beam {i} ({len(beam)} statements)...")
             for stmt in beam:
                 if stmt.confidence_score is None:
                     stmt.confidence_score = self.triple_scorer.score_triple(stmt, source_text)
@@ -359,31 +443,36 @@ class BeamScorer:

             beam_score = self.score_beam(beam, source_text)
             scored_beams.append((beam_score, beam))
+            logger.debug(f" Beam {i} score: {beam_score:.3f}")

         # Sort and take top N
         scored_beams.sort(key=lambda x: x[0], reverse=True)
         top_beams = [beam for _, beam in scored_beams[:top_n]]
+        logger.debug(f" Selected top {len(top_beams)} beams")

         # Pool all triples
         all_statements: list[Statement] = []
         for beam in top_beams:
             all_statements.extend(beam)
+        logger.debug(f" Pooled {len(all_statements)} statements from top beams")

         # Filter by confidence threshold
         min_conf = self.config.min_confidence
         filtered = [s for s in all_statements if (s.confidence_score or 0) >= min_conf]
+        logger.debug(f" After confidence filter (>={min_conf}): {len(filtered)} statements")

-        # Filter out statements where source_text doesn't support the predicate
-        # This catches model hallucinations where predicate doesn't match the evidence
-        consistent = [
-            s for s in filtered
-            if self._source_text_supports_predicate(s)
-        ]
+        # # Filter out statements where source_text doesn't support the predicate
+        # # This catches model hallucinations where predicate doesn't match the evidence
+        # consistent = [
+        #     s for s in filtered
+        #     if self._source_text_supports_predicate(s)
+        # ]
+        # logger.debug(f" After predicate consistency filter: {len(consistent)} statements")

         # Deduplicate - keep highest confidence for each (subject, predicate, object)
         # Note: Same subject+predicate with different objects is valid (e.g., "Apple announced X and Y")
         seen: dict[tuple[str, str, str], Statement] = {}
-        for stmt in consistent:
+        for stmt in all_statements:
             key = (
                 stmt.subject.text.lower(),
                 stmt.predicate.lower(),
@@ -392,7 +481,10 @@ class BeamScorer:
             if key not in seen or (stmt.confidence_score or 0) > (seen[key].confidence_score or 0):
                 seen[key] = stmt

-        return list(seen.values())
+        result = list(seen.values())
+        logger.debug(f" After deduplication: {len(result)} unique statements")
+
+        return result

     def _source_text_supports_predicate(self, stmt: Statement) -> bool:
         """
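Note: the rewritten tail of BeamScorer's merge path deduplicates on the lowercased (subject, predicate, object) key, keeping the highest-confidence statement per key; as this hunk shows, the final loop now iterates all_statements, so the confidence-filtered list and the commented-out predicate-consistency filter no longer affect the result. A minimal sketch of the keep-best-per-key pattern (plain tuples stand in for Statement objects):

statements = [
    ("Apple", "acquired", "Beats", 0.91),
    ("apple", "acquired", "beats", 0.72),   # same fact, different casing
    ("Apple", "announced", "AirPods", 0.80),
]

seen = {}
for subj, pred, obj, conf in statements:
    key = (subj.lower(), pred.lower(), obj.lower())
    if key not in seen or conf > seen[key][3]:
        seen[key] = (subj, pred, obj, conf)

print(list(seen.values()))
# [('Apple', 'acquired', 'Beats', 0.91), ('Apple', 'announced', 'AirPods', 0.80)]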