corp-extractor 0.2.5__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -83,7 +83,12 @@ class PredicateComparer:
         # Auto-detect device
         if device is None:
             import torch
-            self.device = "cuda" if torch.cuda.is_available() else "cpu"
+            if torch.cuda.is_available():
+                self.device = "cuda"
+            elif torch.backends.mps.is_available():
+                self.device = "mps"
+            else:
+                self.device = "cpu"
         else:
             self.device = device
 
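For reference, the new fallback order can be reproduced as a standalone helper. This is a minimal sketch of the same logic, not an API exposed by the package (torch.backends.mps.is_available() requires PyTorch 1.12+):

import torch

def detect_device() -> str:
    # Same fallback order as the new constructor logic:
    # CUDA first, then Apple-silicon MPS, then CPU.
    if torch.cuda.is_available():
        return "cuda"
    if torch.backends.mps.is_available():
        return "mps"
    return "cpu"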
@@ -289,6 +294,8 @@ class PredicateComparer:
         Returns:
             Deduplicated list of statements (keeps best contextualized match)
         """
+        logger.debug(f"Embedding deduplication: {len(statements)} statements, detect_reversals={detect_reversals}")
+
         if len(statements) <= 1:
             return statements
 
@@ -297,27 +304,33 @@ class PredicateComparer:
                 return entity_canonicalizer(text)
             return text.lower().strip()
 
+        logger.debug(" Computing predicate embeddings...")
         # Compute all predicate embeddings at once for efficiency
         predicates = [s.predicate for s in statements]
         pred_embeddings = self._compute_embeddings(predicates)
+        logger.debug(f" Computed {len(pred_embeddings)} predicate embeddings")
 
+        logger.debug(" Computing contextualized embeddings (S P O)...")
         # Compute contextualized embeddings: "Subject Predicate Object" for each statement
         contextualized_texts = [
             f"{s.subject.text} {s.predicate} {s.object.text}" for s in statements
         ]
         contextualized_embeddings = self._compute_embeddings(contextualized_texts)
 
+        logger.debug(" Computing reversed embeddings (O P S)...")
         # Compute reversed contextualized embeddings: "Object Predicate Subject"
         reversed_texts = [
             f"{s.object.text} {s.predicate} {s.subject.text}" for s in statements
         ]
         reversed_embeddings = self._compute_embeddings(reversed_texts)
 
+        logger.debug(" Computing source text embeddings...")
         # Compute source text embeddings for scoring which duplicate to keep
         source_embeddings = []
         for stmt in statements:
             source_text = stmt.source_text or f"{stmt.subject.text} {stmt.predicate} {stmt.object.text}"
             source_embeddings.append(self._compute_embeddings([source_text])[0])
+        logger.debug(" All embeddings computed, starting comparison loop...")
 
         unique_statements: list[Statement] = []
         unique_pred_embeddings: list[np.ndarray] = []
@@ -358,9 +371,17 @@ class PredicateComparer:
                 if similarity >= self.config.dedup_threshold:
                     duplicate_idx = j
                     is_reversed_match = reversed_match and not direct_match
+                    match_type = "reversed" if is_reversed_match else "direct"
+                    logger.debug(
+                        f" [{i}] DUPLICATE of [{unique_indices[j]}] ({match_type}, sim={similarity:.3f}): "
+                        f"'{stmt.subject.text}' --[{stmt.predicate}]--> '{stmt.object.text}'"
+                    )
                     break
 
             if duplicate_idx is None:
+                logger.debug(
+                    f" [{i}] UNIQUE: '{stmt.subject.text}' --[{stmt.predicate}]--> '{stmt.object.text}'"
+                )
                 # Not a duplicate - add to unique list
                 unique_statements.append(stmt)
                 unique_pred_embeddings.append(pred_embeddings[i])
@@ -451,6 +472,7 @@ class PredicateComparer:
             merged_stmt = existing_stmt.merge_entity_types_from(stmt)
             unique_statements[duplicate_idx] = merged_stmt
 
+        logger.debug(f" Deduplication complete: {len(statements)} -> {len(unique_statements)} statements")
         return unique_statements
 
     def normalize_predicates(
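The reversal detection above compares one statement's "S P O" embedding against another's "O P S" embedding. A minimal sketch of that comparison, assuming the all-MiniLM-L6-v2 sentence-transformers model that the new TripleScorer below uses (the comparer's own model name does not appear in these hunks):

import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

def cosine(a: np.ndarray, b: np.ndarray) -> float:
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# Statement A = (Apple, acquired, Beats): reversed text is "Beats acquired Apple".
# Statement B = (Beats, was acquired by, Apple): contextualized text is below.
# A similarity at or above dedup_threshold flags B as a reversed duplicate of A.
a_reversed = "Beats acquired Apple"
b_context = "Beats was acquired by Apple"
vec_a, vec_b = model.encode([a_reversed, b_context])
print(f"{cosine(vec_a, vec_b):.3f}")  # high similarity -> reversed duplicate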
@@ -2,67 +2,197 @@
 Scoring module for statement extraction quality assessment.
 
 Provides:
-- TripleScorer: Score individual triples for groundedness
+- TripleScorer: Score individual triples combining semantic similarity and grammatical accuracy
 - BeamScorer: Score and select/merge beams based on quality metrics
 """
 
+import logging
 from typing import Optional
 
+import numpy as np
+
 from .models import ScoringConfig, Statement
 
+logger = logging.getLogger(__name__)
+
+# Lazy-loaded spaCy model for grammatical analysis
+_nlp = None
+
+
+def _get_nlp():
+    """Lazy-load spaCy model for POS tagging."""
+    global _nlp
+    if _nlp is None:
+        import spacy
+        try:
+            _nlp = spacy.load("en_core_web_sm", disable=["ner", "lemmatizer"])
+        except OSError:
+            # Model not found, try to download
+            from .spacy_extraction import _download_model
+            if _download_model():
+                _nlp = spacy.load("en_core_web_sm", disable=["ner", "lemmatizer"])
+            else:
+                raise
+    return _nlp
+
 
 class TripleScorer:
     """
-    Score individual triples for groundedness in source text.
-
-    Groundedness is measured by checking:
-    - Subject text appears in source
-    - Object text appears in source
-    - Subject and object are in proximity (same/nearby sentences)
-    - Evidence span exists and is valid
+    Score individual triples combining semantic similarity and grammatical accuracy.
+
+    The score is a weighted combination of:
+    - Semantic similarity (50%): Cosine similarity between source text and reassembled triple
+    - Subject noun score (25%): How noun-like the subject is
+    - Object noun score (25%): How noun-like the object is
+
+    Noun scoring:
+    - Proper noun only (PROPN): 1.0
+    - Common noun only (NOUN): 0.8
+    - Contains noun + other words: 0.6
+    - No noun: 0.2
     """
 
-    def __init__(self, config: Optional[ScoringConfig] = None):
+    def __init__(
+        self,
+        config: Optional[ScoringConfig] = None,
+        device: Optional[str] = None,
+    ):
         self.config = config or ScoringConfig()
 
+        # Auto-detect device
+        if device is None:
+            import torch
+            if torch.cuda.is_available():
+                self.device = "cuda"
+            elif torch.backends.mps.is_available():
+                self.device = "mps"
+            else:
+                self.device = "cpu"
+        else:
+            self.device = device
+
+        # Lazy-loaded embedding model
+        self._model = None
+        self._embedding_model_name = "all-MiniLM-L6-v2"
+
+    def _load_model(self):
+        """Load sentence-transformers model lazily."""
+        if self._model is not None:
+            return
+
+        from sentence_transformers import SentenceTransformer
+
+        logger.debug(f"Loading embedding model: {self._embedding_model_name} on {self.device}")
+        self._model = SentenceTransformer(self._embedding_model_name, device=self.device)
+        logger.debug(f"Embedding model loaded on {self.device}")
+
+    def _compute_embeddings(self, texts: list[str]) -> np.ndarray:
+        """Compute embeddings for a list of texts."""
+        self._load_model()
+        return self._model.encode(texts, convert_to_numpy=True)
+
+    def _cosine_similarity(self, vec1: np.ndarray, vec2: np.ndarray) -> float:
+        """Compute cosine similarity between two vectors."""
+        dot = np.dot(vec1, vec2)
+        norm1 = np.linalg.norm(vec1)
+        norm2 = np.linalg.norm(vec2)
+        if norm1 == 0 or norm2 == 0:
+            return 0.0
+        return float(dot / (norm1 * norm2))
+
+    def _score_noun_content(self, text: str) -> float:
+        """
+        Score how noun-like a text is.
+
+        Returns:
+            1.0 - Entirely proper noun(s)
+            0.8 - Entirely common noun(s)
+            0.6 - Contains noun(s) but also other words
+            0.2 - No nouns found
+        """
+        if not text or not text.strip():
+            return 0.2
+
+        try:
+            nlp = _get_nlp()
+            doc = nlp(text)
+
+            # Count token types (excluding punctuation and spaces)
+            tokens = [t for t in doc if not t.is_punct and not t.is_space]
+            if not tokens:
+                return 0.2
+
+            proper_nouns = sum(1 for t in tokens if t.pos_ == "PROPN")
+            common_nouns = sum(1 for t in tokens if t.pos_ == "NOUN")
+            total_nouns = proper_nouns + common_nouns
+            total_tokens = len(tokens)
+
+            if total_nouns == 0:
+                # No nouns at all
+                return 0.2
+
+            if total_nouns == total_tokens:
+                # Entirely nouns
+                if proper_nouns == total_tokens:
+                    # All proper nouns
+                    return 1.0
+                elif common_nouns == total_tokens:
+                    # All common nouns
+                    return 0.8
+                else:
+                    # Mix of proper and common nouns
+                    return 0.9
+
+            # Contains nouns but also other words
+            # Score based on noun ratio
+            noun_ratio = total_nouns / total_tokens
+            return 0.4 + (noun_ratio * 0.4)  # Range: 0.4 to 0.8
+
+        except Exception as e:
+            logger.debug(f"Noun scoring failed for '{text}': {e}")
+            return 0.5  # Neutral score on error
+
     def score_triple(self, statement: Statement, source_text: str) -> float:
         """
-        Score a triple's groundedness (0-1).
+        Score a triple's quality (0-1) combining semantic similarity and grammatical accuracy.
+
+        The score is a weighted combination of:
+        - Semantic similarity (50%): How well the triple captures the source meaning
+        - Subject noun score (25%): Grammatical quality of subject
+        - Object noun score (25%): Grammatical quality of object
 
-        Higher scores indicate better grounding in source text.
+        Higher scores indicate better overall quality.
         """
-        if not source_text:
+        # Use statement's source_text if available, otherwise use provided source_text
+        reference_text = statement.source_text or source_text
+        if not reference_text:
+            logger.debug(f" No source text, returning neutral score 0.5")
             return 0.5  # Neutral score if no source text
 
-        score = 0.0
-        weights_sum = 0.0
-
-        # Check subject appears in source (weight: 0.3)
-        subject_found = self._text_appears_in(statement.subject.text, source_text)
-        score += 0.3 * (1.0 if subject_found else 0.0)
-        weights_sum += 0.3
-
-        # Check object appears in source (weight: 0.3)
-        object_found = self._text_appears_in(statement.object.text, source_text)
-        score += 0.3 * (1.0 if object_found else 0.0)
-        weights_sum += 0.3
-
-        # Check predicate has lexical trigger (weight: 0.2)
-        predicate_grounded = self._predicate_has_trigger(statement.predicate, source_text)
-        score += 0.2 * (1.0 if predicate_grounded else 0.0)
-        weights_sum += 0.2
-
-        # Check proximity - subject and object in same/nearby region (weight: 0.2)
-        if subject_found and object_found:
-            proximity_score = self._compute_proximity(
-                statement.subject.text,
-                statement.object.text,
-                source_text
-            )
-            score += 0.2 * proximity_score
-            weights_sum += 0.2
+        # Reassemble the triple
+        reassembled = f"{statement.subject.text} {statement.predicate} {statement.object.text}"
+
+        # Compute semantic similarity
+        embeddings = self._compute_embeddings([reference_text, reassembled])
+        semantic_similarity = self._cosine_similarity(embeddings[0], embeddings[1])
+
+        # Compute grammatical scores for subject and object
+        subject_noun_score = self._score_noun_content(statement.subject.text)
+        object_noun_score = self._score_noun_content(statement.object.text)
+
+        # Weighted combination: 50% semantic, 25% subject, 25% object
+        final_score = (
+            semantic_similarity * 0.5 +
+            subject_noun_score * 0.25 +
+            object_noun_score * 0.25
+        )
+
+        logger.debug(
+            f" Score for '{statement.subject.text}' --[{statement.predicate}]--> '{statement.object.text}': "
+            f"{final_score:.3f} (semantic={semantic_similarity:.2f}, subj_noun={subject_noun_score:.2f}, obj_noun={object_noun_score:.2f})"
+        )
 
-        return score / weights_sum if weights_sum > 0 else 0.0
+        return final_score
 
     def find_evidence_span(
         self,
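To see how the new POS-based noun scoring behaves on concrete inputs, here is a self-contained restatement of the same rules as a standalone function; the example outputs are typical taggings, but actual values depend on en_core_web_sm:

import spacy

nlp = spacy.load("en_core_web_sm", disable=["ner", "lemmatizer"])

def noun_score(text: str) -> float:
    # Mirrors _score_noun_content: all-PROPN 1.0, all-NOUN 0.8,
    # mixed nouns 0.9, partial nouns 0.4 + ratio * 0.4, no nouns 0.2.
    tokens = [t for t in nlp(text) if not t.is_punct and not t.is_space]
    if not tokens:
        return 0.2
    propn = sum(t.pos_ == "PROPN" for t in tokens)
    noun = sum(t.pos_ == "NOUN" for t in tokens)
    total = propn + noun
    if total == 0:
        return 0.2
    if total == len(tokens):
        if propn == len(tokens):
            return 1.0
        if noun == len(tokens):
            return 0.8
        return 0.9
    return 0.4 + (total / len(tokens)) * 0.4

# noun_score("Microsoft")        -> 1.0   (all PROPN)
# noun_score("revenue")          -> 0.8   (all NOUN)
# noun_score("the new CEO")      -> ~0.53 (1 noun of 3 tokens: 0.4 + (1/3)*0.4)
# noun_score("quickly expanded") -> 0.2   (no nouns)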
@@ -103,54 +233,6 @@ class TripleScorer:
 
         return None
 
-    def _text_appears_in(self, text: str, source: str) -> bool:
-        """Check if text appears in source (case-insensitive)."""
-        return text.lower() in source.lower()
-
-    def _predicate_has_trigger(self, predicate: str, source: str) -> bool:
-        """Check if predicate has a lexical trigger in source."""
-        # Extract main verb/word from predicate
-        words = predicate.lower().split()
-        source_lower = source.lower()
-
-        # Check if any predicate word appears in source
-        for word in words:
-            if len(word) > 2 and word in source_lower:
-                return True
-        return False
-
-    def _compute_proximity(
-        self,
-        subject_text: str,
-        object_text: str,
-        source: str
-    ) -> float:
-        """
-        Compute proximity score (0-1) based on distance between subject and object.
-
-        Returns 1.0 if same sentence, decreasing with distance.
-        """
-        source_lower = source.lower()
-        subj_pos = source_lower.find(subject_text.lower())
-        obj_pos = source_lower.find(object_text.lower())
-
-        if subj_pos < 0 or obj_pos < 0:
-            return 0.0
-
-        # Check if in same sentence
-        start = min(subj_pos, obj_pos)
-        end = max(subj_pos, obj_pos)
-        region = source[start:end]
-
-        # If no sentence boundary between them, high proximity
-        if '.' not in region and '!' not in region and '?' not in region:
-            return 1.0
-
-        # Otherwise, score decreases with distance
-        # Assume ~100 chars per sentence on average
-        sentence_distance = region.count('.') + region.count('!') + region.count('?')
-        return max(0.0, 1.0 - (sentence_distance * 0.2))
-
     def _extend_to_sentence(
         self,
         source: str,
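Putting the pieces together, the replacement score_triple is a fixed-weight blend of the cosine similarity and the two noun scores. A worked example with an illustrative similarity value (made up for the arithmetic):

# Triple: ("Apple", "acquired", "Beats") scored against its source sentence.
semantic_similarity = 0.82  # illustrative cosine value
subject_noun_score = 1.0    # "Apple" -> all proper noun
object_noun_score = 1.0     # "Beats" -> all proper noun

final_score = (
    semantic_similarity * 0.5
    + subject_noun_score * 0.25
    + object_noun_score * 0.25
)
print(f"{final_score:.2f}")  # 0.41 + 0.25 + 0.25 = 0.91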
@@ -347,10 +429,12 @@ class BeamScorer:
             return []
 
         top_n = top_n or self.config.merge_top_n
+        logger.debug(f"Merging beams: {len(candidates)} candidates, selecting top {top_n}")
 
         # Score each beam
         scored_beams = []
-        for beam in candidates:
+        for i, beam in enumerate(candidates):
+            logger.debug(f" Scoring beam {i} ({len(beam)} statements)...")
             for stmt in beam:
                 if stmt.confidence_score is None:
                     stmt.confidence_score = self.triple_scorer.score_triple(stmt, source_text)
@@ -359,31 +443,36 @@ class BeamScorer:
 
             beam_score = self.score_beam(beam, source_text)
             scored_beams.append((beam_score, beam))
+            logger.debug(f" Beam {i} score: {beam_score:.3f}")
 
         # Sort and take top N
         scored_beams.sort(key=lambda x: x[0], reverse=True)
         top_beams = [beam for _, beam in scored_beams[:top_n]]
+        logger.debug(f" Selected top {len(top_beams)} beams")
 
         # Pool all triples
         all_statements: list[Statement] = []
         for beam in top_beams:
             all_statements.extend(beam)
+        logger.debug(f" Pooled {len(all_statements)} statements from top beams")
 
         # Filter by confidence threshold
         min_conf = self.config.min_confidence
         filtered = [s for s in all_statements if (s.confidence_score or 0) >= min_conf]
+        logger.debug(f" After confidence filter (>={min_conf}): {len(filtered)} statements")
 
-        # Filter out statements where source_text doesn't support the predicate
-        # This catches model hallucinations where predicate doesn't match the evidence
-        consistent = [
-            s for s in filtered
-            if self._source_text_supports_predicate(s)
-        ]
+        # # Filter out statements where source_text doesn't support the predicate
+        # # This catches model hallucinations where predicate doesn't match the evidence
+        # consistent = [
+        #     s for s in filtered
+        #     if self._source_text_supports_predicate(s)
+        # ]
+        # logger.debug(f" After predicate consistency filter: {len(consistent)} statements")
 
         # Deduplicate - keep highest confidence for each (subject, predicate, object)
         # Note: Same subject+predicate with different objects is valid (e.g., "Apple announced X and Y")
         seen: dict[tuple[str, str, str], Statement] = {}
-        for stmt in consistent:
+        for stmt in all_statements:
             key = (
                 stmt.subject.text.lower(),
                 stmt.predicate.lower(),
@@ -392,7 +481,10 @@ class BeamScorer:
             if key not in seen or (stmt.confidence_score or 0) > (seen[key].confidence_score or 0):
                 seen[key] = stmt
 
-        return list(seen.values())
+        result = list(seen.values())
+        logger.debug(f" After deduplication: {len(result)} unique statements")
+
+        return result
 
     def _source_text_supports_predicate(self, stmt: Statement) -> bool:
         """