corp-extractor 0.2.0__tar.gz → 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: corp-extractor
3
- Version: 0.2.0
3
+ Version: 0.2.1
4
4
  Summary: Extract structured statements from text using T5-Gemma 2 and Diverse Beam Search
5
5
  Project-URL: Homepage, https://github.com/corp-o-rate/statement-extractor
6
6
  Project-URL: Documentation, https://github.com/corp-o-rate/statement-extractor#readme
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "corp-extractor"
7
- version = "0.2.0"
7
+ version = "0.2.1"
8
8
  description = "Extract structured statements from text using T5-Gemma 2 and Diverse Beam Search"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -6,7 +6,6 @@ Provides:
6
6
  - BeamScorer: Score and select/merge beams based on quality metrics
7
7
  """
8
8
 
9
- import re
10
9
  from typing import Optional
11
10
 
12
11
  from .models import ScoringConfig, Statement
@@ -138,9 +137,6 @@ class TripleScorer:
138
137
  if subj_pos < 0 or obj_pos < 0:
139
138
  return 0.0
140
139
 
141
- # Calculate character distance
142
- distance = abs(subj_pos - obj_pos)
143
-
144
140
  # Check if in same sentence
145
141
  start = min(subj_pos, obj_pos)
146
142
  end = max(subj_pos, obj_pos)
@@ -377,10 +373,17 @@ class BeamScorer:
377
373
  min_conf = self.config.min_confidence
378
374
  filtered = [s for s in all_statements if (s.confidence_score or 0) >= min_conf]
379
375
 
376
+ # Filter out statements where source_text doesn't support the predicate
377
+ # This catches model hallucinations where predicate doesn't match the evidence
378
+ consistent = [
379
+ s for s in filtered
380
+ if self._source_text_supports_predicate(s)
381
+ ]
382
+
380
383
  # Deduplicate - keep highest confidence for each (subject, predicate, object)
381
384
  # Note: Same subject+predicate with different objects is valid (e.g., "Apple announced X and Y")
382
385
  seen: dict[tuple[str, str, str], Statement] = {}
383
- for stmt in filtered:
386
+ for stmt in consistent:
384
387
  key = (
385
388
  stmt.subject.text.lower(),
386
389
  stmt.predicate.lower(),
@@ -390,3 +393,27 @@ class BeamScorer:
390
393
  seen[key] = stmt
391
394
 
392
395
  return list(seen.values())
396
+
397
+ def _source_text_supports_predicate(self, stmt: Statement) -> bool:
398
+ """
399
+ Check if a statement's source_text contains a lexical trigger for its predicate.
400
+
401
+ Returns True if:
402
+ - source_text is None (no requirement to check)
403
+ - source_text contains at least one significant word from the predicate
404
+
405
+ Returns False if:
406
+ - source_text is set but contains no words from the predicate
407
+ """
408
+ if not stmt.source_text:
409
+ return True # No source_text to check
410
+
411
+ predicate_words = stmt.predicate.lower().split()
412
+ source_lower = stmt.source_text.lower()
413
+
414
+ # Check if any significant predicate word appears in source_text
415
+ for word in predicate_words:
416
+ if len(word) > 2 and word in source_lower:
417
+ return True
418
+
419
+ return False
File without changes