corp-extractor 0.2.0__tar.gz → 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.2.0 → corp_extractor-0.2.1}/PKG-INFO +1 -1
- {corp_extractor-0.2.0 → corp_extractor-0.2.1}/pyproject.toml +1 -1
- {corp_extractor-0.2.0 → corp_extractor-0.2.1}/src/statement_extractor/scoring.py +32 -5
- {corp_extractor-0.2.0 → corp_extractor-0.2.1}/.gitignore +0 -0
- {corp_extractor-0.2.0 → corp_extractor-0.2.1}/README.md +0 -0
- {corp_extractor-0.2.0 → corp_extractor-0.2.1}/src/statement_extractor/__init__.py +0 -0
- {corp_extractor-0.2.0 → corp_extractor-0.2.1}/src/statement_extractor/canonicalization.py +0 -0
- {corp_extractor-0.2.0 → corp_extractor-0.2.1}/src/statement_extractor/extractor.py +0 -0
- {corp_extractor-0.2.0 → corp_extractor-0.2.1}/src/statement_extractor/models.py +0 -0
- {corp_extractor-0.2.0 → corp_extractor-0.2.1}/src/statement_extractor/predicate_comparer.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: corp-extractor
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.2.1
|
|
4
4
|
Summary: Extract structured statements from text using T5-Gemma 2 and Diverse Beam Search
|
|
5
5
|
Project-URL: Homepage, https://github.com/corp-o-rate/statement-extractor
|
|
6
6
|
Project-URL: Documentation, https://github.com/corp-o-rate/statement-extractor#readme
|
|
@@ -6,7 +6,6 @@ Provides:
|
|
|
6
6
|
- BeamScorer: Score and select/merge beams based on quality metrics
|
|
7
7
|
"""
|
|
8
8
|
|
|
9
|
-
import re
|
|
10
9
|
from typing import Optional
|
|
11
10
|
|
|
12
11
|
from .models import ScoringConfig, Statement
|
|
@@ -138,9 +137,6 @@ class TripleScorer:
|
|
|
138
137
|
if subj_pos < 0 or obj_pos < 0:
|
|
139
138
|
return 0.0
|
|
140
139
|
|
|
141
|
-
# Calculate character distance
|
|
142
|
-
distance = abs(subj_pos - obj_pos)
|
|
143
|
-
|
|
144
140
|
# Check if in same sentence
|
|
145
141
|
start = min(subj_pos, obj_pos)
|
|
146
142
|
end = max(subj_pos, obj_pos)
|
|
@@ -377,10 +373,17 @@ class BeamScorer:
|
|
|
377
373
|
min_conf = self.config.min_confidence
|
|
378
374
|
filtered = [s for s in all_statements if (s.confidence_score or 0) >= min_conf]
|
|
379
375
|
|
|
376
|
+
# Filter out statements where source_text doesn't support the predicate
|
|
377
|
+
# This catches model hallucinations where predicate doesn't match the evidence
|
|
378
|
+
consistent = [
|
|
379
|
+
s for s in filtered
|
|
380
|
+
if self._source_text_supports_predicate(s)
|
|
381
|
+
]
|
|
382
|
+
|
|
380
383
|
# Deduplicate - keep highest confidence for each (subject, predicate, object)
|
|
381
384
|
# Note: Same subject+predicate with different objects is valid (e.g., "Apple announced X and Y")
|
|
382
385
|
seen: dict[tuple[str, str, str], Statement] = {}
|
|
383
|
-
for stmt in filtered:
|
|
386
|
+
for stmt in consistent:
|
|
384
387
|
key = (
|
|
385
388
|
stmt.subject.text.lower(),
|
|
386
389
|
stmt.predicate.lower(),
|
|
@@ -390,3 +393,27 @@ class BeamScorer:
|
|
|
390
393
|
seen[key] = stmt
|
|
391
394
|
|
|
392
395
|
return list(seen.values())
|
|
396
|
+
|
|
397
|
+
def _source_text_supports_predicate(self, stmt: Statement) -> bool:
|
|
398
|
+
"""
|
|
399
|
+
Check if a statement's source_text contains a lexical trigger for its predicate.
|
|
400
|
+
|
|
401
|
+
Returns True if:
|
|
402
|
+
- source_text is None (no requirement to check)
|
|
403
|
+
- source_text contains at least one significant word from the predicate
|
|
404
|
+
|
|
405
|
+
Returns False if:
|
|
406
|
+
- source_text is set but contains no words from the predicate
|
|
407
|
+
"""
|
|
408
|
+
if not stmt.source_text:
|
|
409
|
+
return True # No source_text to check
|
|
410
|
+
|
|
411
|
+
predicate_words = stmt.predicate.lower().split()
|
|
412
|
+
source_lower = stmt.source_text.lower()
|
|
413
|
+
|
|
414
|
+
# Check if any significant predicate word appears in source_text
|
|
415
|
+
for word in predicate_words:
|
|
416
|
+
if len(word) > 2 and word in source_lower:
|
|
417
|
+
return True
|
|
418
|
+
|
|
419
|
+
return False
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|