PyPI - corp-extractor - Versions diffs - 0.2.0__tar.gz → 0.2.1__tar.gz - Mend

corp-extractor 0.2.0.tar.gz → 0.2.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

{corp_extractor-0.2.0 → corp_extractor-0.2.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: corp-extractor
-Version: 0.2.0
+Version: 0.2.1
 Summary: Extract structured statements from text using T5-Gemma 2 and Diverse Beam Search
 Project-URL: Homepage, https://github.com/corp-o-rate/statement-extractor
 Project-URL: Documentation, https://github.com/corp-o-rate/statement-extractor#readme

{corp_extractor-0.2.0 → corp_extractor-0.2.1}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 [project]
 name = "corp-extractor"
-version = "0.2.0"
+version = "0.2.1"
 description = "Extract structured statements from text using T5-Gemma 2 and Diverse Beam Search"
 readme = "README.md"
 requires-python = ">=3.10"

{corp_extractor-0.2.0 → corp_extractor-0.2.1}/src/statement_extractor/scoring.py RENAMED Viewed

@@ -6,7 +6,6 @@ Provides:
 - BeamScorer: Score and select/merge beams based on quality metrics
 """
-import re
 from typing import Optional
 from .models import ScoringConfig, Statement
@@ -138,9 +137,6 @@ class TripleScorer:
         if subj_pos < 0 or obj_pos < 0:
             return 0.0
-        # Calculate character distance
-        distance = abs(subj_pos - obj_pos)
         # Check if in same sentence
         start = min(subj_pos, obj_pos)
         end = max(subj_pos, obj_pos)
@@ -377,10 +373,17 @@ class BeamScorer:
         min_conf = self.config.min_confidence
         filtered = [s for s in all_statements if (s.confidence_score or 0) >= min_conf]
+        # Filter out statements where source_text doesn't support the predicate
+        # This catches model hallucinations where predicate doesn't match the evidence
+        consistent = [
+            s for s in filtered
+            if self._source_text_supports_predicate(s)
+        ]
         # Deduplicate - keep highest confidence for each (subject, predicate, object)
         # Note: Same subject+predicate with different objects is valid (e.g., "Apple announced X and Y")
         seen: dict[tuple[str, str, str], Statement] = {}
-        for stmt in filtered:
+        for stmt in consistent:
             key = (
                 stmt.subject.text.lower(),
                 stmt.predicate.lower(),
@@ -390,3 +393,27 @@ class BeamScorer:
                 seen[key] = stmt
         return list(seen.values())
+    def _source_text_supports_predicate(self, stmt: Statement) -> bool:
+        """
+        Check if a statement's source_text contains a lexical trigger for its predicate.
+        Predicate words longer than two characters are treated as significant and
+        are matched as case-insensitive substrings, so inflected forms in the
+        evidence still count (e.g. "acquire" matches "acquired").
+        When the predicate has no significant words at all (e.g. "is"), its short
+        words are matched against whole tokens of source_text instead, so such
+        statements are not rejected unconditionally.
+        Args:
+            stmt: Statement whose predicate and source_text are compared.
+        Returns True if:
+        - source_text is None or empty (no requirement to check)
+        - source_text contains at least one significant word from the predicate
+        - the predicate has only short words and one of them is a token of source_text
+        Returns False if:
+        - source_text is set but contains no words from the predicate
+          (this includes an empty predicate)
+        """
+        if not stmt.source_text:
+            return True  # No source_text to check
+        source_lower = stmt.source_text.lower()
+        predicate_words = stmt.predicate.lower().split()
+        significant_words = [word for word in predicate_words if len(word) > 2]
+        if significant_words:
+            # Substring match on purpose: tolerates suffixes added by inflection.
+            return any(word in source_lower for word in significant_words)
+        # Only short words remain. A substring test would be far too permissive
+        # here ("is" occurs inside "this"), so compare against whole tokens with
+        # surrounding punctuation stripped.
+        source_tokens = {
+            token.strip(".,;:!?\"'()[]{}") for token in source_lower.split()
+        }
+        return any(word in source_tokens for word in predicate_words)

{corp_extractor-0.2.0 → corp_extractor-0.2.1}/.gitignore RENAMED Viewed

File without changes

{corp_extractor-0.2.0 → corp_extractor-0.2.1}/README.md RENAMED Viewed

File without changes

{corp_extractor-0.2.0 → corp_extractor-0.2.1}/src/statement_extractor/__init__.py RENAMED Viewed

File without changes

{corp_extractor-0.2.0 → corp_extractor-0.2.1}/src/statement_extractor/canonicalization.py RENAMED Viewed

File without changes

{corp_extractor-0.2.0 → corp_extractor-0.2.1}/src/statement_extractor/extractor.py RENAMED Viewed

File without changes

{corp_extractor-0.2.0 → corp_extractor-0.2.1}/src/statement_extractor/models.py RENAMED Viewed

File without changes

{corp_extractor-0.2.0 → corp_extractor-0.2.1}/src/statement_extractor/predicate_comparer.py RENAMED Viewed

File without changes

corp-extractor 0.2.0__tar.gz → 0.2.1__tar.gz

corp-extractor 0.2.0.tar.gz → 0.2.1.tar.gz