PyPI - corp-extractor - Versions diffs - 0.2.11__py3-none-any.whl → 0.3.0__py3-none-any.whl - Mend

corp-extractor 0.2.11 (py3-none-any.whl) → 0.3.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

{corp_extractor-0.2.11.dist-info → corp_extractor-0.3.0.dist-info}/METADATA +104 -19
corp_extractor-0.3.0.dist-info/RECORD +12 -0
statement_extractor/__init__.py +3 -1
statement_extractor/cli.py +10 -0
statement_extractor/extractor.py +305 -22
statement_extractor/models.py +27 -1
statement_extractor/scoring.py +160 -90
statement_extractor/spacy_extraction.py +386 -0
corp_extractor-0.2.11.dist-info/RECORD +0 -11
{corp_extractor-0.2.11.dist-info → corp_extractor-0.3.0.dist-info}/WHEEL +0 -0
{corp_extractor-0.2.11.dist-info → corp_extractor-0.3.0.dist-info}/entry_points.txt +0 -0

statement_extractor/scoring.py CHANGED Viewed

@@ -2,76 +2,194 @@
 Scoring module for statement extraction quality assessment.
 Provides:
-- TripleScorer: Score individual triples for groundedness
+- TripleScorer: Score individual triples combining semantic similarity and grammatical accuracy
 - BeamScorer: Score and select/merge beams based on quality metrics
 """
 import logging
 from typing import Optional
+import numpy as np
 from .models import ScoringConfig, Statement
 logger = logging.getLogger(__name__)
+# Lazy-loaded spaCy model for grammatical analysis
+_nlp = None
+def _get_nlp():
+    """
+    Return the spaCy pipeline used for POS tagging, loading it on first use.
+    Delegates to the loader in ``spacy_extraction`` (same model, same
+    disabled components) so both modules share one pipeline instance
+    instead of each loading its own copy.
+    Returns:
+        The loaded ``en_core_web_sm`` pipeline.
+    Raises:
+        OSError: If the model is missing and the automatic download fails.
+    """
+    global _nlp
+    if _nlp is None:
+        # Deferred import: the spaCy loader is only needed once scoring runs.
+        from .spacy_extraction import _get_nlp as _load_shared_nlp
+        _nlp = _load_shared_nlp()
+    return _nlp
 class TripleScorer:
     """
-    Score individual triples for groundedness in source text.
-    Groundedness is measured by checking:
-    - Subject text appears in source
-    - Object text appears in source
-    - Subject and object are in proximity (same/nearby sentences)
-    - Evidence span exists and is valid
+    Score individual triples combining semantic similarity and grammatical accuracy.
+    The score is a weighted combination of:
+    - Semantic similarity (50%): Cosine similarity between source text and reassembled triple
+    - Subject noun score (25%): How noun-like the subject is
+    - Object noun score (25%): How noun-like the object is
+    Noun scoring:
+    - Proper noun only (PROPN): 1.0
+    - Common noun only (NOUN): 0.8
+    - Contains noun + other words: 0.6
+    - No noun: 0.2
     """
-    def __init__(self, config: Optional[ScoringConfig] = None):
+    def __init__(
+        self,
+        config: Optional[ScoringConfig] = None,
+        device: Optional[str] = None,
+    ):
+        """
+        Set up the scorer; the embedding model itself is loaded on first use.
+        Args:
+            config: Scoring configuration; a default ScoringConfig is used when omitted.
+            device: Device name passed to the embedding model (e.g. "cuda", "mps", "cpu").
+                When None, CUDA is preferred, then MPS, then CPU.
+        """
         self.config = config or ScoringConfig()
+        # Auto-detect device
+        if device is None:
+            import torch
+            # torch.backends.mps is absent on torch builds without MPS support,
+            # so look it up defensively instead of assuming it exists.
+            mps_backend = getattr(torch.backends, "mps", None)
+            if torch.cuda.is_available():
+                self.device = "cuda"
+            elif mps_backend is not None and mps_backend.is_available():
+                self.device = "mps"
+            else:
+                self.device = "cpu"
+        else:
+            self.device = device
+        # Lazy-loaded embedding model
+        self._model = None
+        self._embedding_model_name = "all-MiniLM-L6-v2"
+    def _load_model(self):
+        """Create the sentence-transformers encoder on first call; later calls do nothing."""
+        if self._model is None:
+            # Imported here so the dependency is only paid for when scoring runs.
+            from sentence_transformers import SentenceTransformer
+            logger.debug(f"Loading embedding model: {self._embedding_model_name} on {self.device}")
+            encoder = SentenceTransformer(self._embedding_model_name, device=self.device)
+            self._model = encoder
+            logger.debug(f"Embedding model loaded on {self.device}")
+    def _compute_embeddings(self, texts: list[str]) -> np.ndarray:
+        """Encode ``texts`` with the lazily loaded model and return NumPy output."""
+        self._load_model()
+        encoder = self._model
+        vectors = encoder.encode(texts, convert_to_numpy=True)
+        return vectors
+    def _cosine_similarity(self, vec1: np.ndarray, vec2: np.ndarray) -> float:
+        """
+        Return the cosine of the angle between ``vec1`` and ``vec2``.
+        A zero-length vector has no direction, so 0.0 is returned if either
+        input has zero norm.
+        """
+        numerator = np.dot(vec1, vec2)
+        lengths = (np.linalg.norm(vec1), np.linalg.norm(vec2))
+        if lengths[0] == 0 or lengths[1] == 0:
+            return 0.0
+        denominator = lengths[0] * lengths[1]
+        return float(numerator / denominator)
+    def _score_noun_content(self, text: str) -> float:
+        """
+        Rate how noun-like ``text`` is, using spaCy part-of-speech tags.
+        Punctuation and whitespace tokens are ignored when counting.
+        Returns:
+            1.0 - every token is a proper noun (PROPN)
+            0.9 - every token is a noun, mixing PROPN and NOUN
+            0.8 - every token is a common noun (NOUN)
+            0.4 + 0.4 * noun_ratio - nouns mixed with other words
+                (strictly between 0.4 and 0.8)
+            0.2 - empty/blank text, no countable tokens, or no nouns
+            0.5 - tagging raised an exception (neutral fallback)
+        """
+        if not text or not text.strip():
+            return 0.2
+        try:
+            words = [
+                tok for tok in _get_nlp()(text)
+                if not (tok.is_punct or tok.is_space)
+            ]
+            if not words:
+                return 0.2
+            n_words = len(words)
+            n_proper = sum(1 for tok in words if tok.pos_ == "PROPN")
+            n_common = sum(1 for tok in words if tok.pos_ == "NOUN")
+            n_nouns = n_proper + n_common
+            if n_nouns == 0:
+                return 0.2
+            if n_nouns != n_words:
+                # Partial noun content: scale linearly with the noun share.
+                noun_ratio = n_nouns / n_words
+                return 0.4 + (noun_ratio * 0.4)
+            # Every token is a noun; tell proper, common and mixed apart.
+            if n_proper == n_words:
+                return 1.0
+            if n_common == n_words:
+                return 0.8
+            return 0.9
+        except Exception as e:
+            logger.debug(f"Noun scoring failed for '{text}': {e}")
+            return 0.5  # Neutral score on error
     def score_triple(self, statement: Statement, source_text: str) -> float:
         """
-        Score a triple's groundedness (0-1).
+        Score a triple's quality (0-1) combining semantic similarity and grammatical accuracy.
-        Higher scores indicate better grounding in source text.
+        The score is a weighted combination of:
+        - Semantic similarity (50%): How well the triple captures the source meaning
+        - Subject noun score (25%): Grammatical quality of subject
+        - Object noun score (25%): Grammatical quality of object
+        Higher scores indicate better overall quality.
+        Cosine similarity can be negative (or exceed 1 by rounding error), so
+        it is clamped to [0, 1] to keep the result in the documented range.
+        Args:
+            statement: Triple to score; its own source_text is used as the
+                reference when set.
+            source_text: Fallback reference text when the statement has none.
+        Returns:
+            Score between 0 and 1; 0.5 when no reference text is available.
         """
-        if not source_text:
+        # Use statement's source_text if available, otherwise use provided source_text
+        reference_text = statement.source_text or source_text
+        if not reference_text:
             logger.debug(f"  No source text, returning neutral score 0.5")
             return 0.5  # Neutral score if no source text
-        score = 0.0
-        weights_sum = 0.0
-        # Check subject appears in source (weight: 0.3)
-        subject_found = self._text_appears_in(statement.subject.text, source_text)
-        score += 0.3 * (1.0 if subject_found else 0.0)
-        weights_sum += 0.3
-        # Check object appears in source (weight: 0.3)
-        object_found = self._text_appears_in(statement.object.text, source_text)
-        score += 0.3 * (1.0 if object_found else 0.0)
-        weights_sum += 0.3
-        # Check predicate has lexical trigger (weight: 0.2)
-        predicate_grounded = self._predicate_has_trigger(statement.predicate, source_text)
-        score += 0.2 * (1.0 if predicate_grounded else 0.0)
-        weights_sum += 0.2
-        # Check proximity - subject and object in same/nearby region (weight: 0.2)
-        proximity_score = 0.0
-        if subject_found and object_found:
-            proximity_score = self._compute_proximity(
-                statement.subject.text,
-                statement.object.text,
-                source_text
-            )
-            score += 0.2 * proximity_score
-        weights_sum += 0.2
+        # Reassemble the triple
+        reassembled = f"{statement.subject.text} {statement.predicate} {statement.object.text}"
+        # Compute semantic similarity
+        embeddings = self._compute_embeddings([reference_text, reassembled])
+        semantic_similarity = self._cosine_similarity(embeddings[0], embeddings[1])
+        # Clamp so an unrelated (negative-cosine) pair cannot push the final score below 0
+        semantic_similarity = min(1.0, max(0.0, semantic_similarity))
-        final_score = score / weights_sum if weights_sum > 0 else 0.0
+        # Compute grammatical scores for subject and object
+        subject_noun_score = self._score_noun_content(statement.subject.text)
+        object_noun_score = self._score_noun_content(statement.object.text)
+        # Weighted combination: 50% semantic, 25% subject, 25% object
+        final_score = (
+            semantic_similarity * 0.5 +
+            subject_noun_score * 0.25 +
+            object_noun_score * 0.25
+        )
         logger.debug(
             f"  Score for '{statement.subject.text}' --[{statement.predicate}]--> '{statement.object.text}': "
-            f"{final_score:.2f} (subj={subject_found}, obj={object_found}, pred={predicate_grounded}, prox={proximity_score:.2f})"
+            f"{final_score:.3f} (semantic={semantic_similarity:.2f}, subj_noun={subject_noun_score:.2f}, obj_noun={object_noun_score:.2f})"
         )
         return final_score
@@ -115,54 +233,6 @@ class TripleScorer:
         return None
-    def _text_appears_in(self, text: str, source: str) -> bool:
-        """Check if text appears in source (case-insensitive)."""
-        return text.lower() in source.lower()
-    def _predicate_has_trigger(self, predicate: str, source: str) -> bool:
-        """Check if predicate has a lexical trigger in source."""
-        # Extract main verb/word from predicate
-        words = predicate.lower().split()
-        source_lower = source.lower()
-        # Check if any predicate word appears in source
-        for word in words:
-            if len(word) > 2 and word in source_lower:
-                return True
-        return False
-    def _compute_proximity(
-        self,
-        subject_text: str,
-        object_text: str,
-        source: str
-    ) -> float:
-        """
-        Compute proximity score (0-1) based on distance between subject and object.
-        Returns 1.0 if same sentence, decreasing with distance.
-        """
-        source_lower = source.lower()
-        subj_pos = source_lower.find(subject_text.lower())
-        obj_pos = source_lower.find(object_text.lower())
-        if subj_pos < 0 or obj_pos < 0:
-            return 0.0
-        # Check if in same sentence
-        start = min(subj_pos, obj_pos)
-        end = max(subj_pos, obj_pos)
-        region = source[start:end]
-        # If no sentence boundary between them, high proximity
-        if '.' not in region and '!' not in region and '?' not in region:
-            return 1.0
-        # Otherwise, score decreases with distance
-        # Assume ~100 chars per sentence on average
-        sentence_distance = region.count('.') + region.count('!') + region.count('?')
-        return max(0.0, 1.0 - (sentence_distance * 0.2))
     def _extend_to_sentence(
         self,
         source: str,

statement_extractor/spacy_extraction.py ADDED Viewed

@@ -0,0 +1,386 @@
+"""
+spaCy-based triple extraction.
+Uses spaCy dependency parsing to extract subject, predicate, and object
+from source text. T5-Gemma model provides triple structure and coreference
+resolution, while spaCy handles linguistic analysis.
+The spaCy model is downloaded automatically on first use.
+"""
+import logging
+from typing import Optional
+logger = logging.getLogger(__name__)
+# Lazy-loaded spaCy model
+_nlp = None
+def _download_model():
+    """
+    Install the ``en_core_web_sm`` spaCy model for the running interpreter.
+    Tries, in order: ``uv pip install``, ``python -m pip install`` and
+    ``spacy.cli.download``. Every attempt is best-effort: a failure is
+    logged at debug level and the next strategy is tried.
+    Returns:
+        True if one strategy reported success, False if all of them failed.
+    """
+    import importlib
+    import shutil
+    import subprocess
+    import sys
+    # Direct URL to the spaCy model wheel
+    MODEL_URL = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl"
+    logger.info("Downloading spaCy model 'en_core_web_sm'...")
+    # (name used in the success message, prefix of the failure message, command)
+    installers = []
+    # Try uv first (for uv-managed environments)
+    uv_path = shutil.which("uv")
+    if uv_path:
+        # Pin the target interpreter: without --python, uv installs into the
+        # environment it discovers itself, which may not be the one running here.
+        installers.append((
+            "uv",
+            "uv pip install",
+            [uv_path, "pip", "install", "--python", sys.executable, MODEL_URL],
+        ))
+    # Then pip from the running interpreter
+    installers.append((
+        "pip",
+        "pip install",
+        [sys.executable, "-m", "pip", "install", MODEL_URL],
+    ))
+    for via, failure_prefix, command in installers:
+        try:
+            result = subprocess.run(command, capture_output=True, text=True)
+        except Exception as e:
+            logger.debug(f"{failure_prefix} failed: {e}")
+            continue
+        if result.returncode == 0:
+            # The package was installed after interpreter start-up; refresh the
+            # import system's caches so the follow-up spacy.load() can find it.
+            importlib.invalidate_caches()
+            logger.info(f"Successfully downloaded spaCy model via {via}")
+            return True
+        logger.debug(f"{failure_prefix} failed: {result.stderr}")
+    # Try spacy's download as last resort
+    try:
+        from spacy.cli import download
+        download("en_core_web_sm")
+        importlib.invalidate_caches()
+        # Check if it actually worked
+        import spacy
+        spacy.load("en_core_web_sm")
+        logger.info("Successfully downloaded spaCy model via spacy")
+        return True
+    except (Exception, SystemExit) as e:
+        # spaCy's CLI helpers may call sys.exit() on failure; SystemExit is not
+        # an Exception subclass and would otherwise terminate the host process.
+        logger.debug(f"spacy download failed: {e}")
+    logger.warning(
+        "Failed to download spaCy model automatically. "
+        f"Please run: uv pip install {MODEL_URL}"
+    )
+    return False
+def _get_nlp():
+    """
+    Return the module-wide spaCy pipeline, creating it on the first call.
+    The pipeline is loaded with the NER and lemmatizer components disabled.
+    If the model package is missing, one automatic download is attempted
+    before giving up.
+    Raises:
+        OSError: If the model is not installed and the download fails.
+    """
+    global _nlp
+    if _nlp is not None:
+        return _nlp
+    import spacy
+    try:
+        _nlp = spacy.load("en_core_web_sm", disable=["ner", "lemmatizer"])
+        logger.debug("Loaded spaCy model for extraction")
+    except OSError:
+        # Model package is absent: fetch it, then retry the load once.
+        if not _download_model():
+            raise OSError(
+                "spaCy model not found and automatic download failed. "
+                "Please run: python -m spacy download en_core_web_sm"
+            )
+        _nlp = spacy.load("en_core_web_sm", disable=["ner", "lemmatizer"])
+        logger.debug("Loaded spaCy model after download")
+    return _nlp
+def _get_full_noun_phrase(token) -> str:
+    """
+    Build the noun phrase headed by ``token``.
+    Keeps the head token plus every token in its subtree whose dependency
+    label is compound, amod, det, poss, nummod or nmod, and joins their
+    texts with single spaces in document order.
+    """
+    modifier_deps = ("compound", "amod", "det", "poss", "nummod", "nmod")
+    members = sorted(
+        (node for node in token.subtree if node.dep_ in modifier_deps or node == token),
+        key=lambda node: node.i,
+    )
+    return " ".join(node.text for node in members)
+def _extract_verb_phrase(verb_token) -> str:
+    """
+    Assemble the verb phrase around ``verb_token``.
+    Auxiliaries ("aux"/"auxpass") that precede the verb, the verb itself,
+    and particles ("prt") or passive agent markers ("agent", e.g. "by")
+    that follow it are joined with spaces in document order.
+    """
+    verb_index = verb_token.i
+    leading = [
+        (kid.i, kid.text)
+        for kid in verb_token.children
+        if kid.dep_ in ("aux", "auxpass") and kid.i < verb_index
+    ]
+    trailing = [
+        (kid.i, kid.text)
+        for kid in verb_token.children
+        if kid.dep_ in ("prt", "agent") and kid.i > verb_index
+    ]
+    pieces = sorted(
+        leading + [(verb_index, verb_token.text)] + trailing,
+        key=lambda piece: piece[0],
+    )
+    return " ".join(text for _, text in pieces)
+def _match_entity_boundaries(
+    spacy_text: str,
+    model_text: str,
+    source_text: str,
+) -> str:
+    """
+    Choose between the spaCy span and the model's span for one entity.
+    Resolution order:
+    1. model text contains spaCy text (case-insensitive) -> model text
+       (e.g. "Apple" -> "Apple Inc.")
+    2. spaCy text contains model text (case-insensitive) -> spaCy text
+    3. spaCy text occurs verbatim in the source -> spaCy text
+    4. model text occurs verbatim in the source -> model text
+    5. otherwise -> spaCy text
+    """
+    spacy_folded, model_folded = spacy_text.lower(), model_text.lower()
+    if spacy_folded in model_folded:
+        return model_text
+    if model_folded in spacy_folded or spacy_text in source_text:
+        return spacy_text
+    return model_text if model_text in source_text else spacy_text
+def _extract_spacy_triple(doc, model_subject: str, model_object: str, source_text: str) -> tuple[str | None, str | None, str | None]:
+    """
+    Pull a (subject, predicate, object) triple out of a parsed spaCy doc.
+    The first token whose dependency label is "ROOT" anchors the search:
+    - predicate: the verb phrase for a VERB root, the bare token text for
+      an AUX root, otherwise None
+    - subject: first "nsubj"/"nsubjpass" child of the root
+    - object: first "dobj"/"attr"/"oprd" child, or the first "pobj" under a
+      "prep"/"agent" child, whichever child comes first
+    A subject or object that was found is then reconciled with the model's
+    span via _match_entity_boundaries. Elements that are not found are None.
+    """
+    subject_deps = ("nsubj", "nsubjpass")
+    def first_subject_phrase(head):
+        # Noun phrase of the first subject-like child of ``head``, if any.
+        for dependent in head.children:
+            if dependent.dep_ in subject_deps:
+                return _get_full_noun_phrase(dependent)
+        return None
+    root = next((tok for tok in doc if tok.dep_ == "ROOT"), None)
+    if root is None:
+        return None, None, None
+    # Predicate comes from the root itself
+    if root.pos_ == "VERB":
+        predicate = _extract_verb_phrase(root)
+    elif root.pos_ == "AUX":
+        predicate = root.text
+    else:
+        predicate = None
+    subject = first_subject_phrase(root)
+    # NOTE(review): a spaCy ROOT token is normally its own head, so this
+    # fallback looks unreachable in practice - confirm before relying on it.
+    if subject is None and root.head != root:
+        subject = first_subject_phrase(root.head)
+    obj = None
+    for dependent in root.children:
+        label = dependent.dep_
+        if label in ("dobj", "attr", "oprd"):
+            obj = _get_full_noun_phrase(dependent)
+            break
+        if label not in ("prep", "agent"):
+            continue
+        # Object sits one level down, under the preposition / passive agent
+        nested = next((g for g in dependent.children if g.dep_ == "pobj"), None)
+        if nested is not None:
+            obj = _get_full_noun_phrase(nested)
+        if obj:
+            break
+    # Match against model values for better entity boundaries
+    if subject:
+        subject = _match_entity_boundaries(subject, model_subject, source_text)
+    if obj:
+        obj = _match_entity_boundaries(obj, model_object, source_text)
+    return subject, predicate, obj
+def extract_triple_from_text(
+    source_text: str,
+    model_subject: str,
+    model_object: str,
+    model_predicate: str,
+) -> tuple[str, str, str] | None:
+    """
+    Derive a (subject, predicate, object) candidate from ``source_text`` via spaCy.
+    The result is meant to sit in the candidate pool next to the model's own
+    triple so that the existing scoring/dedup logic can choose between them.
+    Args:
+        source_text: The source sentence to analyze
+        model_subject: Subject from T5-Gemma; used for entity boundary matching
+            and as the fallback when spaCy finds no subject
+        model_object: Object from T5-Gemma; used for entity boundary matching
+            and as the fallback when spaCy finds no object
+        model_predicate: Predicate from T5-Gemma (unused, kept for API compat)
+    Returns:
+        Tuple of (subject, predicate, object), or None when the text is empty,
+        no predicate was found, or spaCy loading/parsing raised.
+    """
+    if not source_text:
+        return None
+    try:
+        doc = _get_nlp()(source_text)
+        spacy_subject, spacy_predicate, spacy_object = _extract_spacy_triple(
+            doc, model_subject, model_object, source_text
+        )
+        # Without a predicate there is nothing worth returning
+        if not spacy_predicate:
+            return None
+        logger.debug(
+            f"spaCy extracted: subj='{spacy_subject}', pred='{spacy_predicate}', obj='{spacy_object}'"
+        )
+        subject = spacy_subject or model_subject
+        obj = spacy_object or model_object
+        return (subject, spacy_predicate, obj)
+    except OSError as e:
+        logger.debug(f"Cannot load spaCy model: {e}")
+        return None
+    except Exception as e:
+        logger.debug(f"spaCy extraction failed: {e}")
+        return None
+def extract_triple_by_predicate_split(
+    source_text: str,
+    predicate: str,
+) -> tuple[str, str, str] | None:
+    """
+    Extract subject and object by splitting the source text around the predicate.
+    This is useful when the predicate is known but subject/object boundaries
+    are uncertain. Uses the predicate as an anchor point.
+    The predicate is matched case-insensitively and only on word boundaries,
+    so a short predicate such as "is" cannot match inside "This". If the
+    full predicate is not found, its first word is tried (when longer than
+    two characters) and, on success, returned as the predicate in lowercase.
+    Args:
+        source_text: The source sentence
+        predicate: The predicate (verb phrase) to split on
+    Returns:
+        Tuple of (subject, predicate, object) or None if the predicate is not
+        found or either side is shorter than two characters
+    """
+    import re
+    if not source_text or not predicate:
+        return None
+    def _find_phrase(phrase: str):
+        # Search the original text (not a lowercased copy) so the match
+        # offsets are always valid indices into source_text.
+        pattern = r"(?<!\w)" + re.escape(phrase) + r"(?!\w)"
+        return re.search(pattern, source_text, re.IGNORECASE)
+    match = _find_phrase(predicate)
+    if match is None:
+        # Try finding just the main verb (first word of predicate)
+        pred_words = predicate.lower().split()
+        main_verb = pred_words[0] if pred_words else ""
+        if main_verb and len(main_verb) > 2:
+            match = _find_phrase(main_verb)
+            if match is not None:
+                predicate = main_verb
+    if match is None:
+        return None
+    # Extract subject (text before predicate, trimmed)
+    subject = source_text[:match.start()].strip()
+    # Extract object (text after predicate, trimmed)
+    obj = source_text[match.end():].strip()
+    # Clean up: remove trailing punctuation from object
+    obj = obj.rstrip('.,;:!?')
+    # Clean up: drop one leading article/preposition unless it is the only word
+    obj_words = obj.split()
+    if len(obj_words) > 1 and obj_words[0].lower() in ('a', 'an', 'the', 'to', 'of', 'for'):
+        obj = ' '.join(obj_words[1:])
+    # Validate: both subject and object should have meaningful content
+    if len(subject) < 2 or len(obj) < 2:
+        return None
+    logger.debug(
+        f"Predicate-split extracted: subj='{subject}', pred='{predicate}', obj='{obj}'"
+    )
+    return (subject, predicate, obj)
+# Keep old function for backwards compatibility
+def infer_predicate(
+    subject: str,
+    obj: str,
+    source_text: str,
+) -> Optional[str]:
+    """
+    Return only the predicate that spaCy extracts from ``source_text``.
+    Thin wrapper around extract_triple_from_text that discards the subject
+    and object; yields None when no triple or no predicate is produced.
+    DEPRECATED: Use extract_triple_from_text instead.
+    """
+    triple = extract_triple_from_text(
+        source_text=source_text,
+        model_subject=subject,
+        model_object=obj,
+        model_predicate="",
+    )
+    if not triple:
+        return None
+    _, found_predicate, _ = triple
+    return found_predicate or None

corp_extractor-0.2.11.dist-info/RECORD DELETED Viewed

@@ -1,11 +0,0 @@
-statement_extractor/__init__.py,sha256=MIZgn-lD9-XGJapzdyYxMhEJFRrTzftbRklrhwA4e8w,2967
-statement_extractor/canonicalization.py,sha256=ZMLs6RLWJa_rOJ8XZ7PoHFU13-zeJkOMDnvK-ZaFa5s,5991
-statement_extractor/cli.py,sha256=NIGCpqcnzF42B16RCiSu4kN0RlnVne2ZAT8341Znt1g,8558
-statement_extractor/extractor.py,sha256=r2gcCfZT43Q8STPuzaXmhbjWXTAs4JwMeAtCjQxlsIQ,25870
-statement_extractor/models.py,sha256=IE3TyIiOl2CINPMroQnGT12rSeQFR0bV3y4BJ79wLmI,10877
-statement_extractor/predicate_comparer.py,sha256=jcuaBi5BYqD3TKoyj3pR9dxtX5ihfDJvjdhEd2LHCwc,26184
-statement_extractor/scoring.py,sha256=xs0SxrV42QNBULQguU1-HhcCc-HnS-ekbcdx7FqWGVk,15663
-corp_extractor-0.2.11.dist-info/METADATA,sha256=D-fs9i9kn4v5bRAHCHxI3cq_6vosNgDCN7uuYwVZztM,13775
-corp_extractor-0.2.11.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-corp_extractor-0.2.11.dist-info/entry_points.txt,sha256=i0iKFqPIusvb-QTQ1zNnFgAqatgVah-jIhahbs5TToQ,115
-corp_extractor-0.2.11.dist-info/RECORD,,

{corp_extractor-0.2.11.dist-info → corp_extractor-0.3.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{corp_extractor-0.2.11.dist-info → corp_extractor-0.3.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

corp-extractor 0.2.11__py3-none-any.whl → 0.3.0__py3-none-any.whl

corp-extractor 0.2.11 (py3-none-any.whl) → 0.3.0 (py3-none-any.whl)