corp-extractor 0.2.11__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported public registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in that registry.
- {corp_extractor-0.2.11.dist-info → corp_extractor-0.4.0.dist-info}/METADATA +140 -33
- corp_extractor-0.4.0.dist-info/RECORD +12 -0
- statement_extractor/__init__.py +3 -1
- statement_extractor/cli.py +20 -0
- statement_extractor/extractor.py +312 -22
- statement_extractor/gliner_extraction.py +288 -0
- statement_extractor/models.py +33 -1
- statement_extractor/scoring.py +108 -90
- corp_extractor-0.2.11.dist-info/RECORD +0 -11
- {corp_extractor-0.2.11.dist-info → corp_extractor-0.4.0.dist-info}/WHEEL +0 -0
- {corp_extractor-0.2.11.dist-info → corp_extractor-0.4.0.dist-info}/entry_points.txt +0 -0
statement_extractor/gliner_extraction.py
ADDED
@@ -0,0 +1,288 @@
+"""
+GLiNER2-based triple extraction.
+
+Uses GLiNER2 for relation extraction and entity recognition to extract
+subject, predicate, and object from source text. T5-Gemma model provides
+triple structure and coreference resolution, while GLiNER2 handles
+linguistic analysis.
+
+The GLiNER2 model is loaded automatically on first use.
+"""
+
+import logging
+from typing import Optional
+
+logger = logging.getLogger(__name__)
+
+# Lazy-loaded GLiNER2 model
+_model = None
+
+
+def _get_model():
+    """
+    Lazy-load the GLiNER2 model.
+
+    Uses the base model (205M parameters) which is CPU-optimized.
+    """
+    global _model
+    if _model is None:
+        from gliner2 import GLiNER2
+
+        logger.info("Loading GLiNER2 model 'fastino/gliner2-base-v1'...")
+        _model = GLiNER2.from_pretrained("fastino/gliner2-base-v1")
+        logger.debug("GLiNER2 model loaded")
+    return _model
+
+
+def extract_triple_from_text(
+    source_text: str,
+    model_subject: str,
+    model_object: str,
+    model_predicate: str,
+    predicates: Optional[list[str]] = None,
+) -> tuple[str, str, str] | None:
+    """
+    Extract subject, predicate, object from source text using GLiNER2.
+
+    Returns a GLiNER2-based triple that can be added to the candidate pool
+    alongside the model's triple. The existing scoring/dedup logic will
+    pick the best one.
+
+    Args:
+        source_text: The source sentence to analyze
+        model_subject: Subject from T5-Gemma (used for matching and fallback)
+        model_object: Object from T5-Gemma (used for matching and fallback)
+        model_predicate: Predicate from T5-Gemma (used when no predicates provided)
+        predicates: Optional list of predefined relation types to extract
+
+    Returns:
+        Tuple of (subject, predicate, object) from GLiNER2, or None if extraction fails
+    """
+    if not source_text:
+        return None
+
+    try:
+        model = _get_model()
+
+        if predicates:
+            # Use relation extraction with predefined predicates
+            result = model.extract_relations(source_text, predicates)
+
+            # Find best matching relation
+            relation_data = result.get("relation_extraction", {})
+            best_match = None
+            best_confidence = 0.0
+
+            for rel_type, relations in relation_data.items():
+                for rel in relations:
+                    # Handle both tuple format and dict format
+                    if isinstance(rel, tuple):
+                        head, tail = rel
+                        confidence = 1.0
+                    else:
+                        head = rel.get("head", {}).get("text", "")
+                        tail = rel.get("tail", {}).get("text", "")
+                        confidence = min(
+                            rel.get("head", {}).get("confidence", 0.5),
+                            rel.get("tail", {}).get("confidence", 0.5)
+                        )
+
+                    # Score based on match with model hints
+                    score = confidence
+                    if model_subject.lower() in head.lower() or head.lower() in model_subject.lower():
+                        score += 0.2
+                    if model_object.lower() in tail.lower() or tail.lower() in model_object.lower():
+                        score += 0.2
+
+                    if score > best_confidence:
+                        best_confidence = score
+                        best_match = (head, rel_type, tail)
+
+            if best_match:
+                logger.debug(
+                    f"GLiNER2 extracted (relation): subj='{best_match[0]}', pred='{best_match[1]}', obj='{best_match[2]}'"
+                )
+                return best_match
+
+        else:
+            # No predicate list provided - use GLiNER2 for entity extraction
+            # and extract predicate from source text using the model's hint
+
+            # Extract entities to refine subject/object boundaries
+            entity_types = [
+                "person", "organization", "company", "location", "city", "country",
+                "product", "event", "date", "money", "quantity"
+            ]
+            result = model.extract_entities(source_text, entity_types)
+            entities = result.get("entities", {})
+
+            # Find entities that match model subject/object
+            refined_subject = model_subject
+            refined_object = model_object
+
+            for entity_type, entity_list in entities.items():
+                for entity in entity_list:
+                    entity_lower = entity.lower()
+                    # Check if this entity matches or contains the model's subject/object
+                    if model_subject.lower() in entity_lower or entity_lower in model_subject.lower():
+                        # Use the entity text if it's more complete
+                        if len(entity) >= len(refined_subject):
+                            refined_subject = entity
+                    if model_object.lower() in entity_lower or entity_lower in model_object.lower():
+                        if len(entity) >= len(refined_object):
+                            refined_object = entity
+
+            # Extract predicate from source text using predicate split
+            predicate_result = extract_triple_by_predicate_split(source_text, model_predicate)
+            if predicate_result:
+                _, extracted_predicate, _ = predicate_result
+            else:
+                extracted_predicate = model_predicate
+
+            if extracted_predicate:
+                logger.debug(
+                    f"GLiNER2 extracted (entity-refined): subj='{refined_subject}', pred='{extracted_predicate}', obj='{refined_object}'"
+                )
+                return (refined_subject, extracted_predicate, refined_object)
+
+        return None
+
+    except ImportError as e:
+        logger.warning(f"GLiNER2 not installed: {e}")
+        return None
+    except Exception as e:
+        logger.debug(f"GLiNER2 extraction failed: {e}")
+        return None
+
+
+def extract_triple_by_predicate_split(
+    source_text: str,
+    predicate: str,
+) -> tuple[str, str, str] | None:
+    """
+    Extract subject and object by splitting the source text around the predicate.
+
+    This is useful when the predicate is known but subject/object boundaries
+    are uncertain. Uses the predicate as an anchor point.
+
+    Args:
+        source_text: The source sentence
+        predicate: The predicate (verb phrase) to split on
+
+    Returns:
+        Tuple of (subject, predicate, object) or None if split fails
+    """
+    if not source_text or not predicate:
+        return None
+
+    # Find the predicate in the source text (case-insensitive)
+    source_lower = source_text.lower()
+    pred_lower = predicate.lower()
+
+    pred_pos = source_lower.find(pred_lower)
+    if pred_pos < 0:
+        # Try finding just the main verb (first word of predicate)
+        main_verb = pred_lower.split()[0] if pred_lower.split() else ""
+        if main_verb and len(main_verb) > 2:
+            pred_pos = source_lower.find(main_verb)
+            if pred_pos >= 0:
+                # Adjust to use the actual predicate length for splitting
+                predicate = main_verb
+
+    if pred_pos < 0:
+        return None
+
+    # Extract subject (text before predicate, trimmed)
+    subject = source_text[:pred_pos].strip()
+
+    # Extract object (text after predicate, trimmed)
+    pred_end = pred_pos + len(predicate)
+    obj = source_text[pred_end:].strip()
+
+    # Clean up: remove trailing punctuation from object
+    obj = obj.rstrip('.,;:!?')
+
+    # Clean up: remove leading articles/prepositions from object if very short
+    obj_words = obj.split()
+    if obj_words and obj_words[0].lower() in ('a', 'an', 'the', 'to', 'of', 'for'):
+        if len(obj_words) > 1:
+            obj = ' '.join(obj_words[1:])
+
+    # Validate: both subject and object should have meaningful content
+    if len(subject) < 2 or len(obj) < 2:
+        return None
+
+    logger.debug(
+        f"Predicate-split extracted: subj='{subject}', pred='{predicate}', obj='{obj}'"
+    )
+
+    return (subject, predicate, obj)
+
+
+def score_entity_content(text: str) -> float:
+    """
+    Score how entity-like a text is using GLiNER2 entity recognition.
+
+    Returns:
+        1.0 - Recognized as a named entity with high confidence
+        0.8 - Recognized as an entity with moderate confidence
+        0.6 - Partially recognized or contains entity-like content
+        0.2 - Not recognized as any entity type
+    """
+    if not text or not text.strip():
+        return 0.2
+
+    try:
+        model = _get_model()
+
+        # Check if text is recognized as common entity types
+        entity_types = [
+            "person", "organization", "company", "location", "city", "country",
+            "product", "event", "date", "money", "quantity"
+        ]
+
+        result = model.extract_entities(
+            text,
+            entity_types,
+            include_confidence=True
+        )
+
+        # Result format: {'entities': {'person': [{'text': '...', 'confidence': 0.99}], ...}}
+        entities_dict = result.get("entities", {})
+
+        # Find best matching entity across all types
+        best_confidence = 0.0
+        text_lower = text.lower().strip()
+
+        for entity_type, entity_list in entities_dict.items():
+            for entity in entity_list:
+                if isinstance(entity, dict):
+                    entity_text = entity.get("text", "").lower().strip()
+                    confidence = entity.get("confidence", 0.5)
+                else:
+                    # Fallback for string format
+                    entity_text = str(entity).lower().strip()
+                    confidence = 0.8
+
+                # Check if entity covers most of the input text
+                if entity_text == text_lower:
+                    # Exact match
+                    best_confidence = max(best_confidence, confidence)
+                elif entity_text in text_lower or text_lower in entity_text:
+                    # Partial match - reduce confidence
+                    best_confidence = max(best_confidence, confidence * 0.8)
+
+        if best_confidence >= 0.9:
+            return 1.0
+        elif best_confidence >= 0.7:
+            return 0.8
+        elif best_confidence >= 0.5:
+            return 0.6
+        elif best_confidence > 0:
+            return 0.4
+        else:
+            return 0.2
+
+    except Exception as e:
+        logger.debug(f"Entity scoring failed for '{text}': {e}")
+        return 0.5  # Neutral score on error
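For orientation, a minimal sketch of how the functions added in this new module could be called; the example sentence, the T5-Gemma subject/object/predicate hints, and the predicate list are illustrative assumptions rather than values from the package. extract_triple_from_text returns None when the gliner2 package is not installed or extraction fails, and score_entity_content falls back to a neutral 0.5 on error.

from statement_extractor.gliner_extraction import (
    extract_triple_from_text,
    extract_triple_by_predicate_split,
    score_entity_content,
)

sentence = "Acme Corp acquired Widget Inc in 2021."

# Relation-extraction path: GLiNER2 proposes (head, relation, tail) candidates and the
# one that best matches the model hints is returned
triple = extract_triple_from_text(
    sentence,
    model_subject="Acme Corp",
    model_object="Widget Inc",
    model_predicate="acquired",
    predicates=["acquired", "founded"],
)

# Fallback path: split the sentence around a known predicate
split_triple = extract_triple_by_predicate_split(sentence, "acquired")

# Entity-likeness score (0.2-1.0) used by the updated TripleScorer
subject_score = score_entity_content("Acme Corp")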
statement_extractor/models.py
CHANGED
@@ -24,6 +24,14 @@ class EntityType(str, Enum):
     UNKNOWN = "UNKNOWN"
 
 
+class ExtractionMethod(str, Enum):
+    """Method used to extract the triple components."""
+    HYBRID = "hybrid"  # Model subject/object + GLiNER2 predicate
+    GLINER = "gliner"  # All components from GLiNER2 extraction
+    SPLIT = "split"  # Subject/object from splitting source text around predicate
+    MODEL = "model"  # All components from T5-Gemma model (when GLiNER2 disabled)
+
+
 class Entity(BaseModel):
     """An entity (subject or object) with its text and type."""
     text: str = Field(..., description="The entity text")
@@ -52,12 +60,18 @@ class Statement(BaseModel):
     object: Entity = Field(..., description="The object entity")
     source_text: Optional[str] = Field(None, description="The original text this statement was extracted from")
 
+    # Extraction method tracking
+    extraction_method: ExtractionMethod = Field(
+        default=ExtractionMethod.MODEL,
+        description="Method used to extract this triple (hybrid, spacy, split, or model)"
+    )
+
     # Quality scoring fields
     confidence_score: Optional[float] = Field(
         None,
         ge=0.0,
         le=1.0,
-        description="
+        description="Semantic similarity score (0-1) between source text and reassembled triple"
     )
     evidence_span: Optional[tuple[int, int]] = Field(
         None,
@@ -99,6 +113,7 @@ class Statement(BaseModel):
             object=merged_object,
             predicate=self.predicate,
             source_text=self.source_text,
+            extraction_method=self.extraction_method,
             confidence_score=self.confidence_score,
             evidence_span=self.evidence_span,
             canonical_predicate=self.canonical_predicate,
@@ -116,6 +131,7 @@ class Statement(BaseModel):
             object=self.subject,
             predicate=self.predicate,
             source_text=self.source_text,
+            extraction_method=self.extraction_method,
             confidence_score=self.confidence_score,
             evidence_span=self.evidence_span,
             canonical_predicate=self.canonical_predicate,
@@ -279,6 +295,16 @@ class ExtractionOptions(BaseModel):
         default=True,
         description="Use embedding similarity for predicate deduplication"
     )
+    use_gliner_extraction: bool = Field(
+        default=True,
+        description="Use GLiNER2 for predicate/subject/object extraction (model provides structure + coreference)"
+    )
+
+    # GLiNER2 predicate configuration
+    predicates: Optional[list[str]] = Field(
+        default=None,
+        description="Optional list of predefined predicate types for GLiNER2 relation extraction (e.g., ['works_for', 'founded'])"
+    )
 
     # Verbose logging
     verbose: bool = Field(
@@ -286,5 +312,11 @@ class ExtractionOptions(BaseModel):
         description="Enable verbose logging for debugging"
     )
 
+    # Triple selection
+    all_triples: bool = Field(
+        default=False,
+        description="Keep all candidate triples instead of selecting the highest-scoring one per source"
+    )
+
     class Config:
         arbitrary_types_allowed = True  # Allow Callable type
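As a rough sketch of how the options added above might be set (field names are taken from this diff; other ExtractionOptions fields and the way the options object is passed to the extractor are not shown here, so treat this as illustrative):

from statement_extractor.models import ExtractionOptions

options = ExtractionOptions(
    use_gliner_extraction=True,           # let GLiNER2 refine subject/predicate/object
    predicates=["works_for", "founded"],  # optional predefined relation types for GLiNER2
    all_triples=False,                    # keep only the highest-scoring triple per source
)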
statement_extractor/scoring.py
CHANGED
@@ -2,13 +2,15 @@
 Scoring module for statement extraction quality assessment.
 
 Provides:
-- TripleScorer: Score individual triples
+- TripleScorer: Score individual triples combining semantic similarity and grammatical accuracy
 - BeamScorer: Score and select/merge beams based on quality metrics
 """
 
 import logging
 from typing import Optional
 
+import numpy as np
+
 from .models import ScoringConfig, Statement
 
 logger = logging.getLogger(__name__)
@@ -16,62 +18,126 @@ logger = logging.getLogger(__name__)
 
 class TripleScorer:
     """
-    Score individual triples
-
-
-    -
-    -
-    -
-
+    Score individual triples combining semantic similarity and entity recognition.
+
+    The score is a weighted combination of:
+    - Semantic similarity (50%): Cosine similarity between source text and reassembled triple
+    - Subject entity score (25%): How entity-like the subject is (via GLiNER2)
+    - Object entity score (25%): How entity-like the object is (via GLiNER2)
+
+    Entity scoring (via GLiNER2):
+    - Recognized entity with high confidence: 1.0
+    - Recognized entity with moderate confidence: 0.8
+    - Partially recognized: 0.6
+    - Not recognized: 0.2
     """
 
-    def __init__(
+    def __init__(
+        self,
+        config: Optional[ScoringConfig] = None,
+        device: Optional[str] = None,
+    ):
         self.config = config or ScoringConfig()
 
+        # Auto-detect device
+        if device is None:
+            import torch
+            if torch.cuda.is_available():
+                self.device = "cuda"
+            elif torch.backends.mps.is_available():
+                self.device = "mps"
+            else:
+                self.device = "cpu"
+        else:
+            self.device = device
+
+        # Lazy-loaded embedding model
+        self._model = None
+        self._embedding_model_name = "all-MiniLM-L6-v2"
+
+    def _load_model(self):
+        """Load sentence-transformers model lazily."""
+        if self._model is not None:
+            return
+
+        from sentence_transformers import SentenceTransformer
+
+        logger.debug(f"Loading embedding model: {self._embedding_model_name} on {self.device}")
+        self._model = SentenceTransformer(self._embedding_model_name, device=self.device)
+        logger.debug(f"Embedding model loaded on {self.device}")
+
+    def _compute_embeddings(self, texts: list[str]) -> np.ndarray:
+        """Compute embeddings for a list of texts."""
+        self._load_model()
+        return self._model.encode(texts, convert_to_numpy=True)
+
+    def _cosine_similarity(self, vec1: np.ndarray, vec2: np.ndarray) -> float:
+        """Compute cosine similarity between two vectors."""
+        dot = np.dot(vec1, vec2)
+        norm1 = np.linalg.norm(vec1)
+        norm2 = np.linalg.norm(vec2)
+        if norm1 == 0 or norm2 == 0:
+            return 0.0
+        return float(dot / (norm1 * norm2))
+
+    def _score_noun_content(self, text: str) -> float:
+        """
+        Score how entity-like a text is using GLiNER2 entity recognition.
+
+        Returns:
+            1.0 - Recognized as a named entity with high confidence
+            0.8 - Recognized as an entity with moderate confidence
+            0.6 - Partially recognized or contains entity-like content
+            0.2 - Not recognized as any entity type
+        """
+        if not text or not text.strip():
+            return 0.2
+
+        try:
+            from .gliner_extraction import score_entity_content
+            return score_entity_content(text)
+        except Exception as e:
+            logger.debug(f"Entity scoring failed for '{text}': {e}")
+            return 0.5  # Neutral score on error
+
     def score_triple(self, statement: Statement, source_text: str) -> float:
         """
-        Score a triple's
+        Score a triple's quality (0-1) combining semantic similarity and grammatical accuracy.
+
+        The score is a weighted combination of:
+        - Semantic similarity (50%): How well the triple captures the source meaning
+        - Subject noun score (25%): Grammatical quality of subject
+        - Object noun score (25%): Grammatical quality of object
 
-        Higher scores indicate better
+        Higher scores indicate better overall quality.
         """
-        if
+        # Use statement's source_text if available, otherwise use provided source_text
+        reference_text = statement.source_text or source_text
+        if not reference_text:
             logger.debug(f"  No source text, returning neutral score 0.5")
             return 0.5  # Neutral score if no source text
 
-
-
-
-        #
-
-
-
-
-
-
-        score += 0.3 * (1.0 if object_found else 0.0)
-        weights_sum += 0.3
-
-        # Check predicate has lexical trigger (weight: 0.2)
-        predicate_grounded = self._predicate_has_trigger(statement.predicate, source_text)
-        score += 0.2 * (1.0 if predicate_grounded else 0.0)
-        weights_sum += 0.2
-
-        # Check proximity - subject and object in same/nearby region (weight: 0.2)
-        proximity_score = 0.0
-        if subject_found and object_found:
-            proximity_score = self._compute_proximity(
-                statement.subject.text,
-                statement.object.text,
-                source_text
-            )
-        score += 0.2 * proximity_score
-        weights_sum += 0.2
+        # Reassemble the triple
+        reassembled = f"{statement.subject.text} {statement.predicate} {statement.object.text}"
+
+        # Compute semantic similarity
+        embeddings = self._compute_embeddings([reference_text, reassembled])
+        semantic_similarity = self._cosine_similarity(embeddings[0], embeddings[1])
+
+        # Compute grammatical scores for subject and object
+        subject_noun_score = self._score_noun_content(statement.subject.text)
+        object_noun_score = self._score_noun_content(statement.object.text)
 
-
+        # Weighted combination: 50% semantic, 25% subject, 25% object
+        final_score = (
+            semantic_similarity * 0.5 +
+            subject_noun_score * 0.25 +
+            object_noun_score * 0.25
+        )
 
         logger.debug(
             f"  Score for '{statement.subject.text}' --[{statement.predicate}]--> '{statement.object.text}': "
-            f"{final_score:.
+            f"{final_score:.3f} (semantic={semantic_similarity:.2f}, subj_noun={subject_noun_score:.2f}, obj_noun={object_noun_score:.2f})"
         )
 
         return final_score
@@ -115,54 +181,6 @@ class TripleScorer:
 
         return None
 
-    def _text_appears_in(self, text: str, source: str) -> bool:
-        """Check if text appears in source (case-insensitive)."""
-        return text.lower() in source.lower()
-
-    def _predicate_has_trigger(self, predicate: str, source: str) -> bool:
-        """Check if predicate has a lexical trigger in source."""
-        # Extract main verb/word from predicate
-        words = predicate.lower().split()
-        source_lower = source.lower()
-
-        # Check if any predicate word appears in source
-        for word in words:
-            if len(word) > 2 and word in source_lower:
-                return True
-        return False
-
-    def _compute_proximity(
-        self,
-        subject_text: str,
-        object_text: str,
-        source: str
-    ) -> float:
-        """
-        Compute proximity score (0-1) based on distance between subject and object.
-
-        Returns 1.0 if same sentence, decreasing with distance.
-        """
-        source_lower = source.lower()
-        subj_pos = source_lower.find(subject_text.lower())
-        obj_pos = source_lower.find(object_text.lower())
-
-        if subj_pos < 0 or obj_pos < 0:
-            return 0.0
-
-        # Check if in same sentence
-        start = min(subj_pos, obj_pos)
-        end = max(subj_pos, obj_pos)
-        region = source[start:end]
-
-        # If no sentence boundary between them, high proximity
-        if '.' not in region and '!' not in region and '?' not in region:
-            return 1.0
-
-        # Otherwise, score decreases with distance
-        # Assume ~100 chars per sentence on average
-        sentence_distance = region.count('.') + region.count('!') + region.count('?')
-        return max(0.0, 1.0 - (sentence_distance * 0.2))
-
     def _extend_to_sentence(
         self,
         source: str,
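To make the new weighting concrete with illustrative numbers: a triple whose reassembled text has a semantic similarity of 0.90 to the source, with a subject entity score of 0.8 and an object entity score of 0.6, would score 0.90 × 0.5 + 0.8 × 0.25 + 0.6 × 0.25 = 0.80; if the subject were not recognized as an entity at all (0.2), the same triple would drop to 0.90 × 0.5 + 0.2 × 0.25 + 0.6 × 0.25 = 0.65.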
corp_extractor-0.2.11.dist-info/RECORD
DELETED
@@ -1,11 +0,0 @@
-statement_extractor/__init__.py,sha256=MIZgn-lD9-XGJapzdyYxMhEJFRrTzftbRklrhwA4e8w,2967
-statement_extractor/canonicalization.py,sha256=ZMLs6RLWJa_rOJ8XZ7PoHFU13-zeJkOMDnvK-ZaFa5s,5991
-statement_extractor/cli.py,sha256=NIGCpqcnzF42B16RCiSu4kN0RlnVne2ZAT8341Znt1g,8558
-statement_extractor/extractor.py,sha256=r2gcCfZT43Q8STPuzaXmhbjWXTAs4JwMeAtCjQxlsIQ,25870
-statement_extractor/models.py,sha256=IE3TyIiOl2CINPMroQnGT12rSeQFR0bV3y4BJ79wLmI,10877
-statement_extractor/predicate_comparer.py,sha256=jcuaBi5BYqD3TKoyj3pR9dxtX5ihfDJvjdhEd2LHCwc,26184
-statement_extractor/scoring.py,sha256=xs0SxrV42QNBULQguU1-HhcCc-HnS-ekbcdx7FqWGVk,15663
-corp_extractor-0.2.11.dist-info/METADATA,sha256=D-fs9i9kn4v5bRAHCHxI3cq_6vosNgDCN7uuYwVZztM,13775
-corp_extractor-0.2.11.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-corp_extractor-0.2.11.dist-info/entry_points.txt,sha256=i0iKFqPIusvb-QTQ1zNnFgAqatgVah-jIhahbs5TToQ,115
-corp_extractor-0.2.11.dist-info/RECORD,,
{corp_extractor-0.2.11.dist-info → corp_extractor-0.4.0.dist-info}/WHEEL
File without changes
{corp_extractor-0.2.11.dist-info → corp_extractor-0.4.0.dist-info}/entry_points.txt
File without changes