corp-extractor 0.4.0-py3-none-any.whl → 0.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. {corp_extractor-0.4.0.dist-info → corp_extractor-0.5.0.dist-info}/METADATA +181 -64
  2. corp_extractor-0.5.0.dist-info/RECORD +55 -0
  3. statement_extractor/__init__.py +9 -0
  4. statement_extractor/cli.py +446 -17
  5. statement_extractor/data/default_predicates.json +368 -0
  6. statement_extractor/data/statement_taxonomy.json +1182 -0
  7. statement_extractor/extractor.py +1 -23
  8. statement_extractor/gliner_extraction.py +4 -74
  9. statement_extractor/llm.py +255 -0
  10. statement_extractor/models/__init__.py +74 -0
  11. statement_extractor/models/canonical.py +139 -0
  12. statement_extractor/models/entity.py +102 -0
  13. statement_extractor/models/labels.py +191 -0
  14. statement_extractor/models/qualifiers.py +91 -0
  15. statement_extractor/models/statement.py +75 -0
  16. statement_extractor/models.py +4 -1
  17. statement_extractor/pipeline/__init__.py +39 -0
  18. statement_extractor/pipeline/config.py +134 -0
  19. statement_extractor/pipeline/context.py +177 -0
  20. statement_extractor/pipeline/orchestrator.py +447 -0
  21. statement_extractor/pipeline/registry.py +297 -0
  22. statement_extractor/plugins/__init__.py +43 -0
  23. statement_extractor/plugins/base.py +446 -0
  24. statement_extractor/plugins/canonicalizers/__init__.py +17 -0
  25. statement_extractor/plugins/canonicalizers/base.py +9 -0
  26. statement_extractor/plugins/canonicalizers/location.py +219 -0
  27. statement_extractor/plugins/canonicalizers/organization.py +230 -0
  28. statement_extractor/plugins/canonicalizers/person.py +242 -0
  29. statement_extractor/plugins/extractors/__init__.py +13 -0
  30. statement_extractor/plugins/extractors/base.py +9 -0
  31. statement_extractor/plugins/extractors/gliner2.py +536 -0
  32. statement_extractor/plugins/labelers/__init__.py +29 -0
  33. statement_extractor/plugins/labelers/base.py +9 -0
  34. statement_extractor/plugins/labelers/confidence.py +138 -0
  35. statement_extractor/plugins/labelers/relation_type.py +87 -0
  36. statement_extractor/plugins/labelers/sentiment.py +159 -0
  37. statement_extractor/plugins/labelers/taxonomy.py +373 -0
  38. statement_extractor/plugins/labelers/taxonomy_embedding.py +466 -0
  39. statement_extractor/plugins/qualifiers/__init__.py +19 -0
  40. statement_extractor/plugins/qualifiers/base.py +9 -0
  41. statement_extractor/plugins/qualifiers/companies_house.py +174 -0
  42. statement_extractor/plugins/qualifiers/gleif.py +186 -0
  43. statement_extractor/plugins/qualifiers/person.py +221 -0
  44. statement_extractor/plugins/qualifiers/sec_edgar.py +198 -0
  45. statement_extractor/plugins/splitters/__init__.py +13 -0
  46. statement_extractor/plugins/splitters/base.py +9 -0
  47. statement_extractor/plugins/splitters/t5_gemma.py +188 -0
  48. statement_extractor/plugins/taxonomy/__init__.py +13 -0
  49. statement_extractor/plugins/taxonomy/embedding.py +337 -0
  50. statement_extractor/plugins/taxonomy/mnli.py +279 -0
  51. corp_extractor-0.4.0.dist-info/RECORD +0 -12
  52. {corp_extractor-0.4.0.dist-info → corp_extractor-0.5.0.dist-info}/WHEEL +0 -0
  53. {corp_extractor-0.4.0.dist-info → corp_extractor-0.5.0.dist-info}/entry_points.txt +0 -0
statement_extractor/extractor.py
@@ -783,7 +783,7 @@ class StatementExtractor:
 
         if use_gliner_extraction and source_text:
             try:
-                from .gliner_extraction import extract_triple_from_text, extract_triple_by_predicate_split
+                from .gliner_extraction import extract_triple_from_text
 
                 # Get model predicate for fallback/refinement
                 predicate_elem = stmt_elem.find('predicate')
@@ -826,28 +826,6 @@ class StatementExtractor:
                         source_text=source_text,
                         extraction_method=ExtractionMethod.GLINER,
                     ))
-
-                    # Candidate 3: Predicate-split (split source text around predicate)
-                    split_result = extract_triple_by_predicate_split(
-                        source_text=source_text,
-                        predicate=gliner_pred,
-                    )
-                    if split_result:
-                        split_subj, split_pred, split_obj = split_result
-                        # Only add if different from previous candidates
-                        is_different_from_hybrid = (split_subj != subject_text or split_obj != object_text)
-                        is_different_from_gliner = (split_subj != gliner_subj or split_obj != gliner_obj)
-                        if is_different_from_hybrid and is_different_from_gliner:
-                            logger.debug(
-                                f"Adding predicate-split candidate: '{split_subj}' --[{split_pred}]--> '{split_obj}'"
-                            )
-                            statements.append(Statement(
-                                subject=Entity(text=split_subj, type=subject_type),
-                                predicate=split_pred,
-                                object=Entity(text=split_obj, type=object_type),
-                                source_text=source_text,
-                                extraction_method=ExtractionMethod.SPLIT,
-                            ))
                 else:
                     logger.debug(
                         f"GLiNER2 found no predicate for: '{subject_text}' --> '{object_text}'"
statement_extractor/gliner_extraction.py
@@ -132,18 +132,12 @@ def extract_triple_from_text(
         if len(entity) >= len(refined_object):
             refined_object = entity
 
-    # Extract predicate from source text using predicate split
-    predicate_result = extract_triple_by_predicate_split(source_text, model_predicate)
-    if predicate_result:
-        _, extracted_predicate, _ = predicate_result
-    else:
-        extracted_predicate = model_predicate
-
-    if extracted_predicate:
+    # Use model predicate directly (T5-Gemma provides the predicate)
+    if model_predicate:
         logger.debug(
-            f"GLiNER2 extracted (entity-refined): subj='{refined_subject}', pred='{extracted_predicate}', obj='{refined_object}'"
+            f"GLiNER2 extracted (entity-refined): subj='{refined_subject}', pred='{model_predicate}', obj='{refined_object}'"
         )
-        return (refined_subject, extracted_predicate, refined_object)
+        return (refined_subject, model_predicate, refined_object)
 
     return None
 
@@ -155,70 +149,6 @@ def extract_triple_from_text(
     return None
 
 
-def extract_triple_by_predicate_split(
-    source_text: str,
-    predicate: str,
-) -> tuple[str, str, str] | None:
-    """
-    Extract subject and object by splitting the source text around the predicate.
-
-    This is useful when the predicate is known but subject/object boundaries
-    are uncertain. Uses the predicate as an anchor point.
-
-    Args:
-        source_text: The source sentence
-        predicate: The predicate (verb phrase) to split on
-
-    Returns:
-        Tuple of (subject, predicate, object) or None if split fails
-    """
-    if not source_text or not predicate:
-        return None
-
-    # Find the predicate in the source text (case-insensitive)
-    source_lower = source_text.lower()
-    pred_lower = predicate.lower()
-
-    pred_pos = source_lower.find(pred_lower)
-    if pred_pos < 0:
-        # Try finding just the main verb (first word of predicate)
-        main_verb = pred_lower.split()[0] if pred_lower.split() else ""
-        if main_verb and len(main_verb) > 2:
-            pred_pos = source_lower.find(main_verb)
-            if pred_pos >= 0:
-                # Adjust to use the actual predicate length for splitting
-                predicate = main_verb
-
-    if pred_pos < 0:
-        return None
-
-    # Extract subject (text before predicate, trimmed)
-    subject = source_text[:pred_pos].strip()
-
-    # Extract object (text after predicate, trimmed)
-    pred_end = pred_pos + len(predicate)
-    obj = source_text[pred_end:].strip()
-
-    # Clean up: remove trailing punctuation from object
-    obj = obj.rstrip('.,;:!?')
-
-    # Clean up: remove leading articles/prepositions from object if very short
-    obj_words = obj.split()
-    if obj_words and obj_words[0].lower() in ('a', 'an', 'the', 'to', 'of', 'for'):
-        if len(obj_words) > 1:
-            obj = ' '.join(obj_words[1:])
-
-    # Validate: both subject and object should have meaningful content
-    if len(subject) < 2 or len(obj) < 2:
-        return None
-
-    logger.debug(
-        f"Predicate-split extracted: subj='{subject}', pred='{predicate}', obj='{obj}'"
-    )
-
-    return (subject, predicate, obj)
-
-
 def score_entity_content(text: str) -> float:
     """
     Score how entity-like a text is using GLiNER2 entity recognition.
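Note: the removed helper anchored the split on the predicate string. Traced by hand through the deleted code above, it behaved like this on an illustrative sentence (the example input is ours, not from the package):

    # Behaviour of the removed helper (illustrative input):
    extract_triple_by_predicate_split("Apple acquired Beats in 2014", "acquired")
    # -> ("Apple", "acquired", "Beats in 2014")
    # In 0.5.0, extract_triple_from_text() keeps the model-provided predicate instead.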
statement_extractor/llm.py (new file)
@@ -0,0 +1,255 @@
+"""
+LLM module for text generation using local models.
+
+Supports:
+- GGUF models via llama-cpp-python (efficient quantized inference)
+- Transformers models via HuggingFace
+
+Usage:
+    from statement_extractor.llm import LLM
+
+    llm = LLM()  # Uses default Gemma3 12B GGUF
+    response = llm.generate("Your prompt here")
+"""
+
+import logging
+from typing import Optional
+
+logger = logging.getLogger(__name__)
+
+
+class LLM:
+    """
+    LLM wrapper for text generation.
+
+    Automatically selects the best backend:
+    - GGUF models use llama-cpp-python (efficient, no de-quantization)
+    - Other models use HuggingFace transformers
+    """
+
+    def __init__(
+        self,
+        model_id: str = "google/gemma-3-12b-it-qat-q4_0-gguf",
+        gguf_file: Optional[str] = None,
+        n_ctx: int = 8192,
+        use_4bit: bool = True,
+    ):
+        """
+        Initialize the LLM.
+
+        Args:
+            model_id: HuggingFace model ID
+            gguf_file: GGUF filename (auto-detected if model_id ends with -gguf)
+            n_ctx: Context size for GGUF models
+            use_4bit: Use 4-bit quantization for transformers models
+        """
+        self._model_id = model_id
+        self._gguf_file = gguf_file
+        self._n_ctx = n_ctx
+        self._use_4bit = use_4bit
+
+        # Model instances (lazy loaded)
+        self._llama_model = None  # llama-cpp-python
+        self._transformers_model = None  # HuggingFace transformers
+        self._tokenizer = None
+
+        self._load_failed = False
+
+    @property
+    def is_loaded(self) -> bool:
+        """Check if the model is loaded."""
+        return self._llama_model is not None or self._transformers_model is not None
+
+    def _is_gguf_model(self) -> bool:
+        """Check if the model ID is a GGUF model."""
+        return self._model_id.endswith("-gguf") or self._gguf_file is not None
+
+    def _get_gguf_filename(self) -> str:
+        """Get the GGUF filename from the model ID."""
+        if self._gguf_file:
+            return self._gguf_file
+        # Extract filename from model ID like "google/gemma-3-12b-it-qat-q4_0-gguf"
+        # The actual file is "gemma-3-12b-it-q4_0.gguf" (note: "qat" is removed)
+        model_name = self._model_id.split("/")[-1]
+        if model_name.endswith("-gguf"):
+            model_name = model_name[:-5]  # Remove "-gguf" suffix
+        # Remove "-qat" from the name (it's not in the actual filename)
+        model_name = model_name.replace("-qat", "")
+        return model_name + ".gguf"
+
+    def load(self) -> None:
+        """
+        Load the model.
+
+        Raises:
+            RuntimeError: If the model fails to load
+        """
+        if self.is_loaded or self._load_failed:
+            return
+
+        try:
+            logger.debug(f"Loading LLM: {self._model_id}")
+
+            if self._is_gguf_model():
+                self._load_gguf_model()
+            else:
+                self._load_transformers_model()
+
+            logger.debug("LLM loaded successfully")
+
+        except Exception as e:
+            self._load_failed = True
+            error_msg = f"Failed to load LLM ({self._model_id}): {e}"
+            if "llama_cpp" in str(e).lower() or "llama-cpp" in str(e).lower():
+                error_msg += "\n  Install with: pip install llama-cpp-python"
+            if "accelerate" in str(e):
+                error_msg += "\n  Install with: pip install accelerate"
+            raise RuntimeError(error_msg) from e
+
+    def _load_gguf_model(self) -> None:
+        """Load GGUF model using llama-cpp-python."""
+        try:
+            from llama_cpp import Llama
+            from huggingface_hub import hf_hub_download
+        except ImportError as e:
+            raise ImportError(
+                "llama-cpp-python is required for GGUF models. "
+                "Install with: pip install llama-cpp-python"
+            ) from e
+
+        gguf_file = self._get_gguf_filename()
+        logger.debug(f"Loading GGUF model with file: {gguf_file}")
+
+        # Download the GGUF file from HuggingFace
+        model_path = hf_hub_download(
+            repo_id=self._model_id,
+            filename=gguf_file,
+        )
+
+        # Load with llama-cpp-python
+        self._llama_model = Llama(
+            model_path=model_path,
+            n_ctx=self._n_ctx,
+            n_gpu_layers=-1,  # Use all GPU layers (Metal on Mac, CUDA on Linux)
+            verbose=False,
+        )
+
+    def _load_transformers_model(self) -> None:
+        """Load model using HuggingFace transformers."""
+        import torch
+        from transformers import AutoModelForCausalLM, AutoTokenizer
+
+        self._tokenizer = AutoTokenizer.from_pretrained(self._model_id)
+
+        if self._use_4bit:
+            try:
+                from transformers import BitsAndBytesConfig
+                quantization_config = BitsAndBytesConfig(
+                    load_in_4bit=True,
+                    bnb_4bit_compute_dtype=torch.float16,
+                )
+                self._transformers_model = AutoModelForCausalLM.from_pretrained(
+                    self._model_id,
+                    quantization_config=quantization_config,
+                    device_map="auto",
+                )
+            except ImportError:
+                logger.debug("bitsandbytes not available, loading full precision")
+                self._transformers_model = AutoModelForCausalLM.from_pretrained(
+                    self._model_id,
+                    device_map="auto",
+                    torch_dtype=torch.float16,
+                )
+        else:
+            self._transformers_model = AutoModelForCausalLM.from_pretrained(
+                self._model_id,
+                device_map="auto",
+                torch_dtype=torch.float16,
+            )
+
+    def generate(
+        self,
+        prompt: str,
+        max_tokens: int = 100,
+        stop: Optional[list[str]] = None,
+    ) -> str:
+        """
+        Generate text from a prompt.
+
+        Args:
+            prompt: The input prompt
+            max_tokens: Maximum tokens to generate
+            stop: Stop sequences
+
+        Returns:
+            Generated text (not including the prompt)
+        """
+        self.load()
+
+        if self._llama_model is not None:
+            return self._generate_with_llama(prompt, max_tokens, stop)
+        else:
+            return self._generate_with_transformers(prompt, max_tokens)
+
+    def _generate_with_llama(
+        self,
+        prompt: str,
+        max_tokens: int,
+        stop: Optional[list[str]],
+    ) -> str:
+        """Generate response using llama-cpp-python."""
+        output = self._llama_model(
+            prompt,
+            max_tokens=max_tokens,
+            stop=stop or ["\n\n", "</s>"],
+            echo=False,
+        )
+        return output["choices"][0]["text"]
+
+    def _generate_with_transformers(
+        self,
+        prompt: str,
+        max_tokens: int,
+    ) -> str:
+        """Generate response using transformers."""
+        import torch
+
+        inputs = self._tokenizer(prompt, return_tensors="pt").to(self._transformers_model.device)
+
+        with torch.no_grad():
+            outputs = self._transformers_model.generate(
+                **inputs,
+                max_new_tokens=max_tokens,
+                do_sample=False,
+                pad_token_id=self._tokenizer.pad_token_id,
+            )
+
+        return self._tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+
+# Singleton instance for shared use
+_default_llm: Optional[LLM] = None
+
+
+def get_llm(
+    model_id: str = "google/gemma-3-12b-it-qat-q4_0-gguf",
+    **kwargs,
+) -> LLM:
+    """
+    Get or create a shared LLM instance.
+
+    Uses a singleton pattern to avoid loading the model multiple times.
+
+    Args:
+        model_id: HuggingFace model ID
+        **kwargs: Additional arguments passed to LLM constructor
+
+    Returns:
+        LLM instance
+    """
+    global _default_llm
+
+    if _default_llm is None or _default_llm._model_id != model_id:
+        _default_llm = LLM(model_id=model_id, **kwargs)
+
+    return _default_llm
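The module docstring covers the basics; a slightly fuller usage sketch follows (the prompt is illustrative, and the default GGUF backend assumes llama-cpp-python is installed):

    from statement_extractor.llm import LLM, get_llm

    # Default backend: Gemma 3 12B QAT GGUF served via llama-cpp-python
    llm = LLM()
    text = llm.generate("List one fact about Apple Inc.", max_tokens=50)

    # get_llm() caches one instance per model_id, so the weights load only once
    assert get_llm() is get_llm()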
statement_extractor/models/__init__.py (new file)
@@ -0,0 +1,74 @@
+"""
+Data models for the extraction pipeline.
+
+This module contains all Pydantic models used throughout the pipeline stages:
+- Stage 1 (Splitting): RawTriple
+- Stage 2 (Extraction): ExtractedEntity, PipelineStatement
+- Stage 3 (Qualification): EntityQualifiers, QualifiedEntity
+- Stage 4 (Canonicalization): CanonicalMatch, CanonicalEntity
+- Stage 5 (Labeling): StatementLabel, LabeledStatement
+
+It also re-exports all models from the original models.py for backward compatibility.
+"""
+
+# Import from the original models.py file (now a sibling at the same level)
+# We need to import these BEFORE the local modules to avoid circular imports
+import sys
+import importlib.util
+from pathlib import Path
+
+# Manually load the old models.py to avoid conflict with this package
+_models_py_path = Path(__file__).parent.parent / "models.py"
+if _models_py_path.exists():
+    _spec = importlib.util.spec_from_file_location("_old_models", _models_py_path)
+    _old_models = importlib.util.module_from_spec(_spec)
+    _spec.loader.exec_module(_old_models)
+
+    # Re-export everything from the old models
+    Entity = _old_models.Entity
+    ExtractionMethod = _old_models.ExtractionMethod
+    Statement = _old_models.Statement
+    ExtractionResult = _old_models.ExtractionResult
+    PredicateMatch = _old_models.PredicateMatch
+    PredicateTaxonomy = _old_models.PredicateTaxonomy
+    PredicateComparisonConfig = _old_models.PredicateComparisonConfig
+    ScoringConfig = _old_models.ScoringConfig
+    ExtractionOptions = _old_models.ExtractionOptions
+
+    # Use EntityType from old models
+    EntityType = _old_models.EntityType
+else:
+    # Fallback: define locally if old models.py doesn't exist
+    from .entity import EntityType
+
+# New pipeline models
+from .entity import ExtractedEntity
+from .statement import RawTriple, PipelineStatement
+from .qualifiers import EntityQualifiers, QualifiedEntity
+from .canonical import CanonicalMatch, CanonicalEntity
+from .labels import StatementLabel, LabeledStatement, TaxonomyResult
+
+__all__ = [
+    # Re-exported from original models.py (backward compatibility)
+    "Entity",
+    "EntityType",
+    "ExtractionMethod",
+    "Statement",
+    "ExtractionResult",
+    "PredicateMatch",
+    "PredicateTaxonomy",
+    "PredicateComparisonConfig",
+    "ScoringConfig",
+    "ExtractionOptions",
+    # New pipeline models
+    "ExtractedEntity",
+    "RawTriple",
+    "PipelineStatement",
+    "EntityQualifiers",
+    "QualifiedEntity",
+    "CanonicalMatch",
+    "CanonicalEntity",
+    "StatementLabel",
+    "LabeledStatement",
+    "TaxonomyResult",
+]
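With the shim above, legacy and new imports both resolve from statement_extractor.models (a minimal sketch; every name below appears in __all__):

    # Legacy imports keep working via the _old_models shim
    from statement_extractor.models import Entity, Statement, ExtractionResult

    # New pipeline models share the same namespace
    from statement_extractor.models import RawTriple, PipelineStatement, CanonicalEntity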
statement_extractor/models/canonical.py (new file)
@@ -0,0 +1,139 @@
+"""
+Canonical models for the extraction pipeline.
+
+CanonicalMatch: Result of matching to a canonical form
+CanonicalEntity: Entity with canonical form from Stage 4
+"""
+
+from typing import Optional
+
+from pydantic import BaseModel, Field
+
+from .qualifiers import QualifiedEntity
+
+
+class CanonicalMatch(BaseModel):
+    """
+    Result of matching an entity to its canonical form in Stage 4.
+
+    Contains information about how the match was made and confidence level.
+    """
+    canonical_id: Optional[str] = Field(
+        None,
+        description="ID in canonical database (e.g., LEI, Wikidata QID)"
+    )
+    canonical_name: Optional[str] = Field(
+        None,
+        description="Canonical name/label"
+    )
+    match_method: str = Field(
+        ...,
+        description="How the match was made: 'identifier', 'name_exact', 'name_fuzzy', 'llm_verified'"
+    )
+    match_confidence: float = Field(
+        default=1.0,
+        ge=0.0,
+        le=1.0,
+        description="Confidence in the canonical match"
+    )
+    match_details: Optional[dict] = Field(
+        None,
+        description="Additional details about the match (e.g., fuzzy score, LLM reasoning)"
+    )
+
+    def is_high_confidence(self, threshold: float = 0.85) -> bool:
+        """Check if this is a high-confidence match."""
+        return self.match_confidence >= threshold
+
+
+class CanonicalEntity(BaseModel):
+    """
+    An entity with canonical form from Stage 4 (Canonicalization).
+
+    Contains the qualified entity plus its canonical match (if found)
+    and a fully qualified name (FQN) for display.
+    """
+    entity_ref: str = Field(..., description="Reference to the original ExtractedEntity")
+    qualified_entity: QualifiedEntity = Field(
+        ...,
+        description="The qualified entity from Stage 3"
+    )
+    canonical_match: Optional[CanonicalMatch] = Field(
+        None,
+        description="Canonical match if found"
+    )
+    fqn: str = Field(
+        ...,
+        description="Fully qualified name, e.g., 'Tim Cook (CEO, Apple Inc)'"
+    )
+
+    @classmethod
+    def from_qualified(
+        cls,
+        qualified: QualifiedEntity,
+        canonical_match: Optional[CanonicalMatch] = None,
+        fqn: Optional[str] = None,
+    ) -> "CanonicalEntity":
+        """Create a CanonicalEntity from a QualifiedEntity."""
+        if fqn is None:
+            # Generate default FQN from qualifiers
+            fqn = cls._generate_fqn(qualified, canonical_match)
+
+        return cls(
+            entity_ref=qualified.entity_ref,
+            qualified_entity=qualified,
+            canonical_match=canonical_match,
+            fqn=fqn,
+        )
+
+    @staticmethod
+    def _generate_fqn(
+        qualified: QualifiedEntity,
+        canonical_match: Optional[CanonicalMatch] = None
+    ) -> str:
+        """
+        Generate a fully qualified name from qualifiers.
+
+        Examples:
+        - PERSON with role+org: "Tim Cook (CEO, Apple Inc)"
+        - ORG with canonical: "Apple Inc (AAPL)"
+        - PERSON with no qualifiers: "Tim Cook"
+        """
+        # Use canonical name if available, otherwise fall back to original text
+        if canonical_match and canonical_match.canonical_name:
+            base_name = canonical_match.canonical_name
+        else:
+            base_name = qualified.original_text
+
+        qualifiers = qualified.qualifiers
+        parts = []
+        seen = set()  # Track seen values to avoid duplicates
+
+        def add_part(value: str) -> None:
+            """Add a part if not already seen (case-insensitive)."""
+            if value and value.lower() not in seen:
+                parts.append(value)
+                seen.add(value.lower())
+
+        # Add role for PERSON entities
+        if qualifiers.role:
+            add_part(qualifiers.role)
+
+        # Add organization for PERSON entities
+        if qualifiers.org:
+            add_part(qualifiers.org)
+
+        # Add ticker for ORG entities
+        if "ticker" in qualifiers.identifiers:
+            add_part(qualifiers.identifiers["ticker"])
+
+        # Add jurisdiction if relevant
+        if qualifiers.jurisdiction and not qualifiers.org:
+            add_part(qualifiers.jurisdiction)
+
+        if parts:
+            return f"{base_name} ({', '.join(parts)})"
+        return base_name
+
+    class Config:
+        frozen = False  # Allow modification during pipeline stages
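A short sketch of how the FQN falls out of _generate_fqn, following the docstring examples above (the `qualified` variable is assumed to be a Stage 3 QualifiedEntity for "Tim Cook" with role and org set; its construction is not shown in this diff):

    # `qualified` assumed: QualifiedEntity for "Tim Cook", role="CEO", org="Apple Inc"
    entity = CanonicalEntity.from_qualified(qualified)
    print(entity.fqn)  # -> "Tim Cook (CEO, Apple Inc)"

    # With a canonical match, the canonical name replaces the surface text
    match = CanonicalMatch(match_method="name_exact", canonical_name="Timothy D. Cook")
    entity = CanonicalEntity.from_qualified(qualified, canonical_match=match)
    print(entity.fqn)  # -> "Timothy D. Cook (CEO, Apple Inc)"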