corp-extractor 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.3.0.dist-info → corp_extractor-0.5.0.dist-info}/METADATA +235 -96
- corp_extractor-0.5.0.dist-info/RECORD +55 -0
- statement_extractor/__init__.py +9 -0
- statement_extractor/cli.py +460 -21
- statement_extractor/data/default_predicates.json +368 -0
- statement_extractor/data/statement_taxonomy.json +1182 -0
- statement_extractor/extractor.py +32 -47
- statement_extractor/gliner_extraction.py +218 -0
- statement_extractor/llm.py +255 -0
- statement_extractor/models/__init__.py +74 -0
- statement_extractor/models/canonical.py +139 -0
- statement_extractor/models/entity.py +102 -0
- statement_extractor/models/labels.py +191 -0
- statement_extractor/models/qualifiers.py +91 -0
- statement_extractor/models/statement.py +75 -0
- statement_extractor/models.py +15 -6
- statement_extractor/pipeline/__init__.py +39 -0
- statement_extractor/pipeline/config.py +134 -0
- statement_extractor/pipeline/context.py +177 -0
- statement_extractor/pipeline/orchestrator.py +447 -0
- statement_extractor/pipeline/registry.py +297 -0
- statement_extractor/plugins/__init__.py +43 -0
- statement_extractor/plugins/base.py +446 -0
- statement_extractor/plugins/canonicalizers/__init__.py +17 -0
- statement_extractor/plugins/canonicalizers/base.py +9 -0
- statement_extractor/plugins/canonicalizers/location.py +219 -0
- statement_extractor/plugins/canonicalizers/organization.py +230 -0
- statement_extractor/plugins/canonicalizers/person.py +242 -0
- statement_extractor/plugins/extractors/__init__.py +13 -0
- statement_extractor/plugins/extractors/base.py +9 -0
- statement_extractor/plugins/extractors/gliner2.py +536 -0
- statement_extractor/plugins/labelers/__init__.py +29 -0
- statement_extractor/plugins/labelers/base.py +9 -0
- statement_extractor/plugins/labelers/confidence.py +138 -0
- statement_extractor/plugins/labelers/relation_type.py +87 -0
- statement_extractor/plugins/labelers/sentiment.py +159 -0
- statement_extractor/plugins/labelers/taxonomy.py +373 -0
- statement_extractor/plugins/labelers/taxonomy_embedding.py +466 -0
- statement_extractor/plugins/qualifiers/__init__.py +19 -0
- statement_extractor/plugins/qualifiers/base.py +9 -0
- statement_extractor/plugins/qualifiers/companies_house.py +174 -0
- statement_extractor/plugins/qualifiers/gleif.py +186 -0
- statement_extractor/plugins/qualifiers/person.py +221 -0
- statement_extractor/plugins/qualifiers/sec_edgar.py +198 -0
- statement_extractor/plugins/splitters/__init__.py +13 -0
- statement_extractor/plugins/splitters/base.py +9 -0
- statement_extractor/plugins/splitters/t5_gemma.py +188 -0
- statement_extractor/plugins/taxonomy/__init__.py +13 -0
- statement_extractor/plugins/taxonomy/embedding.py +337 -0
- statement_extractor/plugins/taxonomy/mnli.py +279 -0
- statement_extractor/scoring.py +17 -69
- corp_extractor-0.3.0.dist-info/RECORD +0 -12
- statement_extractor/spacy_extraction.py +0 -386
- {corp_extractor-0.3.0.dist-info → corp_extractor-0.5.0.dist-info}/WHEEL +0 -0
- {corp_extractor-0.3.0.dist-info → corp_extractor-0.5.0.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,466 @@
|
|
|
1
|
+
"""
|
|
2
|
+
EmbeddingTaxonomyLabeler - Classifies statements using embedding similarity.
|
|
3
|
+
|
|
4
|
+
Uses sentence-transformers to embed text and compare to pre-computed label
|
|
5
|
+
embeddings using cosine similarity with sigmoid calibration.
|
|
6
|
+
|
|
7
|
+
This is faster than MNLI but may be less accurate for nuanced classification.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
import logging
|
|
12
|
+
import time
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Optional
|
|
15
|
+
|
|
16
|
+
import numpy as np
|
|
17
|
+
|
|
18
|
+
from ..base import BaseLabelerPlugin, TaxonomySchema, PluginCapability
|
|
19
|
+
from ...pipeline.context import PipelineContext
|
|
20
|
+
from ...models import (
|
|
21
|
+
PipelineStatement,
|
|
22
|
+
CanonicalEntity,
|
|
23
|
+
StatementLabel,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
logger = logging.getLogger(__name__)
|
|
27
|
+
|
|
28
|
+
# Default taxonomy file location (relative to this module):
# <package root>/data/statement_taxonomy.json
DEFAULT_TAXONOMY_PATH = Path(__file__).parent.parent.parent / "data" / "statement_taxonomy.json"

# Default categories to use when callers do not pass an explicit subset.
# NOTE(review): this list mixes broad topics with harm/benefit axes —
# confirm every entry exists as a top-level key in the taxonomy JSON,
# since categories missing from the taxonomy are silently skipped when
# label embeddings are pre-computed.
DEFAULT_CATEGORIES = [
    "environment",
    "society",
    "governance",
    "animals",
    "industry",
    "human_harm",
    "human_benefit",
    "animal_harm",
    "animal_benefit",
    "environment_harm",
    "environment_benefit",
]
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class EmbeddingClassifier:
    """
    Embedding-based classifier using cosine similarity.

    Pre-computes embeddings for all labels and uses dot product
    (on normalized vectors) for fast classification.
    """

    # Calibration parameters to spread out cosine similarity scores:
    # SIMILARITY_THRESHOLD is the normalized similarity that maps to 0.5,
    # CALIBRATION_STEEPNESS controls how sharply scores saturate around it.
    SIMILARITY_THRESHOLD = 0.65
    CALIBRATION_STEEPNESS = 25.0

    def __init__(
        self,
        model_name: str = "all-MiniLM-L6-v2",
        device: Optional[str] = None,
    ):
        """
        Initialize the classifier.

        Args:
            model_name: sentence-transformers model ID
            device: Device to use ('cuda', 'mps', 'cpu', or None for auto)
        """
        self._model_name = model_name
        self._device = device
        self._model = None

        # Pre-computed label embeddings: {category: {label: embedding}}
        self._label_embeddings: dict[str, dict[str, np.ndarray]] = {}

    def _load_model(self) -> None:
        """Lazy-load the embedding model.

        Raises:
            ImportError: if sentence-transformers is not installed.
        """
        if self._model is not None:
            return

        try:
            from sentence_transformers import SentenceTransformer
            import torch

            # Auto-detect device only when none was requested explicitly.
            device = self._device
            if device is None:
                if torch.cuda.is_available():
                    device = "cuda"
                elif torch.backends.mps.is_available():
                    device = "mps"
                else:
                    device = "cpu"

            logger.info(f"Loading embedding model '{self._model_name}' on {device}...")
            self._model = SentenceTransformer(self._model_name, device=device)
            logger.debug("Embedding model loaded")

        except ImportError as e:
            raise ImportError(
                "sentence-transformers is required for embedding classification. "
                "Install with: pip install sentence-transformers"
            ) from e

    def precompute_label_embeddings(
        self,
        taxonomy: dict[str, dict[str, int]],
        categories: Optional[list[str]] = None,
    ) -> None:
        """
        Pre-compute embeddings for all label names.

        Args:
            taxonomy: Taxonomy dict {category: {label: id, ...}, ...}
            categories: Categories to include (default: all). Categories
                missing from *taxonomy* are silently skipped.
        """
        self._load_model()

        start_time = time.perf_counter()
        total_labels = 0

        categories_to_process = categories or list(taxonomy.keys())

        for category in categories_to_process:
            if category not in taxonomy:
                continue

            label_names = list(taxonomy[category].keys())
            if not label_names:
                continue

            # Batch embed all labels in this category
            embeddings = self._model.encode(label_names, convert_to_numpy=True)

            # Normalize once here so classification can use a plain dot
            # product as cosine similarity (epsilon guards zero vectors).
            self._label_embeddings[category] = {}
            for label_name, embedding in zip(label_names, embeddings):
                norm = np.linalg.norm(embedding)
                normalized = embedding / (norm + 1e-8)
                self._label_embeddings[category][label_name] = normalized.astype(np.float32)
                total_labels += 1

        elapsed = time.perf_counter() - start_time
        logger.info(
            f"Pre-computed embeddings for {total_labels} labels "
            f"across {len(self._label_embeddings)} categories in {elapsed:.2f}s"
        )

    def _embed_normalized(self, text: str) -> np.ndarray:
        """Embed *text* and return the L2-normalized vector."""
        embedding = self._model.encode(text, convert_to_numpy=True)
        return embedding / (np.linalg.norm(embedding) + 1e-8)

    def _calibrate_score(self, raw_similarity: float) -> float:
        """
        Apply sigmoid calibration to amplify score differences.

        Cosine similarities cluster in a narrow range (0.5-0.9).
        This transformation spreads them out for better discrimination.

        Returns:
            A plain Python float in (0, 1).
        """
        # Normalize from [-1, 1] to [0, 1]
        normalized = (raw_similarity + 1) / 2

        # Apply sigmoid transformation. float() prevents a numpy scalar
        # from leaking into callers whose annotations expect float.
        exponent = -self.CALIBRATION_STEEPNESS * (normalized - self.SIMILARITY_THRESHOLD)
        return float(1.0 / (1.0 + np.exp(exponent)))

    def classify(
        self,
        text: str,
        categories: Optional[list[str]] = None,
        top_k: int = 5,
        min_score: float = 0.3,
    ) -> list[tuple[str, str, float]]:
        """
        Classify text against pre-computed label embeddings.

        Args:
            text: Text to classify
            categories: Categories to search (default: all pre-computed)
            top_k: Number of top results to return
            min_score: Minimum calibrated score threshold

        Returns:
            List of (category, label, score) tuples, sorted by score descending

        Raises:
            RuntimeError: if precompute_label_embeddings was never called.
        """
        self._load_model()

        if not self._label_embeddings:
            raise RuntimeError("Label embeddings not pre-computed. Call precompute_label_embeddings first.")

        # Embed input text
        input_normalized = self._embed_normalized(text)

        # Classify against each category
        categories_to_process = categories or list(self._label_embeddings.keys())
        all_results: list[tuple[str, str, float]] = []

        for category in categories_to_process:
            if category not in self._label_embeddings:
                continue

            for label, label_embedding in self._label_embeddings[category].items():
                # Cosine similarity (both vectors are normalized)
                raw_sim = float(np.dot(input_normalized, label_embedding))
                calibrated_score = self._calibrate_score(raw_sim)

                if calibrated_score >= min_score:
                    all_results.append((category, label, calibrated_score))

        # Sort by score descending and return top-k
        all_results.sort(key=lambda x: x[2], reverse=True)
        return all_results[:top_k]

    def classify_hierarchical(
        self,
        text: str,
        top_k_categories: int = 3,
        top_k_labels: int = 3,
        min_score: float = 0.3,
    ) -> tuple[Optional[str], Optional[str], float]:
        """
        Hierarchical classification: find best category, then best label.

        More efficient for very large taxonomies.

        Args:
            text: Text to classify
            top_k_categories: Number of top categories to consider
            top_k_labels: Reserved; currently unused — every label within
                the top categories is scored. Kept for interface stability.
            min_score: Minimum score threshold

        Returns:
            Tuple of (category, label, score) for the best match, or
            (None, None, 0.0) when nothing clears *min_score*.

        Raises:
            RuntimeError: if precompute_label_embeddings was never called.
        """
        self._load_model()

        if not self._label_embeddings:
            raise RuntimeError("Label embeddings not pre-computed.")

        # Embed input text
        input_normalized = self._embed_normalized(text)

        # First, rank categories by their mean raw similarity across labels.
        category_scores: list[tuple[str, float]] = []
        for category, labels in self._label_embeddings.items():
            if not labels:
                continue

            sims = [float(np.dot(input_normalized, emb)) for emb in labels.values()]
            category_scores.append((category, float(np.mean(sims))))

        # Sort categories by average similarity
        category_scores.sort(key=lambda x: x[1], reverse=True)

        # Find best calibrated label within the top categories
        best_result: tuple[Optional[str], Optional[str], float] = (None, None, 0.0)

        for category, _ in category_scores[:top_k_categories]:
            for label, label_embedding in self._label_embeddings[category].items():
                raw_sim = float(np.dot(input_normalized, label_embedding))
                calibrated_score = self._calibrate_score(raw_sim)

                if calibrated_score > best_result[2]:
                    best_result = (category, label, calibrated_score)

        if best_result[0] and best_result[2] >= min_score:
            return best_result

        return (None, None, 0.0)
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
class EmbeddingTaxonomyLabeler(BaseLabelerPlugin):
    """
    Labeler that classifies statements using embedding similarity.

    Faster than MNLI but may be less accurate for nuanced classification.
    Good for high-throughput scenarios.

    The taxonomy is loaded lazily from JSON on first use, and label
    embeddings are pre-computed once per instance (see _get_classifier).
    """

    def __init__(
        self,
        taxonomy_path: Optional[str | Path] = None,
        categories: Optional[list[str]] = None,
        model_name: str = "all-MiniLM-L6-v2",
        use_hierarchical: bool = True,
        top_k_categories: int = 3,
        min_confidence: float = 0.3,
    ):
        """
        Initialize the embedding taxonomy labeler.

        Args:
            taxonomy_path: Path to taxonomy JSON file (default: built-in taxonomy)
            categories: List of categories to use (default: all categories)
            model_name: sentence-transformers model ID
            use_hierarchical: Use hierarchical classification for efficiency
            top_k_categories: Number of top categories to consider in hierarchical mode
            min_confidence: Minimum confidence threshold for returning a label
        """
        self._taxonomy_path = Path(taxonomy_path) if taxonomy_path else DEFAULT_TAXONOMY_PATH
        self._categories = categories or DEFAULT_CATEGORIES
        self._model_name = model_name
        self._use_hierarchical = use_hierarchical
        self._top_k_categories = top_k_categories
        self._min_confidence = min_confidence

        # Lazily-populated state: taxonomy JSON, classifier instance, and a
        # flag so label embeddings are pre-computed exactly once.
        self._taxonomy: Optional[dict[str, dict[str, int]]] = None
        self._classifier: Optional[EmbeddingClassifier] = None
        self._embeddings_computed = False

    @property
    def name(self) -> str:
        return "embedding_taxonomy_labeler"

    @property
    def priority(self) -> int:
        return 45  # Higher priority than MNLI - default taxonomy labeler (faster)

    @property
    def capabilities(self) -> PluginCapability:
        # NOTE(review): LLM_REQUIRED looks odd for a sentence-transformers
        # embedding model — confirm this is the intended capability flag
        # (vs. e.g. a generic "model required" flag) against the base enum.
        return PluginCapability.LLM_REQUIRED | PluginCapability.BATCH_PROCESSING

    @property
    def description(self) -> str:
        return "Classifies statements using embedding similarity (faster than MNLI)"

    @property
    def label_type(self) -> str:
        return "taxonomy_embedding"

    @property
    def taxonomy_schema(self) -> TaxonomySchema:
        """Provide taxonomy schema (for documentation/introspection).

        Triggers a taxonomy file load on first access; only categories in
        self._categories are exposed.
        """
        taxonomy = self._load_taxonomy()
        filtered = {cat: list(labels.keys()) for cat, labels in taxonomy.items() if cat in self._categories}
        return TaxonomySchema(
            label_type=self.label_type,
            values=filtered,
            description="Statement topic classification using embedding similarity",
            scope="statement",
        )

    def _load_taxonomy(self) -> dict[str, dict[str, int]]:
        """Load taxonomy from JSON file (cached after the first load).

        Raises:
            FileNotFoundError: if the taxonomy file does not exist.
        """
        if self._taxonomy is not None:
            return self._taxonomy

        if not self._taxonomy_path.exists():
            raise FileNotFoundError(f"Taxonomy file not found: {self._taxonomy_path}")

        with open(self._taxonomy_path) as f:
            self._taxonomy = json.load(f)

        logger.debug(f"Loaded taxonomy with {len(self._taxonomy)} categories")
        return self._taxonomy

    def _get_classifier(self) -> EmbeddingClassifier:
        """Get or create the embedding classifier.

        Ensures label embeddings are pre-computed exactly once per
        instance before the classifier is handed out.
        """
        if self._classifier is None:
            self._classifier = EmbeddingClassifier(model_name=self._model_name)

        if not self._embeddings_computed:
            taxonomy = self._load_taxonomy()
            self._classifier.precompute_label_embeddings(taxonomy, self._categories)
            self._embeddings_computed = True

        return self._classifier

    def label(
        self,
        statement: PipelineStatement,
        subject_canonical: CanonicalEntity,
        object_canonical: CanonicalEntity,
        context: PipelineContext,
    ) -> Optional[StatementLabel]:
        """
        Classify statement using embedding similarity.

        Checks the pipeline context for a pre-computed classification
        first; when one exists, it is used (or rejected) without running
        the embedding model at all.

        Args:
            statement: The statement to label
            subject_canonical: Canonicalized subject
            object_canonical: Canonicalized object
            context: Pipeline context

        Returns:
            StatementLabel with taxonomy classification, or None if below threshold
        """
        # Check for pre-computed classification
        result = context.get_classification(statement.source_text, self.label_type)
        if result:
            label_value, confidence = result
            if confidence >= self._min_confidence:
                # NOTE(review): this cached path passes confidence through
                # unrounded, while the computed path below rounds to 4 dp —
                # confirm the asymmetry is intentional.
                return StatementLabel(
                    label_type=self.label_type,
                    label_value=label_value,
                    confidence=confidence,
                    labeler=self.name,
                )
            # A cached result below threshold is final: do not re-classify.
            return None

        # Run embedding classification
        try:
            classifier = self._get_classifier()
            text = statement.source_text

            if self._use_hierarchical:
                category, label, confidence = classifier.classify_hierarchical(
                    text,
                    top_k_categories=self._top_k_categories,
                    min_score=self._min_confidence,
                )
                if category and label:
                    full_label = f"{category}:{label}"
                else:
                    return None
            else:
                results = classifier.classify(
                    text,
                    top_k=1,
                    min_score=self._min_confidence,
                )
                if results:
                    category, label, confidence = results[0]
                    full_label = f"{category}:{label}"
                else:
                    return None

            # Get the numeric ID for reproducibility
            label_id = self._get_label_id(category, label)

            return StatementLabel(
                label_type=self.label_type,
                label_value=full_label,
                confidence=round(confidence, 4),
                labeler=self.name,
                metadata={"label_id": label_id, "category": category},
            )

        # Best-effort: any failure (missing model, bad taxonomy file, ...)
        # degrades to "no label" with a warning rather than aborting the
        # pipeline stage.
        except Exception as e:
            logger.warning(f"Embedding taxonomy classification failed: {e}")

        return None

    def _get_label_id(self, category: str, label: str) -> Optional[int]:
        """Get the numeric ID for a label, or None if unknown."""
        taxonomy = self._load_taxonomy()

        if category in taxonomy:
            return taxonomy[category].get(label)

        return None
|
|
462
|
+
|
|
463
|
+
|
|
464
|
+
# Allow importing without decorator for testing.
# NOTE(review): unlike the CompaniesHouse qualifier, EmbeddingTaxonomyLabeler
# carries no registry decorator in this module — confirm registration
# happens elsewhere (both aliases currently bind the same class objects).
EmbeddingTaxonomyLabelerClass = EmbeddingTaxonomyLabeler
EmbeddingClassifierClass = EmbeddingClassifier
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Qualifier plugins for Stage 3 (Qualification).
|
|
3
|
+
|
|
4
|
+
Adds qualifiers and identifiers to entities.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from .base import BaseQualifierPlugin
|
|
8
|
+
from .person import PersonQualifierPlugin
|
|
9
|
+
from .gleif import GLEIFQualifierPlugin
|
|
10
|
+
from .companies_house import CompaniesHouseQualifierPlugin
|
|
11
|
+
from .sec_edgar import SECEdgarQualifierPlugin
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"BaseQualifierPlugin",
|
|
15
|
+
"PersonQualifierPlugin",
|
|
16
|
+
"GLEIFQualifierPlugin",
|
|
17
|
+
"CompaniesHouseQualifierPlugin",
|
|
18
|
+
"SECEdgarQualifierPlugin",
|
|
19
|
+
]
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CompaniesHouseQualifierPlugin - Qualifies UK ORG entities.
|
|
3
|
+
|
|
4
|
+
Uses the UK Companies House API to:
|
|
5
|
+
- Look up company number by name
|
|
6
|
+
- Retrieve company details, jurisdiction, officers
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import logging
|
|
10
|
+
import os
|
|
11
|
+
from typing import Optional
|
|
12
|
+
|
|
13
|
+
from ..base import BaseQualifierPlugin, PluginCapability
|
|
14
|
+
from ...pipeline.context import PipelineContext
|
|
15
|
+
from ...pipeline.registry import PluginRegistry
|
|
16
|
+
from ...models import ExtractedEntity, EntityQualifiers, EntityType
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
# Companies House API base URL
|
|
21
|
+
CH_API_BASE = "https://api.company-information.service.gov.uk"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@PluginRegistry.qualifier
class CompaniesHouseQualifierPlugin(BaseQualifierPlugin):
    """
    Qualifier plugin for UK ORG entities using Companies House API.

    Looks an organization name up against the Companies House search
    endpoint and attaches the resulting company number and jurisdiction.
    Requires COMPANIES_HOUSE_API_KEY environment variable.
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        timeout: int = 10,
        cache_results: bool = True,
    ):
        """
        Initialize the Companies House qualifier.

        Args:
            api_key: Companies House API key (or use COMPANIES_HOUSE_API_KEY env var)
            timeout: API request timeout in seconds
            cache_results: Whether to cache API results
        """
        self._api_key = api_key or os.environ.get("COMPANIES_HOUSE_API_KEY")
        self._timeout = timeout
        self._cache_results = cache_results
        # Keyed by lower-cased, stripped entity text; None marks a miss.
        self._cache: dict[str, Optional[dict]] = {}

    @property
    def name(self) -> str:
        return "companies_house_qualifier"

    @property
    def priority(self) -> int:
        return 20  # Run after GLEIF

    @property
    def capabilities(self) -> PluginCapability:
        return PluginCapability.EXTERNAL_API | PluginCapability.CACHING

    @property
    def description(self) -> str:
        return "Looks up UK company data from Companies House API"

    @property
    def supported_entity_types(self) -> set[EntityType]:
        # Only organizations are meaningful for a company registry.
        return {EntityType.ORG}

    @property
    def supported_identifier_types(self) -> list[str]:
        return ["ch_number"]  # Can lookup by company number

    @property
    def provided_identifier_types(self) -> list[str]:
        return ["ch_number"]  # Provides company number

    def qualify(
        self,
        entity: ExtractedEntity,
        context: PipelineContext,
    ) -> Optional[EntityQualifiers]:
        """
        Qualify an ORG entity with Companies House data.

        Args:
            entity: The ORG entity to qualify
            context: Pipeline context

        Returns:
            EntityQualifiers with company number, or None if not found
        """
        # Guard clauses: wrong entity type or no credentials -> nothing to do.
        if entity.type != EntityType.ORG:
            return None

        if not self._api_key:
            logger.debug("Companies House API key not configured")
            return None

        lookup_key = entity.text.lower().strip()

        # Serve from cache when enabled (a cached None is a remembered miss).
        if self._cache_results and lookup_key in self._cache:
            hit = self._cache[lookup_key]
            return self._data_to_qualifiers(hit) if hit is not None else None

        record = self._search_companies_house(entity.text)

        if self._cache_results:
            self._cache[lookup_key] = record

        return self._data_to_qualifiers(record) if record else None

    def _search_companies_house(self, org_name: str) -> Optional[dict]:
        """Search Companies House API for organization.

        Returns a flat dict of company fields for the first search hit,
        or None on any failure (treated as best-effort).
        """
        try:
            import requests
            from requests.auth import HTTPBasicAuth

            # The API authenticates with the key as basic-auth username.
            response = requests.get(
                f"{CH_API_BASE}/search/companies",
                params={"q": org_name, "items_per_page": 5},
                auth=HTTPBasicAuth(self._api_key, ""),
                timeout=self._timeout,
            )
            response.raise_for_status()
            matches = response.json().get("items", [])

            if matches:
                # Take the first search result as the best match.
                top = matches[0]
                return {
                    "ch_number": top.get("company_number", ""),
                    "title": top.get("title", ""),
                    "company_status": top.get("company_status", ""),
                    "company_type": top.get("company_type", ""),
                    "jurisdiction": "UK",
                    "country": "GB",
                    "address": top.get("address_snippet", ""),
                }

        except ImportError:
            logger.warning("requests library not available for Companies House API")
        except Exception as e:
            logger.debug(f"Companies House API error: {e}")

        return None

    def _data_to_qualifiers(self, data: dict) -> EntityQualifiers:
        """Convert Companies House data to EntityQualifiers."""
        ids: dict[str, str] = {}
        number = data.get("ch_number")
        if number:
            ids["ch_number"] = number

        return EntityQualifiers(
            jurisdiction=data.get("jurisdiction"),
            country=data.get("country"),
            identifiers=ids,
        )
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
# Allow importing without decorator for testing.
# NOTE(review): by this point @PluginRegistry.qualifier has already run;
# this alias only helps if the decorator returns the class unchanged —
# verify against the registry implementation.
CompaniesHouseQualifierPluginClass = CompaniesHouseQualifierPlugin
|