corp-extractor 0.4.0-py3-none-any.whl → 0.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.4.0.dist-info → corp_extractor-0.5.0.dist-info}/METADATA +181 -64
- corp_extractor-0.5.0.dist-info/RECORD +55 -0
- statement_extractor/__init__.py +9 -0
- statement_extractor/cli.py +446 -17
- statement_extractor/data/default_predicates.json +368 -0
- statement_extractor/data/statement_taxonomy.json +1182 -0
- statement_extractor/extractor.py +1 -23
- statement_extractor/gliner_extraction.py +4 -74
- statement_extractor/llm.py +255 -0
- statement_extractor/models/__init__.py +74 -0
- statement_extractor/models/canonical.py +139 -0
- statement_extractor/models/entity.py +102 -0
- statement_extractor/models/labels.py +191 -0
- statement_extractor/models/qualifiers.py +91 -0
- statement_extractor/models/statement.py +75 -0
- statement_extractor/models.py +4 -1
- statement_extractor/pipeline/__init__.py +39 -0
- statement_extractor/pipeline/config.py +134 -0
- statement_extractor/pipeline/context.py +177 -0
- statement_extractor/pipeline/orchestrator.py +447 -0
- statement_extractor/pipeline/registry.py +297 -0
- statement_extractor/plugins/__init__.py +43 -0
- statement_extractor/plugins/base.py +446 -0
- statement_extractor/plugins/canonicalizers/__init__.py +17 -0
- statement_extractor/plugins/canonicalizers/base.py +9 -0
- statement_extractor/plugins/canonicalizers/location.py +219 -0
- statement_extractor/plugins/canonicalizers/organization.py +230 -0
- statement_extractor/plugins/canonicalizers/person.py +242 -0
- statement_extractor/plugins/extractors/__init__.py +13 -0
- statement_extractor/plugins/extractors/base.py +9 -0
- statement_extractor/plugins/extractors/gliner2.py +536 -0
- statement_extractor/plugins/labelers/__init__.py +29 -0
- statement_extractor/plugins/labelers/base.py +9 -0
- statement_extractor/plugins/labelers/confidence.py +138 -0
- statement_extractor/plugins/labelers/relation_type.py +87 -0
- statement_extractor/plugins/labelers/sentiment.py +159 -0
- statement_extractor/plugins/labelers/taxonomy.py +373 -0
- statement_extractor/plugins/labelers/taxonomy_embedding.py +466 -0
- statement_extractor/plugins/qualifiers/__init__.py +19 -0
- statement_extractor/plugins/qualifiers/base.py +9 -0
- statement_extractor/plugins/qualifiers/companies_house.py +174 -0
- statement_extractor/plugins/qualifiers/gleif.py +186 -0
- statement_extractor/plugins/qualifiers/person.py +221 -0
- statement_extractor/plugins/qualifiers/sec_edgar.py +198 -0
- statement_extractor/plugins/splitters/__init__.py +13 -0
- statement_extractor/plugins/splitters/base.py +9 -0
- statement_extractor/plugins/splitters/t5_gemma.py +188 -0
- statement_extractor/plugins/taxonomy/__init__.py +13 -0
- statement_extractor/plugins/taxonomy/embedding.py +337 -0
- statement_extractor/plugins/taxonomy/mnli.py +279 -0
- corp_extractor-0.4.0.dist-info/RECORD +0 -12
- {corp_extractor-0.4.0.dist-info → corp_extractor-0.5.0.dist-info}/WHEEL +0 -0
- {corp_extractor-0.4.0.dist-info → corp_extractor-0.5.0.dist-info}/entry_points.txt +0 -0
statement_extractor/plugins/splitters/t5_gemma.py
@@ -0,0 +1,188 @@
+"""
+T5GemmaSplitter - Stage 1 plugin that wraps the existing StatementExtractor.
+
+Uses T5-Gemma2 model with Diverse Beam Search to generate high-quality
+subject-predicate-object triples from text.
+"""
+
+import logging
+import re
+import xml.etree.ElementTree as ET
+from typing import Optional
+
+from ..base import BaseSplitterPlugin, PluginCapability
+from ...pipeline.context import PipelineContext
+from ...pipeline.registry import PluginRegistry
+from ...models import RawTriple
+
+logger = logging.getLogger(__name__)
+
+
+@PluginRegistry.splitter
+class T5GemmaSplitter(BaseSplitterPlugin):
+    """
+    Splitter plugin that uses T5-Gemma2 for triple extraction.
+
+    Wraps the existing StatementExtractor from extractor.py to produce
+    RawTriple objects for the pipeline.
+    """
+
+    def __init__(
+        self,
+        model_id: Optional[str] = None,
+        device: Optional[str] = None,
+        num_beams: int = 4,
+        diversity_penalty: float = 1.0,
+        max_new_tokens: int = 2048,
+    ):
+        """
+        Initialize the T5Gemma splitter.
+
+        Args:
+            model_id: HuggingFace model ID (defaults to Corp-o-Rate model)
+            device: Device to use (auto-detected if not specified)
+            num_beams: Number of beams for diverse beam search
+            diversity_penalty: Penalty for beam diversity
+            max_new_tokens: Maximum tokens to generate
+        """
+        self._model_id = model_id
+        self._device = device
+        self._num_beams = num_beams
+        self._diversity_penalty = diversity_penalty
+        self._max_new_tokens = max_new_tokens
+        self._extractor = None
+
+    @property
+    def name(self) -> str:
+        return "t5_gemma_splitter"
+
+    @property
+    def priority(self) -> int:
+        return 10  # High priority - primary splitter
+
+    @property
+    def capabilities(self) -> PluginCapability:
+        return PluginCapability.LLM_REQUIRED
+
+    @property
+    def description(self) -> str:
+        return "T5-Gemma2 model for extracting triples using Diverse Beam Search"
+
+    def _get_extractor(self):
+        """Lazy-load the StatementExtractor."""
+        if self._extractor is None:
+            from ...extractor import StatementExtractor
+            # Only pass model_id and device if they were explicitly set
+            kwargs = {}
+            if self._model_id is not None:
+                kwargs["model_id"] = self._model_id
+            if self._device is not None:
+                kwargs["device"] = self._device
+            self._extractor = StatementExtractor(**kwargs)
+        return self._extractor
+
+    def split(
+        self,
+        text: str,
+        context: PipelineContext,
+    ) -> list[RawTriple]:
+        """
+        Split text into raw triples using T5-Gemma2.
+
+        Args:
+            text: Input text to split
+            context: Pipeline context
+
+        Returns:
+            List of RawTriple objects
+        """
+        logger.debug(f"T5GemmaSplitter processing {len(text)} chars")
+
+        # Get options from context if available
+        splitter_options = context.source_metadata.get("splitter_options", {})
+        num_beams = splitter_options.get("num_beams", self._num_beams)
+        diversity_penalty = splitter_options.get("diversity_penalty", self._diversity_penalty)
+        max_new_tokens = splitter_options.get("max_new_tokens", self._max_new_tokens)
+
+        # Create extraction options
+        from ...models import ExtractionOptions as LegacyExtractionOptions
+        options = LegacyExtractionOptions(
+            num_beams=num_beams,
+            diversity_penalty=diversity_penalty,
+            max_new_tokens=max_new_tokens,
+            # Disable GLiNER and dedup - we handle those in later stages
+            use_gliner_extraction=False,
+            embedding_dedup=False,
+            deduplicate=False,
+        )
+
+        # Get raw XML from extractor
+        extractor = self._get_extractor()
+        xml_output = extractor.extract_as_xml(text, options)
+
+        # Parse XML to RawTriple objects
+        raw_triples = self._parse_xml_to_raw_triples(xml_output)
+
+        logger.info(f"T5GemmaSplitter produced {len(raw_triples)} raw triples")
+        return raw_triples
+
+    def _parse_xml_to_raw_triples(self, xml_output: str) -> list[RawTriple]:
+        """Parse XML output into RawTriple objects."""
+        raw_triples = []
+
+        try:
+            root = ET.fromstring(xml_output)
+        except ET.ParseError as e:
+            logger.warning(f"XML parse error: {e}")
+            # Try to repair
+            xml_output = self._repair_xml(xml_output)
+            try:
+                root = ET.fromstring(xml_output)
+            except ET.ParseError:
+                logger.error("XML repair failed")
+                return raw_triples
+
+        if root.tag != "statements":
+            logger.warning(f"Unexpected root tag: {root.tag}")
+            return raw_triples
+
+        for stmt_elem in root.findall("stmt"):
+            try:
+                subject_elem = stmt_elem.find("subject")
+                predicate_elem = stmt_elem.find("predicate")
+                object_elem = stmt_elem.find("object")
+                text_elem = stmt_elem.find("text")
+
+                subject_text = subject_elem.text.strip() if subject_elem is not None and subject_elem.text else ""
+                predicate_text = predicate_elem.text.strip() if predicate_elem is not None and predicate_elem.text else ""
+                object_text = object_elem.text.strip() if object_elem is not None and object_elem.text else ""
+                source_text = text_elem.text.strip() if text_elem is not None and text_elem.text else ""
+
+                if subject_text and object_text and source_text:
+                    raw_triples.append(RawTriple(
+                        subject_text=subject_text,
+                        predicate_text=predicate_text,
+                        object_text=object_text,
+                        source_sentence=source_text,
+                    ))
+                else:
+                    logger.debug(f"Skipping incomplete triple: s={subject_text}, p={predicate_text}, o={object_text}")
+
+            except Exception as e:
+                logger.warning(f"Error parsing stmt element: {e}")
+                continue
+
+        return raw_triples
+
+    def _repair_xml(self, xml_string: str) -> str:
+        """Attempt to repair common XML syntax errors."""
+        # Use the repair function from extractor.py
+        from ...extractor import repair_xml
+        repaired, repairs = repair_xml(xml_string)
+        if repairs:
+            logger.debug(f"XML repairs: {', '.join(repairs)}")
+        return repaired
+
+
+# Allow importing without decorator for testing
+T5GemmaSplitterClass = T5GemmaSplitter
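As a reading aid, a minimal sketch of how the new splitter plugin might be driven on its own, based only on the signatures in the hunk above. The way PipelineContext is built here (a source_metadata dict carrying a "splitter_options" key) is an assumption; in normal use the pipeline orchestrator added in this release presumably constructs the context itself.

# Hypothetical usage sketch - not shipped in the package.
from statement_extractor.pipeline.context import PipelineContext
from statement_extractor.plugins.splitters.t5_gemma import T5GemmaSplitter

splitter = T5GemmaSplitter(num_beams=8, diversity_penalty=1.5)
# Assumption: PipelineContext accepts a source_metadata mapping, which split() reads.
context = PipelineContext(source_metadata={"splitter_options": {"max_new_tokens": 1024}})

triples = splitter.split("Acme Corp acquired Widget Ltd in 2023.", context)
for triple in triples:
    print(triple.subject_text, "|", triple.predicate_text, "|", triple.object_text)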
statement_extractor/plugins/taxonomy/__init__.py
@@ -0,0 +1,13 @@
+"""
+Taxonomy classifier plugins for Stage 6 (Taxonomy).
+
+Classifies statements against large taxonomies using MNLI or embeddings.
+"""
+
+from .mnli import MNLITaxonomyClassifier
+from .embedding import EmbeddingTaxonomyClassifier
+
+__all__ = [
+    "MNLITaxonomyClassifier",
+    "EmbeddingTaxonomyClassifier",
+]
statement_extractor/plugins/taxonomy/embedding.py
@@ -0,0 +1,337 @@
+"""
+EmbeddingTaxonomyClassifier - Classifies statements using embedding similarity.
+
+Uses sentence-transformers to embed text and compare to pre-computed label
+embeddings using cosine similarity with sigmoid calibration.
+
+Faster than MNLI but may be less accurate for nuanced classification.
+"""
+
+import json
+import logging
+import time
+from pathlib import Path
+from typing import Optional
+
+import numpy as np
+
+from ..base import BaseTaxonomyPlugin, TaxonomySchema, PluginCapability
+from ...pipeline.context import PipelineContext
+from ...pipeline.registry import PluginRegistry
+from ...models import (
+    PipelineStatement,
+    CanonicalEntity,
+    TaxonomyResult,
+)
+
+logger = logging.getLogger(__name__)
+
+# Default taxonomy file location
+DEFAULT_TAXONOMY_PATH = Path(__file__).parent.parent.parent / "data" / "statement_taxonomy.json"
+
+# Default categories
+DEFAULT_CATEGORIES = [
+    "environment",
+    "society",
+    "governance",
+    "animals",
+    "industry",
+    "human_harm",
+    "human_benefit",
+    "animal_harm",
+    "animal_benefit",
+    "environment_harm",
+    "environment_benefit",
+]
+
+
+class EmbeddingClassifier:
+    """
+    Embedding-based classifier using cosine similarity.
+
+    Pre-computes embeddings for all labels and uses dot product
+    (on normalized vectors) for fast classification.
+    """
+
+    SIMILARITY_THRESHOLD = 0.65
+    CALIBRATION_STEEPNESS = 25.0
+
+    def __init__(
+        self,
+        model_name: str = "google/embeddinggemma-300m",
+        device: Optional[str] = None,
+    ):
+        self._model_name = model_name
+        self._device = device
+        self._model = None
+        self._label_embeddings: dict[str, dict[str, np.ndarray]] = {}
+        self._text_embedding_cache: dict[str, np.ndarray] = {}  # Cache for input text embeddings
+
+    def _load_model(self):
+        if self._model is not None:
+            return
+
+        try:
+            from sentence_transformers import SentenceTransformer
+            import torch
+
+            device = self._device
+            if device is None:
+                if torch.cuda.is_available():
+                    device = "cuda"
+                elif torch.backends.mps.is_available():
+                    device = "mps"
+                else:
+                    device = "cpu"
+
+            logger.info(f"Loading embedding model '{self._model_name}' on {device}...")
+            self._model = SentenceTransformer(self._model_name, device=device)
+            logger.debug("Embedding model loaded")
+
+        except ImportError as e:
+            raise ImportError(
+                "sentence-transformers is required for embedding classification. "
+                "Install with: pip install sentence-transformers"
+            ) from e
+
+    def precompute_label_embeddings(
+        self,
+        taxonomy: dict[str, dict[str, int]],
+        categories: Optional[list[str]] = None,
+    ) -> None:
+        """Pre-compute embeddings for all label names."""
+        self._load_model()
+
+        start_time = time.perf_counter()
+        total_labels = 0
+
+        categories_to_process = categories or list(taxonomy.keys())
+
+        for category in categories_to_process:
+            if category not in taxonomy:
+                continue
+
+            labels = taxonomy[category]
+            label_names = list(labels.keys())
+
+            if not label_names:
+                continue
+
+            embeddings = self._model.encode(label_names, convert_to_numpy=True, show_progress_bar=False)
+
+            self._label_embeddings[category] = {}
+            for label_name, embedding in zip(label_names, embeddings):
+                norm = np.linalg.norm(embedding)
+                normalized = embedding / (norm + 1e-8)
+                self._label_embeddings[category][label_name] = normalized.astype(np.float32)
+                total_labels += 1
+
+        elapsed = time.perf_counter() - start_time
+        logger.info(
+            f"Pre-computed embeddings for {total_labels} labels "
+            f"across {len(self._label_embeddings)} categories in {elapsed:.2f}s"
+        )
+
+    def _calibrate_score(self, raw_similarity: float) -> float:
+        normalized = (raw_similarity + 1) / 2
+        exponent = -self.CALIBRATION_STEEPNESS * (normalized - self.SIMILARITY_THRESHOLD)
+        return 1.0 / (1.0 + np.exp(exponent))
+
+    def classify_hierarchical(
+        self,
+        text: str,
+        top_k_categories: int = 3,
+        min_score: float = 0.3,
+    ) -> list[tuple[str, str, float]]:
+        """Hierarchical classification: find categories, then all labels above threshold.
+
+        Returns all labels above the threshold, not just the best match.
+
+        Args:
+            text: Text to classify
+            top_k_categories: Number of top categories to consider
+            min_score: Minimum calibrated score to include in results
+
+        Returns:
+            List of (category, label, confidence) tuples above threshold
+        """
+        self._load_model()
+
+        if not self._label_embeddings:
+            raise RuntimeError("Label embeddings not pre-computed.")
+
+        # Check cache for input text embedding
+        if text in self._text_embedding_cache:
+            input_normalized = self._text_embedding_cache[text]
+        else:
+            input_embedding = self._model.encode(text, convert_to_numpy=True, show_progress_bar=False)
+            input_norm = np.linalg.norm(input_embedding)
+            input_normalized = (input_embedding / (input_norm + 1e-8)).astype(np.float32)
+            self._text_embedding_cache[text] = input_normalized
+            logger.debug(f"Cached embedding for text: '{text[:50]}...' (cache size: {len(self._text_embedding_cache)})")
+
+        # Compute average similarity to each category
+        category_scores: list[tuple[str, float]] = []
+        for category, labels in self._label_embeddings.items():
+            if not labels:
+                continue
+
+            sims = []
+            for label_embedding in labels.values():
+                sim = float(np.dot(input_normalized, label_embedding))
+                sims.append(sim)
+
+            avg_sim = np.mean(sims)
+            category_scores.append((category, avg_sim))
+
+        category_scores.sort(key=lambda x: x[1], reverse=True)
+
+        results: list[tuple[str, str, float]] = []
+
+        for category, _ in category_scores[:top_k_categories]:
+            for label, label_embedding in self._label_embeddings[category].items():
+                raw_sim = float(np.dot(input_normalized, label_embedding))
+                calibrated_score = self._calibrate_score(raw_sim)
+
+                if calibrated_score >= min_score:
+                    results.append((category, label, calibrated_score))
+
+        # Sort by confidence descending
+        results.sort(key=lambda x: x[2], reverse=True)
+        return results
+
+
+@PluginRegistry.taxonomy
+class EmbeddingTaxonomyClassifier(BaseTaxonomyPlugin):
+    """
+    Taxonomy classifier using embedding similarity.
+
+    Faster than MNLI, good for high-throughput scenarios.
+    """
+
+    def __init__(
+        self,
+        taxonomy_path: Optional[str | Path] = None,
+        categories: Optional[list[str]] = None,
+        model_name: str = "google/embeddinggemma-300m",
+        top_k_categories: int = 3,
+        min_confidence: float = 0.8,
+    ):
+        self._taxonomy_path = Path(taxonomy_path) if taxonomy_path else DEFAULT_TAXONOMY_PATH
+        self._categories = categories or DEFAULT_CATEGORIES
+        self._model_name = model_name
+        self._top_k_categories = top_k_categories
+        self._min_confidence = min_confidence
+
+        self._taxonomy: Optional[dict[str, dict[str, int]]] = None
+        self._classifier: Optional[EmbeddingClassifier] = None
+        self._embeddings_computed = False
+
+    @property
+    def name(self) -> str:
+        return "embedding_taxonomy_classifier"
+
+    @property
+    def priority(self) -> int:
+        return 10  # High priority - default taxonomy classifier (faster than MNLI)
+
+    @property
+    def capabilities(self) -> PluginCapability:
+        return PluginCapability.LLM_REQUIRED | PluginCapability.BATCH_PROCESSING
+
+    @property
+    def description(self) -> str:
+        return "Classifies statements using embedding similarity (faster than MNLI)"
+
+    @property
+    def taxonomy_name(self) -> str:
+        return "esg_topics_embedding"
+
+    @property
+    def taxonomy_schema(self) -> TaxonomySchema:
+        taxonomy = self._load_taxonomy()
+        filtered = {cat: list(labels.keys()) for cat, labels in taxonomy.items() if cat in self._categories}
+        return TaxonomySchema(
+            label_type="taxonomy",
+            values=filtered,
+            description="ESG topic classification using embeddings",
+            scope="statement",
+        )
+
+    @property
+    def supported_categories(self) -> list[str]:
+        return self._categories.copy()
+
+    def _load_taxonomy(self) -> dict[str, dict[str, int]]:
+        if self._taxonomy is not None:
+            return self._taxonomy
+
+        if not self._taxonomy_path.exists():
+            raise FileNotFoundError(f"Taxonomy file not found: {self._taxonomy_path}")
+
+        with open(self._taxonomy_path) as f:
+            self._taxonomy = json.load(f)
+
+        logger.debug(f"Loaded taxonomy with {len(self._taxonomy)} categories")
+        return self._taxonomy
+
+    def _get_classifier(self) -> EmbeddingClassifier:
+        if self._classifier is None:
+            self._classifier = EmbeddingClassifier(model_name=self._model_name)
+
+        if not self._embeddings_computed:
+            taxonomy = self._load_taxonomy()
+            self._classifier.precompute_label_embeddings(taxonomy, self._categories)
+            self._embeddings_computed = True
+
+        return self._classifier
+
+    def classify(
+        self,
+        statement: PipelineStatement,
+        subject_canonical: CanonicalEntity,
+        object_canonical: CanonicalEntity,
+        context: PipelineContext,
+    ) -> list[TaxonomyResult]:
+        """Classify statement using embedding similarity.
+
+        Returns all labels above the confidence threshold.
+        """
+        results: list[TaxonomyResult] = []
+
+        try:
+            classifier = self._get_classifier()
+            text = statement.source_text
+
+            classifications = classifier.classify_hierarchical(
+                text,
+                top_k_categories=self._top_k_categories,
+                min_score=self._min_confidence,
+            )
+
+            for category, label, confidence in classifications:
+                label_id = self._get_label_id(category, label)
+
+                results.append(TaxonomyResult(
+                    taxonomy_name=self.taxonomy_name,
+                    category=category,
+                    label=label,
+                    label_id=label_id,
+                    confidence=round(confidence, 4),
+                    classifier=self.name,
+                ))
+
+        except Exception as e:
+            logger.warning(f"Embedding taxonomy classification failed: {e}")
+
+        return results
+
+    def _get_label_id(self, category: str, label: str) -> Optional[int]:
+        taxonomy = self._load_taxonomy()
+        if category in taxonomy:
+            return taxonomy[category].get(label)
+        return None
+
+
+# For testing without decorator
+EmbeddingTaxonomyClassifierClass = EmbeddingTaxonomyClassifier
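As a reading aid, a standalone sketch of the score calibration implemented in EmbeddingClassifier._calibrate_score above: the raw cosine similarity in [-1, 1] is rescaled to [0, 1] and passed through a sigmoid centred on SIMILARITY_THRESHOLD (0.65) with steepness 25. The commented classifier call at the end is a hypothetical illustration of the {category: {label_name: label_id}} taxonomy shape implied by the type hints; it is not taken from the package.

# Standalone sketch of the calibration used by EmbeddingClassifier above.
import numpy as np

SIMILARITY_THRESHOLD = 0.65
CALIBRATION_STEEPNESS = 25.0

def calibrate(raw_similarity: float) -> float:
    # Map cosine similarity from [-1, 1] to [0, 1], then apply a sigmoid
    # centred at the threshold so a normalised value of 0.65 scores 0.5.
    normalized = (raw_similarity + 1) / 2
    return float(1.0 / (1.0 + np.exp(-CALIBRATION_STEEPNESS * (normalized - SIMILARITY_THRESHOLD))))

print(calibrate(0.30))  # 0.5 exactly: normalised value sits on the threshold
print(calibrate(0.50))  # about 0.92: normalised 0.75, well above the threshold

# Hypothetical direct use of the classifier with a toy taxonomy:
# clf = EmbeddingClassifier()
# clf.precompute_label_embeddings({"environment": {"deforestation": 101, "water pollution": 102}})
# print(clf.classify_hierarchical("The company cleared 500 hectares of rainforest.", min_score=0.3))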