corp-extractor 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. {corp_extractor-0.3.0.dist-info → corp_extractor-0.5.0.dist-info}/METADATA +235 -96
  2. corp_extractor-0.5.0.dist-info/RECORD +55 -0
  3. statement_extractor/__init__.py +9 -0
  4. statement_extractor/cli.py +460 -21
  5. statement_extractor/data/default_predicates.json +368 -0
  6. statement_extractor/data/statement_taxonomy.json +1182 -0
  7. statement_extractor/extractor.py +32 -47
  8. statement_extractor/gliner_extraction.py +218 -0
  9. statement_extractor/llm.py +255 -0
  10. statement_extractor/models/__init__.py +74 -0
  11. statement_extractor/models/canonical.py +139 -0
  12. statement_extractor/models/entity.py +102 -0
  13. statement_extractor/models/labels.py +191 -0
  14. statement_extractor/models/qualifiers.py +91 -0
  15. statement_extractor/models/statement.py +75 -0
  16. statement_extractor/models.py +15 -6
  17. statement_extractor/pipeline/__init__.py +39 -0
  18. statement_extractor/pipeline/config.py +134 -0
  19. statement_extractor/pipeline/context.py +177 -0
  20. statement_extractor/pipeline/orchestrator.py +447 -0
  21. statement_extractor/pipeline/registry.py +297 -0
  22. statement_extractor/plugins/__init__.py +43 -0
  23. statement_extractor/plugins/base.py +446 -0
  24. statement_extractor/plugins/canonicalizers/__init__.py +17 -0
  25. statement_extractor/plugins/canonicalizers/base.py +9 -0
  26. statement_extractor/plugins/canonicalizers/location.py +219 -0
  27. statement_extractor/plugins/canonicalizers/organization.py +230 -0
  28. statement_extractor/plugins/canonicalizers/person.py +242 -0
  29. statement_extractor/plugins/extractors/__init__.py +13 -0
  30. statement_extractor/plugins/extractors/base.py +9 -0
  31. statement_extractor/plugins/extractors/gliner2.py +536 -0
  32. statement_extractor/plugins/labelers/__init__.py +29 -0
  33. statement_extractor/plugins/labelers/base.py +9 -0
  34. statement_extractor/plugins/labelers/confidence.py +138 -0
  35. statement_extractor/plugins/labelers/relation_type.py +87 -0
  36. statement_extractor/plugins/labelers/sentiment.py +159 -0
  37. statement_extractor/plugins/labelers/taxonomy.py +373 -0
  38. statement_extractor/plugins/labelers/taxonomy_embedding.py +466 -0
  39. statement_extractor/plugins/qualifiers/__init__.py +19 -0
  40. statement_extractor/plugins/qualifiers/base.py +9 -0
  41. statement_extractor/plugins/qualifiers/companies_house.py +174 -0
  42. statement_extractor/plugins/qualifiers/gleif.py +186 -0
  43. statement_extractor/plugins/qualifiers/person.py +221 -0
  44. statement_extractor/plugins/qualifiers/sec_edgar.py +198 -0
  45. statement_extractor/plugins/splitters/__init__.py +13 -0
  46. statement_extractor/plugins/splitters/base.py +9 -0
  47. statement_extractor/plugins/splitters/t5_gemma.py +188 -0
  48. statement_extractor/plugins/taxonomy/__init__.py +13 -0
  49. statement_extractor/plugins/taxonomy/embedding.py +337 -0
  50. statement_extractor/plugins/taxonomy/mnli.py +279 -0
  51. statement_extractor/scoring.py +17 -69
  52. corp_extractor-0.3.0.dist-info/RECORD +0 -12
  53. statement_extractor/spacy_extraction.py +0 -386
  54. {corp_extractor-0.3.0.dist-info → corp_extractor-0.5.0.dist-info}/WHEEL +0 -0
  55. {corp_extractor-0.3.0.dist-info → corp_extractor-0.5.0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,279 @@
1
+ """
2
+ MNLITaxonomyClassifier - Classifies statements using MNLI zero-shot classification.
3
+
4
+ Uses HuggingFace transformers zero-shot-classification pipeline for taxonomy labeling
5
+ where there are too many possible values for simple multi-choice classification.
6
+ """
7
+
8
+ import json
9
+ import logging
10
+ from pathlib import Path
11
+ from typing import Optional
12
+
13
+ from ..base import BaseTaxonomyPlugin, TaxonomySchema, PluginCapability
14
+ from ...pipeline.context import PipelineContext
15
+ from ...pipeline.registry import PluginRegistry
16
+ from ...models import (
17
+ PipelineStatement,
18
+ CanonicalEntity,
19
+ TaxonomyResult,
20
+ )
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+ # Default taxonomy file location (relative to this module)
25
+ DEFAULT_TAXONOMY_PATH = Path(__file__).parent.parent.parent / "data" / "statement_taxonomy.json"
26
+
27
+ # Default categories to use (all of them)
28
+ DEFAULT_CATEGORIES = [
29
+ "environment",
30
+ "society",
31
+ "governance",
32
+ "animals",
33
+ "industry",
34
+ "human_harm",
35
+ "human_benefit",
36
+ "animal_harm",
37
+ "animal_benefit",
38
+ "environment_harm",
39
+ "environment_benefit",
40
+ ]
41
+
42
+
43
class MNLIClassifier:
    """
    MNLI-based zero-shot classifier for taxonomy labeling.

    Wraps the HuggingFace ``zero-shot-classification`` pipeline and adds a
    two-stage (category, then label) classification scheme on top of it.
    """

    def __init__(
        self,
        model_id: str = "facebook/bart-large-mnli",
        device: Optional[str] = None,
    ):
        # Pipeline construction is deferred until first classification call.
        self._model_id = model_id
        self._device = device
        self._classifier = None

    def _load_classifier(self):
        """Instantiate the zero-shot pipeline on first use (no-op afterwards)."""
        if self._classifier is not None:
            return

        try:
            from transformers import pipeline
            import torch

            target = self._device
            if target is None:
                # Auto-select: prefer CUDA, then Apple MPS, otherwise CPU.
                if torch.cuda.is_available():
                    target = "cuda"
                elif torch.backends.mps.is_available():
                    target = "mps"
                else:
                    target = "cpu"

            logger.info(f"Loading MNLI classifier '{self._model_id}' on {target}...")
            # transformers accepts -1 as the legacy "CPU" device index.
            self._classifier = pipeline(
                "zero-shot-classification",
                model=self._model_id,
                device=-1 if target == "cpu" else target,
            )
            logger.debug("MNLI classifier loaded")

        except ImportError as e:
            raise ImportError(
                "transformers is required for MNLI classification. "
                "Install with: pip install transformers"
            ) from e

    def classify_hierarchical(
        self,
        text: str,
        taxonomy: dict[str, list[str]],
        top_k_categories: int = 3,
        min_score: float = 0.3,
    ) -> list[tuple[str, str, float]]:
        """
        Hierarchical classification: first category, then labels within category.

        Returns all labels above the threshold, not just the best match.

        Args:
            text: Text to classify
            taxonomy: Dict mapping category -> list of labels
            top_k_categories: Number of top categories to consider
            min_score: Minimum combined score to include in results

        Returns:
            List of (category, label, confidence) tuples above threshold,
            sorted by confidence, descending.
        """
        self._load_classifier()

        # Stage 1: rank the categories themselves.
        cat_result = self._classifier(text, candidate_labels=list(taxonomy.keys()))
        ranked_categories = list(zip(cat_result["labels"], cat_result["scores"]))

        matches: list[tuple[str, str, float]] = []

        # Stage 2: within each top category, rank its labels.
        for category, category_score in ranked_categories[:top_k_categories]:
            candidates = taxonomy[category]
            if not candidates:
                continue

            label_result = self._classifier(text, candidate_labels=candidates)

            # Combined confidence = category score x label score; keep every
            # label clearing the threshold for this category.
            for label, label_score in zip(label_result["labels"], label_result["scores"]):
                combined = category_score * label_score
                if combined >= min_score:
                    matches.append((category, label, combined))

        return sorted(matches, key=lambda item: item[2], reverse=True)
139
+
140
+
141
@PluginRegistry.taxonomy
class MNLITaxonomyClassifier(BaseTaxonomyPlugin):
    """
    Taxonomy classifier using MNLI zero-shot classification.

    Supports hierarchical classification for efficiency with large taxonomies.
    """

    def __init__(
        self,
        taxonomy_path: Optional[str | Path] = None,
        categories: Optional[list[str]] = None,
        model_id: str = "facebook/bart-large-mnli",
        top_k_categories: int = 3,
        min_confidence: float = 0.3,
    ):
        """
        Args:
            taxonomy_path: JSON file mapping category -> {label: id}; defaults
                to the bundled statement taxonomy.
            categories: Category names to classify against (default: all).
            model_id: HuggingFace model id for the zero-shot pipeline.
            top_k_categories: Number of top-ranked categories to expand.
            min_confidence: Minimum combined score for a returned label.
        """
        self._taxonomy_path = Path(taxonomy_path) if taxonomy_path else DEFAULT_TAXONOMY_PATH
        self._categories = categories or DEFAULT_CATEGORIES
        self._model_id = model_id
        self._top_k_categories = top_k_categories
        self._min_confidence = min_confidence

        # Lazy-loaded state: taxonomy file contents and the MNLI pipeline.
        self._taxonomy: Optional[dict[str, dict[str, int]]] = None
        self._classifier: Optional[MNLIClassifier] = None

    @property
    def name(self) -> str:
        return "mnli_taxonomy_classifier"

    @property
    def priority(self) -> int:
        return 50  # Lower priority than embedding (use --plugins mnli_taxonomy_classifier to enable)

    @property
    def capabilities(self) -> PluginCapability:
        return PluginCapability.LLM_REQUIRED

    @property
    def description(self) -> str:
        return "Classifies statements against a taxonomy using MNLI zero-shot classification"

    @property
    def taxonomy_name(self) -> str:
        return "esg_topics"

    @property
    def taxonomy_schema(self) -> TaxonomySchema:
        # Reuse the exact filtering used by classify() so the advertised
        # schema always matches what is actually scored against.
        return TaxonomySchema(
            label_type="taxonomy",
            values=self._get_filtered_taxonomy(),
            description="ESG topic classification taxonomy",
            scope="statement",
        )

    @property
    def supported_categories(self) -> list[str]:
        # Copy so callers cannot mutate our configuration.
        return self._categories.copy()

    def _load_taxonomy(self) -> dict[str, dict[str, int]]:
        """Load (and cache) the taxonomy from its JSON file.

        Raises:
            FileNotFoundError: If the configured taxonomy file does not exist.
        """
        if self._taxonomy is not None:
            return self._taxonomy

        if not self._taxonomy_path.exists():
            raise FileNotFoundError(f"Taxonomy file not found: {self._taxonomy_path}")

        # JSON is UTF-8 by spec; be explicit rather than trusting the
        # platform default encoding.
        with open(self._taxonomy_path, encoding="utf-8") as f:
            self._taxonomy = json.load(f)

        logger.debug(f"Loaded taxonomy with {len(self._taxonomy)} categories")
        return self._taxonomy

    def _get_classifier(self) -> MNLIClassifier:
        """Return the shared MNLI classifier, creating it on first use."""
        if self._classifier is None:
            self._classifier = MNLIClassifier(model_id=self._model_id)
        return self._classifier

    def _get_filtered_taxonomy(self) -> dict[str, list[str]]:
        """Return category -> label-name lists, restricted to the configured categories."""
        taxonomy = self._load_taxonomy()
        return {
            cat: list(labels.keys())
            for cat, labels in taxonomy.items()
            if cat in self._categories
        }

    def classify(
        self,
        statement: PipelineStatement,
        subject_canonical: CanonicalEntity,
        object_canonical: CanonicalEntity,
        context: PipelineContext,
    ) -> list[TaxonomyResult]:
        """Classify statement against the taxonomy using MNLI.

        Returns all labels above the confidence threshold. Best-effort: any
        failure is logged and an empty result list is returned.
        """
        results: list[TaxonomyResult] = []

        try:
            classifier = self._get_classifier()
            taxonomy = self._get_filtered_taxonomy()

            text = statement.source_text

            classifications = classifier.classify_hierarchical(
                text,
                taxonomy,
                top_k_categories=self._top_k_categories,
                min_score=self._min_confidence,
            )

            for category, label, confidence in classifications:
                label_id = self._get_label_id(category, label)

                results.append(TaxonomyResult(
                    taxonomy_name=self.taxonomy_name,
                    category=category,
                    label=label,
                    label_id=label_id,
                    confidence=round(confidence, 4),
                    classifier=self.name,
                ))

        except Exception as e:
            # Deliberate best-effort: classification failure must not abort the pipeline.
            logger.warning(f"MNLI taxonomy classification failed: {e}")

        return results

    def _get_label_id(self, category: str, label: str) -> Optional[int]:
        """Look up the numeric id for a (category, label) pair, or None if unknown."""
        taxonomy = self._load_taxonomy()
        if category in taxonomy:
            return taxonomy[category].get(label)
        return None


# For testing without decorator
MNLITaxonomyClassifierClass = MNLITaxonomyClassifier
@@ -15,41 +15,21 @@ from .models import ScoringConfig, Statement
15
15
 
16
16
  logger = logging.getLogger(__name__)
17
17
 
18
- # Lazy-loaded spaCy model for grammatical analysis
19
- _nlp = None
20
-
21
-
22
- def _get_nlp():
23
- """Lazy-load spaCy model for POS tagging."""
24
- global _nlp
25
- if _nlp is None:
26
- import spacy
27
- try:
28
- _nlp = spacy.load("en_core_web_sm", disable=["ner", "lemmatizer"])
29
- except OSError:
30
- # Model not found, try to download
31
- from .spacy_extraction import _download_model
32
- if _download_model():
33
- _nlp = spacy.load("en_core_web_sm", disable=["ner", "lemmatizer"])
34
- else:
35
- raise
36
- return _nlp
37
-
38
18
 
39
19
  class TripleScorer:
40
20
  """
41
- Score individual triples combining semantic similarity and grammatical accuracy.
21
+ Score individual triples combining semantic similarity and entity recognition.
42
22
 
43
23
  The score is a weighted combination of:
44
24
  - Semantic similarity (50%): Cosine similarity between source text and reassembled triple
45
- - Subject noun score (25%): How noun-like the subject is
46
- - Object noun score (25%): How noun-like the object is
47
-
48
- Noun scoring:
49
- - Proper noun only (PROPN): 1.0
50
- - Common noun only (NOUN): 0.8
51
- - Contains noun + other words: 0.6
52
- - No noun: 0.2
25
+ - Subject entity score (25%): How entity-like the subject is (via GLiNER2)
26
+ - Object entity score (25%): How entity-like the object is (via GLiNER2)
27
+
28
+ Entity scoring (via GLiNER2):
29
+ - Recognized entity with high confidence: 1.0
30
+ - Recognized entity with moderate confidence: 0.8
31
+ - Partially recognized: 0.6
32
+ - Not recognized: 0.2
53
33
  """
54
34
 
55
35
  def __init__(
@@ -102,54 +82,22 @@ class TripleScorer:
102
82
 
103
83
  def _score_noun_content(self, text: str) -> float:
104
84
  """
105
- Score how noun-like a text is.
85
+ Score how entity-like a text is using GLiNER2 entity recognition.
106
86
 
107
87
  Returns:
108
- 1.0 - Entirely proper noun(s)
109
- 0.8 - Entirely common noun(s)
110
- 0.6 - Contains noun(s) but also other words
111
- 0.2 - No nouns found
88
+ 1.0 - Recognized as a named entity with high confidence
89
+ 0.8 - Recognized as an entity with moderate confidence
90
+ 0.6 - Partially recognized or contains entity-like content
91
+ 0.2 - Not recognized as any entity type
112
92
  """
113
93
  if not text or not text.strip():
114
94
  return 0.2
115
95
 
116
96
  try:
117
- nlp = _get_nlp()
118
- doc = nlp(text)
119
-
120
- # Count token types (excluding punctuation and spaces)
121
- tokens = [t for t in doc if not t.is_punct and not t.is_space]
122
- if not tokens:
123
- return 0.2
124
-
125
- proper_nouns = sum(1 for t in tokens if t.pos_ == "PROPN")
126
- common_nouns = sum(1 for t in tokens if t.pos_ == "NOUN")
127
- total_nouns = proper_nouns + common_nouns
128
- total_tokens = len(tokens)
129
-
130
- if total_nouns == 0:
131
- # No nouns at all
132
- return 0.2
133
-
134
- if total_nouns == total_tokens:
135
- # Entirely nouns
136
- if proper_nouns == total_tokens:
137
- # All proper nouns
138
- return 1.0
139
- elif common_nouns == total_tokens:
140
- # All common nouns
141
- return 0.8
142
- else:
143
- # Mix of proper and common nouns
144
- return 0.9
145
-
146
- # Contains nouns but also other words
147
- # Score based on noun ratio
148
- noun_ratio = total_nouns / total_tokens
149
- return 0.4 + (noun_ratio * 0.4) # Range: 0.4 to 0.8
150
-
97
+ from .gliner_extraction import score_entity_content
98
+ return score_entity_content(text)
151
99
  except Exception as e:
152
- logger.debug(f"Noun scoring failed for '{text}': {e}")
100
+ logger.debug(f"Entity scoring failed for '{text}': {e}")
153
101
  return 0.5 # Neutral score on error
154
102
 
155
103
  def score_triple(self, statement: Statement, source_text: str) -> float:
@@ -1,12 +0,0 @@
1
- statement_extractor/__init__.py,sha256=KwZfWnTB9oevTLw0TrNlYFu67qIYO-34JqDtcpjOhZI,3013
2
- statement_extractor/canonicalization.py,sha256=ZMLs6RLWJa_rOJ8XZ7PoHFU13-zeJkOMDnvK-ZaFa5s,5991
3
- statement_extractor/cli.py,sha256=JMEXiT2xwmW1J8JmJliQh32AT-7bTAtAscPx1AGRfPg,9054
4
- statement_extractor/extractor.py,sha256=vS8UCgE8uITt_28PwCh4WCqOjWLpfrJcN3fh1YPBcjA,39657
5
- statement_extractor/models.py,sha256=FxLj2fIodX317XVIJLZ0GFNahm_VV07KzdoLSSjoVD4,11952
6
- statement_extractor/predicate_comparer.py,sha256=jcuaBi5BYqD3TKoyj3pR9dxtX5ihfDJvjdhEd2LHCwc,26184
7
- statement_extractor/scoring.py,sha256=pdNgyLHmlk-npISzm4nycK9G4wM2nztg5KTG7piFACI,18135
8
- statement_extractor/spacy_extraction.py,sha256=ACvIB-Ag7H7h_Gb0cdypIr8fnf3A-UjyJnqqjWD5Ccs,12320
9
- corp_extractor-0.3.0.dist-info/METADATA,sha256=eu8b7R_FQxFyc_9FSocy078TTyB7BwvGX-YAS79hKgg,17042
10
- corp_extractor-0.3.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
11
- corp_extractor-0.3.0.dist-info/entry_points.txt,sha256=i0iKFqPIusvb-QTQ1zNnFgAqatgVah-jIhahbs5TToQ,115
12
- corp_extractor-0.3.0.dist-info/RECORD,,