corp-extractor 0.4.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +348 -64
  2. corp_extractor-0.9.0.dist-info/RECORD +76 -0
  3. statement_extractor/__init__.py +10 -1
  4. statement_extractor/cli.py +1663 -17
  5. statement_extractor/data/default_predicates.json +368 -0
  6. statement_extractor/data/statement_taxonomy.json +6972 -0
  7. statement_extractor/database/__init__.py +52 -0
  8. statement_extractor/database/embeddings.py +186 -0
  9. statement_extractor/database/hub.py +520 -0
  10. statement_extractor/database/importers/__init__.py +24 -0
  11. statement_extractor/database/importers/companies_house.py +545 -0
  12. statement_extractor/database/importers/gleif.py +538 -0
  13. statement_extractor/database/importers/sec_edgar.py +375 -0
  14. statement_extractor/database/importers/wikidata.py +1012 -0
  15. statement_extractor/database/importers/wikidata_people.py +632 -0
  16. statement_extractor/database/models.py +230 -0
  17. statement_extractor/database/resolver.py +245 -0
  18. statement_extractor/database/store.py +1609 -0
  19. statement_extractor/document/__init__.py +62 -0
  20. statement_extractor/document/chunker.py +410 -0
  21. statement_extractor/document/context.py +171 -0
  22. statement_extractor/document/deduplicator.py +173 -0
  23. statement_extractor/document/html_extractor.py +246 -0
  24. statement_extractor/document/loader.py +303 -0
  25. statement_extractor/document/pipeline.py +388 -0
  26. statement_extractor/document/summarizer.py +195 -0
  27. statement_extractor/extractor.py +1 -23
  28. statement_extractor/gliner_extraction.py +4 -74
  29. statement_extractor/llm.py +255 -0
  30. statement_extractor/models/__init__.py +89 -0
  31. statement_extractor/models/canonical.py +182 -0
  32. statement_extractor/models/document.py +308 -0
  33. statement_extractor/models/entity.py +102 -0
  34. statement_extractor/models/labels.py +220 -0
  35. statement_extractor/models/qualifiers.py +139 -0
  36. statement_extractor/models/statement.py +101 -0
  37. statement_extractor/models.py +4 -1
  38. statement_extractor/pipeline/__init__.py +39 -0
  39. statement_extractor/pipeline/config.py +129 -0
  40. statement_extractor/pipeline/context.py +177 -0
  41. statement_extractor/pipeline/orchestrator.py +416 -0
  42. statement_extractor/pipeline/registry.py +303 -0
  43. statement_extractor/plugins/__init__.py +55 -0
  44. statement_extractor/plugins/base.py +716 -0
  45. statement_extractor/plugins/extractors/__init__.py +13 -0
  46. statement_extractor/plugins/extractors/base.py +9 -0
  47. statement_extractor/plugins/extractors/gliner2.py +546 -0
  48. statement_extractor/plugins/labelers/__init__.py +29 -0
  49. statement_extractor/plugins/labelers/base.py +9 -0
  50. statement_extractor/plugins/labelers/confidence.py +138 -0
  51. statement_extractor/plugins/labelers/relation_type.py +87 -0
  52. statement_extractor/plugins/labelers/sentiment.py +159 -0
  53. statement_extractor/plugins/labelers/taxonomy.py +386 -0
  54. statement_extractor/plugins/labelers/taxonomy_embedding.py +477 -0
  55. statement_extractor/plugins/pdf/__init__.py +10 -0
  56. statement_extractor/plugins/pdf/pypdf.py +291 -0
  57. statement_extractor/plugins/qualifiers/__init__.py +30 -0
  58. statement_extractor/plugins/qualifiers/base.py +9 -0
  59. statement_extractor/plugins/qualifiers/companies_house.py +185 -0
  60. statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
  61. statement_extractor/plugins/qualifiers/gleif.py +197 -0
  62. statement_extractor/plugins/qualifiers/person.py +785 -0
  63. statement_extractor/plugins/qualifiers/sec_edgar.py +209 -0
  64. statement_extractor/plugins/scrapers/__init__.py +10 -0
  65. statement_extractor/plugins/scrapers/http.py +236 -0
  66. statement_extractor/plugins/splitters/__init__.py +13 -0
  67. statement_extractor/plugins/splitters/base.py +9 -0
  68. statement_extractor/plugins/splitters/t5_gemma.py +293 -0
  69. statement_extractor/plugins/taxonomy/__init__.py +13 -0
  70. statement_extractor/plugins/taxonomy/embedding.py +484 -0
  71. statement_extractor/plugins/taxonomy/mnli.py +291 -0
  72. statement_extractor/scoring.py +8 -8
  73. corp_extractor-0.4.0.dist-info/RECORD +0 -12
  74. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
  75. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,291 @@
1
+ """
2
+ MNLITaxonomyClassifier - Classifies statements using MNLI zero-shot classification.
3
+
4
+ Uses HuggingFace transformers zero-shot-classification pipeline for taxonomy labeling
5
+ where there are too many possible values for simple multi-choice classification.
6
+ """
7
+
8
+ import json
9
+ import logging
10
+ from pathlib import Path
11
+ from typing import Optional, TypedDict
12
+
13
+ from ..base import BaseTaxonomyPlugin, TaxonomySchema, PluginCapability
14
+
15
+
16
class TaxonomyEntry(TypedDict):
    """Shape of one label entry inside the taxonomy JSON file."""

    # Human-readable explanation of what the label means.
    description: str
    # Stable numeric identifier for the label (what _get_label_id returns).
    id: int
    # Label phrasing variant for the MNLI classifier — presumably the
    # hypothesis wording; confirm against the taxonomy JSON.
    mnli_label: str
    # Label phrasing variant for the embedding-based classifier.
    embedding_label: str
23
+
24
+ from ...pipeline.context import PipelineContext
25
+ from ...pipeline.registry import PluginRegistry
26
+ from ...models import (
27
+ PipelineStatement,
28
+ CanonicalEntity,
29
+ TaxonomyResult,
30
+ )
31
+
32
# Module-level logger, named after this module per logging convention.
logger = logging.getLogger(__name__)

# Default taxonomy file location (relative to this module)
DEFAULT_TAXONOMY_PATH = Path(__file__).parent.parent.parent / "data" / "statement_taxonomy.json"

# Default categories to use (all of them)
DEFAULT_CATEGORIES = [
    "environment",
    "society",
    "governance",
    "animals",
    "industry",
    "human_harm",
    "human_benefit",
    "animal_harm",
    "animal_benefit",
    "environment_harm",
    "environment_benefit",
]
51
+
52
+
53
class MNLIClassifier:
    """Lazily-loaded zero-shot classifier built on an MNLI model.

    A thin wrapper around the HuggingFace transformers
    zero-shot-classification pipeline; the heavyweight model is only
    instantiated on first use.
    """

    def __init__(
        self,
        model_id: str = "facebook/bart-large-mnli",
        device: Optional[str] = None,
    ):
        # device=None means "auto-detect" at load time (cuda > mps > cpu).
        self._model_id = model_id
        self._device = device
        self._classifier = None

    def _load_classifier(self):
        """Instantiate the pipeline once; subsequent calls are no-ops.

        Raises:
            ImportError: If the transformers package is not installed.
        """
        if self._classifier is not None:
            return

        try:
            from transformers import pipeline
            import torch

            chosen = self._device
            if chosen is None:
                # Auto-detect: prefer CUDA, then Apple-silicon MPS, else CPU.
                if torch.cuda.is_available():
                    chosen = "cuda"
                elif torch.backends.mps.is_available():
                    chosen = "mps"
                else:
                    chosen = "cpu"

            logger.info(f"Loading MNLI classifier '{self._model_id}' on {chosen}...")
            # transformers uses -1 to mean CPU for the device argument.
            self._classifier = pipeline(
                "zero-shot-classification",
                model=self._model_id,
                device=chosen if chosen != "cpu" else -1,
            )
            logger.debug("MNLI classifier loaded")

        except ImportError as e:
            raise ImportError(
                "transformers is required for MNLI classification. "
                "Install with: pip install transformers"
            ) from e

    def classify_hierarchical(
        self,
        text: str,
        taxonomy: dict[str, list[str]],
        top_k_categories: int = 3,
        min_score: float = 0.3,
    ) -> list[tuple[str, str, float]]:
        """Two-stage classification: pick top categories, then labels within them.

        Every (category, label) pair whose combined score clears the
        threshold is returned, not just the single best match.

        Args:
            text: Text to classify.
            taxonomy: Mapping of category name -> candidate label list.
            top_k_categories: How many best-scoring categories to expand.
            min_score: Minimum combined (category * label) score to keep.

        Returns:
            (category, label, confidence) tuples, highest confidence first.
        """
        self._load_classifier()

        category_out = self._classifier(text, candidate_labels=list(taxonomy.keys()))

        hits: list[tuple[str, str, float]] = []
        ranked = zip(
            category_out["labels"][:top_k_categories],
            category_out["scores"][:top_k_categories],
        )
        for category, category_score in ranked:
            candidates = taxonomy[category]
            if not candidates:
                continue

            label_out = self._classifier(text, candidate_labels=candidates)

            # Keep every label in this category whose combined score clears
            # the threshold.
            hits.extend(
                (category, candidate, category_score * label_score)
                for candidate, label_score in zip(label_out["labels"], label_out["scores"])
                if category_score * label_score >= min_score
            )

        # Highest combined confidence first (sorted() is stable, like .sort()).
        return sorted(hits, key=lambda hit: hit[2], reverse=True)
149
+
150
+
151
@PluginRegistry.taxonomy
class MNLITaxonomyClassifier(BaseTaxonomyPlugin):
    """
    Taxonomy classifier using MNLI zero-shot classification.

    Supports hierarchical classification for efficiency with large taxonomies.
    """

    def __init__(
        self,
        taxonomy_path: Optional[str | Path] = None,
        categories: Optional[list[str]] = None,
        model_id: str = "facebook/bart-large-mnli",
        top_k_categories: int = 3,
        min_confidence: float = 0.3,
    ):
        """
        Args:
            taxonomy_path: Path to the taxonomy JSON file; defaults to the
                bundled ``statement_taxonomy.json``.
            categories: Taxonomy categories to classify against; defaults to
                ``DEFAULT_CATEGORIES``.
            model_id: HuggingFace model id of the MNLI model to use.
            top_k_categories: Number of top-scoring categories to expand
                during hierarchical classification.
            min_confidence: Minimum combined (category * label) score for a
                classification to be returned.
        """
        self._taxonomy_path = Path(taxonomy_path) if taxonomy_path else DEFAULT_TAXONOMY_PATH
        self._categories = categories or DEFAULT_CATEGORIES
        self._model_id = model_id
        self._top_k_categories = top_k_categories
        self._min_confidence = min_confidence

        # Both are created lazily on first use.
        self._taxonomy: Optional[dict[str, dict[str, TaxonomyEntry]]] = None
        self._classifier: Optional[MNLIClassifier] = None

    @property
    def name(self) -> str:
        return "mnli_taxonomy_classifier"

    @property
    def priority(self) -> int:
        return 50  # Lower priority than embedding (use --plugins mnli_taxonomy_classifier to enable)

    @property
    def capabilities(self) -> PluginCapability:
        return PluginCapability.LLM_REQUIRED

    @property
    def description(self) -> str:
        return "Classifies statements against a taxonomy using MNLI zero-shot classification"

    @property
    def taxonomy_name(self) -> str:
        return "esg_topics"

    @property
    def taxonomy_schema(self) -> TaxonomySchema:
        """Schema describing the label values this plugin can emit."""
        # Reuse the same filtering as classification so the advertised
        # schema always matches what classify() can actually produce.
        return TaxonomySchema(
            label_type="taxonomy",
            values=self._get_filtered_taxonomy(),
            description="ESG topic classification taxonomy",
            scope="statement",
        )

    @property
    def supported_categories(self) -> list[str]:
        # Return a copy so callers cannot mutate our configuration.
        return self._categories.copy()

    def _load_taxonomy(self) -> dict[str, dict[str, TaxonomyEntry]]:
        """Load and cache the taxonomy from the JSON file.

        Raises:
            FileNotFoundError: If the taxonomy file does not exist.
        """
        if self._taxonomy is not None:
            return self._taxonomy

        if not self._taxonomy_path.exists():
            raise FileNotFoundError(f"Taxonomy file not found: {self._taxonomy_path}")

        # JSON is UTF-8 by specification; don't rely on the platform default.
        with open(self._taxonomy_path, encoding="utf-8") as f:
            self._taxonomy = json.load(f)

        logger.debug(f"Loaded taxonomy with {len(self._taxonomy)} categories")
        return self._taxonomy

    def _get_classifier(self) -> MNLIClassifier:
        """Lazily create the underlying MNLI classifier."""
        if self._classifier is None:
            self._classifier = MNLIClassifier(model_id=self._model_id)
        return self._classifier

    def _get_filtered_taxonomy(self) -> dict[str, list[str]]:
        """Return category -> label-name lists, restricted to configured categories."""
        taxonomy = self._load_taxonomy()
        return {
            cat: list(labels.keys())
            for cat, labels in taxonomy.items()
            if cat in self._categories
        }

    def classify(
        self,
        statement: PipelineStatement,
        subject_canonical: CanonicalEntity,
        object_canonical: CanonicalEntity,
        context: PipelineContext,
    ) -> list[TaxonomyResult]:
        """Classify statement against the taxonomy using MNLI.

        Returns all labels above the confidence threshold. Failures are
        logged and yield an empty result list rather than raising.
        """
        results: list[TaxonomyResult] = []

        try:
            classifier = self._get_classifier()
            taxonomy = self._get_filtered_taxonomy()

            classifications = classifier.classify_hierarchical(
                statement.source_text,
                taxonomy,
                top_k_categories=self._top_k_categories,
                min_score=self._min_confidence,
            )

            for category, label, confidence in classifications:
                results.append(TaxonomyResult(
                    taxonomy_name=self.taxonomy_name,
                    category=category,
                    label=label,
                    label_id=self._get_label_id(category, label),
                    confidence=round(confidence, 4),
                    classifier=self.name,
                ))

        except Exception as e:
            # Deliberately best-effort: one failing classifier should not
            # abort the whole pipeline run.
            logger.warning(f"MNLI taxonomy classification failed: {e}")

        return results

    def _get_label_id(self, category: str, label: str) -> Optional[int]:
        """Look up the numeric id for a (category, label) pair, if present."""
        entry = self._load_taxonomy().get(category, {}).get(label)
        return entry.get("id") if entry else None


# For testing without decorator
MNLITaxonomyClassifierClass = MNLITaxonomyClassifier
@@ -409,18 +409,18 @@ class BeamScorer:
409
409
  filtered = [s for s in all_statements if (s.confidence_score or 0) >= min_conf]
410
410
  logger.debug(f" After confidence filter (>={min_conf}): {len(filtered)} statements")
411
411
 
412
- # # Filter out statements where source_text doesn't support the predicate
413
- # # This catches model hallucinations where predicate doesn't match the evidence
414
- # consistent = [
415
- # s for s in filtered
416
- # if self._source_text_supports_predicate(s)
417
- # ]
418
- # logger.debug(f" After predicate consistency filter: {len(consistent)} statements")
412
+ # Filter out statements where source_text doesn't support the predicate
413
+ # This catches model hallucinations where predicate doesn't match the evidence
414
+ consistent = [
415
+ s for s in filtered
416
+ if self._source_text_supports_predicate(s)
417
+ ]
418
+ logger.debug(f" After predicate consistency filter: {len(consistent)} statements")
419
419
 
420
420
  # Deduplicate - keep highest confidence for each (subject, predicate, object)
421
421
  # Note: Same subject+predicate with different objects is valid (e.g., "Apple announced X and Y")
422
422
  seen: dict[tuple[str, str, str], Statement] = {}
423
- for stmt in all_statements:
423
+ for stmt in consistent:
424
424
  key = (
425
425
  stmt.subject.text.lower(),
426
426
  stmt.predicate.lower(),
@@ -1,12 +0,0 @@
1
- statement_extractor/__init__.py,sha256=KwZfWnTB9oevTLw0TrNlYFu67qIYO-34JqDtcpjOhZI,3013
2
- statement_extractor/canonicalization.py,sha256=ZMLs6RLWJa_rOJ8XZ7PoHFU13-zeJkOMDnvK-ZaFa5s,5991
3
- statement_extractor/cli.py,sha256=FOkkihVfoROc-Biu8ICCzlLJeDScYYNLLJHnv0GCGGM,9507
4
- statement_extractor/extractor.py,sha256=d0HnCeCPybw-4jDxH_ffZ4LY9Klvqnza_wa90Bd4Q18,40074
5
- statement_extractor/gliner_extraction.py,sha256=KNs3n5-fnoUwY1wvbPwZL8j-3YVstmioJlcjp2k1FmY,10491
6
- statement_extractor/models.py,sha256=cyCQc3vlYB3qlg6-uL5Vt4odIiulKtHzz1Cyrf0lEAU,12198
7
- statement_extractor/predicate_comparer.py,sha256=jcuaBi5BYqD3TKoyj3pR9dxtX5ihfDJvjdhEd2LHCwc,26184
8
- statement_extractor/scoring.py,sha256=s_8nhavBNzPPFmGf2FyBummH4tgP7YGpXoMhl2Jh3Xw,16650
9
- corp_extractor-0.4.0.dist-info/METADATA,sha256=8f2CDtZG757kaB6XMfbBVdNSRMyS5-4Lflc_LoZCC_8,17725
10
- corp_extractor-0.4.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
11
- corp_extractor-0.4.0.dist-info/entry_points.txt,sha256=i0iKFqPIusvb-QTQ1zNnFgAqatgVah-jIhahbs5TToQ,115
12
- corp_extractor-0.4.0.dist-info/RECORD,,