corp-extractor 0.5.0-py3-none-any.whl → 0.9.3-py3-none-any.whl
This diff compares the contents of two package versions publicly released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/METADATA +228 -30
- corp_extractor-0.9.3.dist-info/RECORD +79 -0
- statement_extractor/__init__.py +1 -1
- statement_extractor/cli.py +2030 -24
- statement_extractor/data/statement_taxonomy.json +6949 -1159
- statement_extractor/database/__init__.py +52 -0
- statement_extractor/database/embeddings.py +186 -0
- statement_extractor/database/hub.py +428 -0
- statement_extractor/database/importers/__init__.py +32 -0
- statement_extractor/database/importers/companies_house.py +559 -0
- statement_extractor/database/importers/companies_house_officers.py +431 -0
- statement_extractor/database/importers/gleif.py +561 -0
- statement_extractor/database/importers/sec_edgar.py +392 -0
- statement_extractor/database/importers/sec_form4.py +512 -0
- statement_extractor/database/importers/wikidata.py +1120 -0
- statement_extractor/database/importers/wikidata_dump.py +1951 -0
- statement_extractor/database/importers/wikidata_people.py +1130 -0
- statement_extractor/database/models.py +254 -0
- statement_extractor/database/resolver.py +245 -0
- statement_extractor/database/store.py +3034 -0
- statement_extractor/document/__init__.py +62 -0
- statement_extractor/document/chunker.py +410 -0
- statement_extractor/document/context.py +171 -0
- statement_extractor/document/deduplicator.py +171 -0
- statement_extractor/document/html_extractor.py +246 -0
- statement_extractor/document/loader.py +303 -0
- statement_extractor/document/pipeline.py +388 -0
- statement_extractor/document/summarizer.py +195 -0
- statement_extractor/extractor.py +1 -1
- statement_extractor/models/__init__.py +19 -3
- statement_extractor/models/canonical.py +44 -1
- statement_extractor/models/document.py +308 -0
- statement_extractor/models/labels.py +47 -18
- statement_extractor/models/qualifiers.py +51 -3
- statement_extractor/models/statement.py +39 -15
- statement_extractor/models.py +1 -1
- statement_extractor/pipeline/config.py +6 -11
- statement_extractor/pipeline/context.py +5 -5
- statement_extractor/pipeline/orchestrator.py +90 -121
- statement_extractor/pipeline/registry.py +52 -46
- statement_extractor/plugins/__init__.py +20 -8
- statement_extractor/plugins/base.py +348 -78
- statement_extractor/plugins/extractors/gliner2.py +38 -28
- statement_extractor/plugins/labelers/taxonomy.py +18 -5
- statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
- statement_extractor/plugins/pdf/__init__.py +10 -0
- statement_extractor/plugins/pdf/pypdf.py +291 -0
- statement_extractor/plugins/qualifiers/__init__.py +11 -0
- statement_extractor/plugins/qualifiers/companies_house.py +14 -3
- statement_extractor/plugins/qualifiers/embedding_company.py +422 -0
- statement_extractor/plugins/qualifiers/gleif.py +14 -3
- statement_extractor/plugins/qualifiers/person.py +588 -14
- statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
- statement_extractor/plugins/scrapers/__init__.py +10 -0
- statement_extractor/plugins/scrapers/http.py +236 -0
- statement_extractor/plugins/splitters/t5_gemma.py +176 -75
- statement_extractor/plugins/taxonomy/embedding.py +193 -46
- statement_extractor/plugins/taxonomy/mnli.py +16 -4
- statement_extractor/scoring.py +8 -8
- corp_extractor-0.5.0.dist-info/RECORD +0 -55
- statement_extractor/plugins/canonicalizers/__init__.py +0 -17
- statement_extractor/plugins/canonicalizers/base.py +0 -9
- statement_extractor/plugins/canonicalizers/location.py +0 -219
- statement_extractor/plugins/canonicalizers/organization.py +0 -230
- statement_extractor/plugins/canonicalizers/person.py +0 -242
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/WHEEL +0 -0
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/entry_points.txt +0 -0
statement_extractor/plugins/taxonomy/embedding.py CHANGED

@@ -11,10 +11,18 @@ import json
 import logging
 import time
 from pathlib import Path
-from typing import Optional
+from typing import Optional, TypedDict

 import numpy as np

+
+class TaxonomyEntry(TypedDict):
+    """Structure for each taxonomy label entry."""
+    description: str
+    id: int
+    mnli_label: str
+    embedding_label: str
+
 from ..base import BaseTaxonomyPlugin, TaxonomySchema, PluginCapability
 from ...pipeline.context import PipelineContext
 from ...pipeline.registry import PluginRegistry
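The `TaxonomyEntry` TypedDict above gives a static shape to the taxonomy JSON entries that both classifiers consume. A minimal sketch of a dict matching the `dict[str, dict[str, TaxonomyEntry]]` annotation used throughout this diff (the category and label below are hypothetical, not taken from the package's taxonomy):

```python
from typing import TypedDict


class TaxonomyEntry(TypedDict):
    """Structure for each taxonomy label entry."""
    description: str
    id: int
    mnli_label: str
    embedding_label: str


# Hypothetical taxonomy mapping category -> label -> entry, mirroring
# the dict[str, dict[str, TaxonomyEntry]] annotation in the diff.
taxonomy: dict[str, dict[str, TaxonomyEntry]] = {
    "environment": {
        "emissions_reduction": TaxonomyEntry(
            description="Cutting greenhouse gas emissions",
            id=101,
            mnli_label="this text is about reducing emissions",
            embedding_label="greenhouse gas emissions reduction",
        ),
    },
}

entry = taxonomy["environment"]["emissions_reduction"]
print(entry["id"])  # 101 -- plain dict at runtime; TypedDict only adds static checks
```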
@@ -96,7 +104,7 @@ class EmbeddingClassifier:

     def precompute_label_embeddings(
         self,
-        taxonomy: dict[str, dict[str,
+        taxonomy: dict[str, dict[str, TaxonomyEntry]],
         categories: Optional[list[str]] = None,
     ) -> None:
         """Pre-compute embeddings for all label names."""
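`precompute_label_embeddings` now takes the typed taxonomy. Conceptually, precomputation encodes each `embedding_label` once and caches unit-normalized vectors so that later classification reduces to dot products. A rough sketch with sentence-transformers (the model name and dict layout are assumptions, not the package's internals):

```python
# Sketch of precomputing normalized label embeddings; the model name
# and taxonomy layout are illustrative assumptions.
import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")  # placeholder model

taxonomy = {
    "environment": {
        "emissions_reduction": {"embedding_label": "greenhouse gas emissions reduction"},
    },
}

label_embeddings: dict[str, dict[str, np.ndarray]] = {}
for category, labels in taxonomy.items():
    names = [entry["embedding_label"] for entry in labels.values()]
    vecs = model.encode(names, convert_to_numpy=True, show_progress_bar=False)
    # Unit-normalize so cosine similarity becomes a plain dot product
    vecs = vecs / (np.linalg.norm(vecs, axis=1, keepdims=True) + 1e-8)
    label_embeddings[category] = dict(zip(labels.keys(), vecs))
```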
@@ -137,68 +145,127 @@ class EmbeddingClassifier:
         exponent = -self.CALIBRATION_STEEPNESS * (normalized - self.SIMILARITY_THRESHOLD)
         return 1.0 / (1.0 + np.exp(exponent))

-    def classify_hierarchical(
+    def encode_batch(self, texts: list[str]) -> np.ndarray:
+        """
+        Encode multiple texts into normalized embeddings in a single batch.
+
+        Uses caching to avoid re-encoding previously seen texts.
+
+        Args:
+            texts: List of texts to encode
+
+        Returns:
+            2D numpy array of shape (len(texts), embedding_dim) with normalized embeddings
+        """
+        self._load_model()
+
+        # Separate cached from uncached texts
+        uncached_indices = []
+        uncached_texts = []
+        for i, text in enumerate(texts):
+            if text not in self._text_embedding_cache:
+                uncached_indices.append(i)
+                uncached_texts.append(text)
+
+        # Batch encode uncached texts
+        if uncached_texts:
+            embeddings = self._model.encode(uncached_texts, convert_to_numpy=True, show_progress_bar=False)
+            for i, (text, embedding) in enumerate(zip(uncached_texts, embeddings)):
+                norm = np.linalg.norm(embedding)
+                normalized = (embedding / (norm + 1e-8)).astype(np.float32)
+                self._text_embedding_cache[text] = normalized
+
+            logger.debug(f"Batch encoded {len(uncached_texts)} texts (cache size: {len(self._text_embedding_cache)})")
+
+        # Build result array from cache
+        result = np.stack([self._text_embedding_cache[text] for text in texts])
+        return result
+
+    def classify_batch(
         self,
-        text: str,
+        texts: list[str],
         top_k_categories: int = 3,
         min_score: float = 0.3,
-    ) -> list[tuple[str, str, float]]:
-        """
-        Hierarchical classification: find categories, then all labels above threshold.
-        Returns all labels above the threshold, not just the best match.
+    ) -> list[list[tuple[str, str, float]]]:
+        """
+        Classify multiple texts in a single batch for efficiency.

         Args:
-            text: Text to classify
-            top_k_categories: Number of top categories to consider
+            texts: List of texts to classify
+            top_k_categories: Number of top categories to consider per text
             min_score: Minimum calibrated score to include in results

         Returns:
-            List of (category, label, confidence) tuples above threshold
+            List of classification results, one list per input text
         """
+        if not texts:
+            return []
+
         self._load_model()

         if not self._label_embeddings:
             raise RuntimeError("Label embeddings not pre-computed.")

-        #
-        if text in self._text_embedding_cache:
-            input_normalized = self._text_embedding_cache[text]
-        else:
-            input_embedding = self._model.encode(text, convert_to_numpy=True, show_progress_bar=False)
-            input_norm = np.linalg.norm(input_embedding)
-            input_normalized = (input_embedding / (input_norm + 1e-8)).astype(np.float32)
-            self._text_embedding_cache[text] = input_normalized
-            logger.debug(f"Cached embedding for text: '{text[:50]}...' (cache size: {len(self._text_embedding_cache)})")
-
-        # Compute average similarity to each category
-        category_scores: list[tuple[str, float]] = []
-        for category, labels in self._label_embeddings.items():
-            if not labels:
-                continue
+        # Batch encode all texts
+        input_embeddings = self.encode_batch(texts)

-            sims = []
-            for label_embedding in labels.values():
-                sim = float(np.dot(input_normalized, label_embedding))
-                sims.append(sim)
+        # Prepare label embeddings as matrices for vectorized similarity
+        all_results: list[list[tuple[str, str, float]]] = []

-            avg_sim = np.mean(sims)
-            category_scores.append((category, avg_sim))
+        for input_normalized in input_embeddings:
+            # Compute average similarity to each category
+            category_scores: list[tuple[str, float]] = []
+            for category, labels in self._label_embeddings.items():
+                if not labels:
+                    continue

-        category_scores.sort(key=lambda x: x[1], reverse=True)
+                sims = []
+                for label_embedding in labels.values():
+                    sim = float(np.dot(input_normalized, label_embedding))
+                    sims.append(sim)

-        results: list[tuple[str, str, float]] = []
+                avg_sim = np.mean(sims)
+                category_scores.append((category, avg_sim))

-        for category, _ in category_scores[:top_k_categories]:
-            for label, label_embedding in self._label_embeddings[category].items():
-                raw_sim = float(np.dot(input_normalized, label_embedding))
-                calibrated_score = self._calibrate_score(raw_sim)
+            category_scores.sort(key=lambda x: x[1], reverse=True)

-                if calibrated_score >= min_score:
-                    results.append((category, label, calibrated_score))
+            results: list[tuple[str, str, float]] = []

-        # Sort by confidence descending
-        results.sort(key=lambda x: x[2], reverse=True)
-        return results
+            for category, _ in category_scores[:top_k_categories]:
+                for label, label_embedding in self._label_embeddings[category].items():
+                    raw_sim = float(np.dot(input_normalized, label_embedding))
+                    calibrated_score = self._calibrate_score(raw_sim)
+
+                    if calibrated_score >= min_score:
+                        results.append((category, label, calibrated_score))
+
+            # Sort by confidence descending
+            results.sort(key=lambda x: x[2], reverse=True)
+            all_results.append(results)
+
+        return all_results
+
+    def classify_hierarchical(
+        self,
+        text: str,
+        top_k_categories: int = 3,
+        min_score: float = 0.3,
+    ) -> list[tuple[str, str, float]]:
+        """Hierarchical classification: find categories, then all labels above threshold.
+
+        Returns all labels above the threshold, not just the best match.
+
+        Args:
+            text: Text to classify
+            top_k_categories: Number of top categories to consider
+            min_score: Minimum calibrated score to include in results
+
+        Returns:
+            List of (category, label, confidence) tuples above threshold
+        """
+        # Use batch method for single text
+        results = self.classify_batch([text], top_k_categories, min_score)
+        return results[0] if results else []


 @PluginRegistry.taxonomy
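Both the old single-text path and the new `classify_batch` funnel raw cosine similarities through `_calibrate_score`, the logistic curve visible at the top of this hunk. A standalone sketch of that calibration, with placeholder constants since the class's actual `CALIBRATION_STEEPNESS` and `SIMILARITY_THRESHOLD` values are not shown in the diff:

```python
import numpy as np

# Illustrative placeholder constants, not the package's actual values.
CALIBRATION_STEEPNESS = 10.0
SIMILARITY_THRESHOLD = 0.5


def calibrate_score(raw_sim: float) -> float:
    """Map a raw cosine similarity to (0, 1) with a logistic curve.

    Similarities at the threshold map to 0.5; the steepness controls how
    sharply scores saturate toward 0 or 1 on either side of it.
    """
    exponent = -CALIBRATION_STEEPNESS * (raw_sim - SIMILARITY_THRESHOLD)
    return 1.0 / (1.0 + np.exp(exponent))


print(calibrate_score(0.50))  # 0.5 exactly at the threshold
print(calibrate_score(0.72))  # well above the threshold -> about 0.9
```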
@@ -223,7 +290,7 @@ class EmbeddingTaxonomyClassifier(BaseTaxonomyPlugin):
         self._top_k_categories = top_k_categories
         self._min_confidence = min_confidence

-        self._taxonomy: Optional[dict[str, dict[str,
+        self._taxonomy: Optional[dict[str, dict[str, TaxonomyEntry]]] = None
         self._classifier: Optional[EmbeddingClassifier] = None
         self._embeddings_computed = False
@@ -243,6 +310,16 @@ class EmbeddingTaxonomyClassifier(BaseTaxonomyPlugin):
     def description(self) -> str:
         return "Classifies statements using embedding similarity (faster than MNLI)"

+    @property
+    def model_vram_gb(self) -> float:
+        """EmbeddingGemma model weights ~1.2GB."""
+        return 1.2
+
+    @property
+    def per_item_vram_gb(self) -> float:
+        """Each text embedding ~0.05GB (embeddings are small)."""
+        return 0.05
+
     @property
     def taxonomy_name(self) -> str:
         return "esg_topics_embedding"
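The two new properties advertise a fixed model cost and a marginal per-item cost, presumably so the pipeline can budget GPU memory when sizing batches. A sketch of that kind of budgeting; the `VRAMAware` protocol and `max_batch_size` helper are illustrative assumptions, not the package's actual API:

```python
from typing import Protocol


class VRAMAware(Protocol):
    """Hypothetical protocol matching the two properties added above."""
    @property
    def model_vram_gb(self) -> float: ...
    @property
    def per_item_vram_gb(self) -> float: ...


def max_batch_size(plugin: VRAMAware, free_vram_gb: float, cap: int = 256) -> int:
    """Budget: model weights are a fixed cost; each item adds a marginal cost."""
    headroom = free_vram_gb - plugin.model_vram_gb
    if headroom <= 0:
        return 1  # not enough room for the weights plus a batch; go one at a time
    return max(1, min(cap, int(headroom / plugin.per_item_vram_gb)))
```

With the values in this diff (1.2 GB fixed, 0.05 GB per item), an 8 GB budget would allow a batch of 136 under this scheme.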
@@ -262,7 +339,7 @@ class EmbeddingTaxonomyClassifier(BaseTaxonomyPlugin):
     def supported_categories(self) -> list[str]:
         return self._categories.copy()

-    def _load_taxonomy(self) -> dict[str, dict[str,
+    def _load_taxonomy(self) -> dict[str, dict[str, TaxonomyEntry]]:
         if self._taxonomy is not None:
             return self._taxonomy
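`_load_taxonomy` follows a parse-once, cache-thereafter pattern. A sketch of that pattern in isolation, assuming the taxonomy ships as the `statement_extractor/data/statement_taxonomy.json` file listed above (the module-level cache here is illustrative; the real method caches on `self._taxonomy`):

```python
import json
from pathlib import Path
from typing import Optional

# Path taken from the package's file list; the JSON layout is assumed
# to be {category: {label: entry}} as the type annotations suggest.
_TAXONOMY_PATH = Path("statement_extractor/data/statement_taxonomy.json")
_taxonomy_cache: Optional[dict] = None


def load_taxonomy() -> dict:
    global _taxonomy_cache
    if _taxonomy_cache is not None:
        return _taxonomy_cache  # parse the JSON once, reuse thereafter
    with _TAXONOMY_PATH.open(encoding="utf-8") as f:
        _taxonomy_cache = json.load(f)
    return _taxonomy_cache
```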
@@ -329,9 +406,79 @@ class EmbeddingTaxonomyClassifier(BaseTaxonomyPlugin):
     def _get_label_id(self, category: str, label: str) -> Optional[int]:
         taxonomy = self._load_taxonomy()
         if category in taxonomy:
-
+            entry = taxonomy[category].get(label)
+            if entry:
+                return entry.get("id")
         return None

+    def classify_batch(
+        self,
+        items: list[tuple[PipelineStatement, CanonicalEntity, CanonicalEntity]],
+        context: PipelineContext,
+    ) -> list[list[TaxonomyResult]]:
+        """
+        Classify multiple statements in a single batch for efficiency.
+
+        Batch encodes all source texts, then classifies each against the taxonomy.
+
+        Args:
+            items: List of (statement, subject_canonical, object_canonical) tuples
+            context: Pipeline context
+
+        Returns:
+            List of TaxonomyResult lists, one per input statement
+        """
+        if not items:
+            return []
+
+        # Extract unique source texts (may have duplicates across statements)
+        texts = [stmt.source_text for stmt, _, _ in items]
+        unique_texts = list(set(texts))
+
+        logger.info(f"Batch classifying {len(items)} statements ({len(unique_texts)} unique texts)")
+
+        try:
+            classifier = self._get_classifier()
+
+            # Batch classify all unique texts
+            batch_results = classifier.classify_batch(
+                unique_texts,
+                top_k_categories=self._top_k_categories,
+                min_score=self._min_confidence,
+            )
+
+            # Map unique texts to their classifications
+            text_to_results: dict[str, list[tuple[str, str, float]]] = {
+                text: results for text, results in zip(unique_texts, batch_results)
+            }
+
+            # Build results for each input statement
+            all_results: list[list[TaxonomyResult]] = []
+            for stmt, _, _ in items:
+                classifications = text_to_results.get(stmt.source_text, [])
+
+                results: list[TaxonomyResult] = []
+                for category, label, confidence in classifications:
+                    label_id = self._get_label_id(category, label)
+
+                    results.append(TaxonomyResult(
+                        taxonomy_name=self.taxonomy_name,
+                        category=category,
+                        label=label,
+                        label_id=label_id,
+                        confidence=round(confidence, 4),
+                        classifier=self.name,
+                    ))
+
+                all_results.append(results)
+
+            return all_results
+
+        except Exception as e:
+            logger.warning(f"Batch taxonomy classification failed: {e}")
+            # Return empty results for all items
+            return [[] for _ in items]
+

 # For testing without decorator
 EmbeddingTaxonomyClassifierClass = EmbeddingTaxonomyClassifier
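The plugin-level `classify_batch` above classifies each distinct `source_text` once and fans the results back out to every statement that shares it. That pattern is easy to isolate; a minimal sketch with a toy classifier standing in for `EmbeddingClassifier` (the names here are hypothetical):

```python
from typing import Callable


def classify_unique(
    texts: list[str],
    classify: Callable[[list[str]], list[list[tuple[str, str, float]]]],
) -> list[list[tuple[str, str, float]]]:
    """Classify each distinct text once, then fan results back out by text."""
    unique_texts = list(set(texts))
    by_text = dict(zip(unique_texts, classify(unique_texts)))
    return [by_text.get(t, []) for t in texts]


def toy_classifier(batch: list[str]) -> list[list[tuple[str, str, float]]]:
    # Stand-in: every text gets one fixed (category, label, confidence) result.
    return [[("category", "label", 0.9)] for _ in batch]


print(classify_unique(["a", "b", "a"], toy_classifier))
# The result for "a" is computed once but appears twice in the output.
```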
statement_extractor/plugins/taxonomy/mnli.py CHANGED

@@ -8,9 +8,19 @@ where there are too many possible values for simple multi-choice classification.
 import json
 import logging
 from pathlib import Path
-from typing import Optional
+from typing import Optional, TypedDict

 from ..base import BaseTaxonomyPlugin, TaxonomySchema, PluginCapability
+
+
+class TaxonomyEntry(TypedDict):
+    """Structure for each taxonomy label entry."""
+    description: str
+    id: int
+    mnli_label: str
+    embedding_label: str
+
+
 from ...pipeline.context import PipelineContext
 from ...pipeline.registry import PluginRegistry
 from ...models import (

@@ -160,7 +170,7 @@ class MNLITaxonomyClassifier(BaseTaxonomyPlugin):
         self._top_k_categories = top_k_categories
         self._min_confidence = min_confidence

-        self._taxonomy: Optional[dict[str, dict[str,
+        self._taxonomy: Optional[dict[str, dict[str, TaxonomyEntry]]] = None
         self._classifier: Optional[MNLIClassifier] = None

     @property

@@ -198,7 +208,7 @@ class MNLITaxonomyClassifier(BaseTaxonomyPlugin):
     def supported_categories(self) -> list[str]:
         return self._categories.copy()

-    def _load_taxonomy(self) -> dict[str, dict[str,
+    def _load_taxonomy(self) -> dict[str, dict[str, TaxonomyEntry]]:
         """Load taxonomy from JSON file."""
         if self._taxonomy is not None:
             return self._taxonomy

@@ -271,7 +281,9 @@ class MNLITaxonomyClassifier(BaseTaxonomyPlugin):
     def _get_label_id(self, category: str, label: str) -> Optional[int]:
         taxonomy = self._load_taxonomy()
         if category in taxonomy:
-
+            entry = taxonomy[category].get(label)
+            if entry:
+                return entry.get("id")
         return None

statement_extractor/scoring.py CHANGED

@@ -409,18 +409,18 @@ class BeamScorer:
         filtered = [s for s in all_statements if (s.confidence_score or 0) >= min_conf]
         logger.debug(f"  After confidence filter (>={min_conf}): {len(filtered)} statements")

-        #
-        #
-
-
-
-
-
+        # Filter out statements where source_text doesn't support the predicate
+        # This catches model hallucinations where predicate doesn't match the evidence
+        consistent = [
+            s for s in filtered
+            if self._source_text_supports_predicate(s)
+        ]
+        logger.debug(f"  After predicate consistency filter: {len(consistent)} statements")

         # Deduplicate - keep highest confidence for each (subject, predicate, object)
         # Note: Same subject+predicate with different objects is valid (e.g., "Apple announced X and Y")
         seen: dict[tuple[str, str, str], Statement] = {}
-        for stmt in
+        for stmt in consistent:
             key = (
                 stmt.subject.text.lower(),
                 stmt.predicate.lower(),
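The deduplication step keeps the highest-confidence statement per (subject, predicate, object) key, per the comment in the hunk; the diff only shows the key being built, so the sketch below fills in the keep-highest rule it describes, with toy stand-ins for the package's `Statement` model:

```python
from dataclasses import dataclass


# Toy stand-ins for the package's models, just to make the pattern runnable;
# the real Statement presumably names its object field differently.
@dataclass
class Span:
    text: str


@dataclass
class Statement:
    subject: Span
    predicate: str
    obj: Span
    confidence_score: float


def dedupe(statements: list[Statement]) -> list[Statement]:
    """Keep the highest-confidence statement per (subject, predicate, object)."""
    seen: dict[tuple[str, str, str], Statement] = {}
    for stmt in statements:
        key = (stmt.subject.text.lower(), stmt.predicate.lower(), stmt.obj.text.lower())
        if key not in seen or stmt.confidence_score > seen[key].confidence_score:
            seen[key] = stmt
    return list(seen.values())
```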
corp_extractor-0.5.0.dist-info/RECORD DELETED

@@ -1,55 +0,0 @@
-statement_extractor/__init__.py,sha256=Lmgw3jtwrfu09mXSfNFCB5AN0J6tsEQ2uOrrQciMrtI,3215
-statement_extractor/canonicalization.py,sha256=ZMLs6RLWJa_rOJ8XZ7PoHFU13-zeJkOMDnvK-ZaFa5s,5991
-statement_extractor/cli.py,sha256=iqsqvLAN0FMRoE4KskEoW-4DE5_7Tll8xeHA1t04KJg,25028
-statement_extractor/extractor.py,sha256=CGJCmAMiIoDsPtjIdvOHYBcz8058eYpfLMngjELMJhI,38403
-statement_extractor/gliner_extraction.py,sha256=OL4w-0_rZc6XCojaVsbGY4VdIXRJ6j8ZmeUeTOL0Ue0,8118
-statement_extractor/llm.py,sha256=1eBrYs-bUPqzbpiiif_hH-gE_DeM-l3cmddrIoLHFXo,8010
-statement_extractor/models.py,sha256=fXTT7qxPqynnrrpb77nCgs3K2yn_YgbSugSXv12boX4,12312
-statement_extractor/predicate_comparer.py,sha256=jcuaBi5BYqD3TKoyj3pR9dxtX5ihfDJvjdhEd2LHCwc,26184
-statement_extractor/scoring.py,sha256=s_8nhavBNzPPFmGf2FyBummH4tgP7YGpXoMhl2Jh3Xw,16650
-statement_extractor/data/default_predicates.json,sha256=7rhFXWHvStDj4sLYfBXKS50xGChPaqMKUaKTkMEJRGk,32870
-statement_extractor/data/statement_taxonomy.json,sha256=XhCeVBC4aQB-7NR40Niu4yN2BmL0c2Gd-RKkUpsYK24,37981
-statement_extractor/models/__init__.py,sha256=gjTu450FPe9dvhIVQXqBwF8u0hgSnPORGXzxmSEuCnM,2564
-statement_extractor/models/canonical.py,sha256=ld6z6RtK03iOs_aUk8Rftcm0pUoaFpLUfyfbKI26N_o,4354
-statement_extractor/models/entity.py,sha256=l2ny91BnnWwPo9zx1_Fb8WMKPNuIQFN0H7ILncylmcY,3214
-statement_extractor/models/labels.py,sha256=e-mFDuzb42oJ69gLZTWCdg5_MNqRftQ2La5x8y9Cv-Y,6236
-statement_extractor/models/qualifiers.py,sha256=YkvyWh2p1fK5iMRDC2Dq1r-XJOmJ1rvWFTFUIkQ9zcc,3495
-statement_extractor/models/statement.py,sha256=cOgabA7IJxHYjlH5AksJRNf2Rv5VScMPqZdfjQyXRN0,2733
-statement_extractor/pipeline/__init__.py,sha256=Q3M2Arx9BWH_APZxM-P0G-C3ISguG1whiA5QhxDHQCA,1071
-statement_extractor/pipeline/config.py,sha256=rxZN27OWp05F-NaatwrYkjp56zbzHZ0hMtNU1mvBxgw,4130
-statement_extractor/pipeline/context.py,sha256=wURDYtzDrmbHu40Af_C_oTtN55wnULKHNZjUx6O8t-0,6126
-statement_extractor/pipeline/orchestrator.py,sha256=oHegnsDzXj87q8iAoi-QZj2ZyB1rX5qmg57BdIjvKo0,17617
-statement_extractor/pipeline/registry.py,sha256=qj5M5tMm9GmNCguy8dWBXMT8XmhemiZjJMktZsRlevw,11415
-statement_extractor/plugins/__init__.py,sha256=8k3lQGQNQSMUzxCmk4nAH8dIc1DqEnMyiqHlZZv81q0,1099
-statement_extractor/plugins/base.py,sha256=GZ4WT5S2mH3C_uN6nyBz-nGlAn_Z2o2A51FSRu6gCEo,12797
-statement_extractor/plugins/canonicalizers/__init__.py,sha256=LDb9NodyuLSoLzrLnNzMeviK79GHnyaLGU0J_02BBgM,421
-statement_extractor/plugins/canonicalizers/base.py,sha256=dbreQuEPB48eBJmah7hpl67azVU4QLhbvSrjXr0vT88,195
-statement_extractor/plugins/canonicalizers/location.py,sha256=Rz5SCM4bb0p0gsnHPzsQJv-RN59yoj9Z1NmF8yLQNv0,6590
-statement_extractor/plugins/canonicalizers/organization.py,sha256=L-mhdctkRXuu84RsNHp80M_tDIiMumYaHAG6WfxpH4c,7482
-statement_extractor/plugins/canonicalizers/person.py,sha256=Nw8FuJOBmg-cTaOTd2BJ1TZtydprfzIKL25wJa_VJek,6944
-statement_extractor/plugins/extractors/__init__.py,sha256=sqxTI7WwDLVQKwOiQXqWS72gjJnwb76Gs9N3LGetBnI,253
-statement_extractor/plugins/extractors/base.py,sha256=kNRsQ7BL84lXPXREm7CihrprDUaFwDDvMpBcbZlwSGA,179
-statement_extractor/plugins/extractors/gliner2.py,sha256=rgfY8l9v8EWCxfB3g6hLnmLCIekTBkfWMG8dgSAZu-E,21627
-statement_extractor/plugins/labelers/__init__.py,sha256=flHEoBvnzQ3vAKkIUHyezpYi2H3KJvYGRerCVnc80r0,965
-statement_extractor/plugins/labelers/base.py,sha256=hIgJKq2LU00OcL0Zjy1L9hP8K2onlM_xtZ63XcH8qDE,171
-statement_extractor/plugins/labelers/confidence.py,sha256=XiXjBYe-8ch_SCKnz0sAwTT1mJ_XKMsuzXBbwAW_OK0,4083
-statement_extractor/plugins/labelers/relation_type.py,sha256=e5ASwVqJGMSCrx5GtyNk85q_-19D7W_4jI-J-Pv_kxY,2506
-statement_extractor/plugins/labelers/sentiment.py,sha256=nlWv9ymb7hlDIcFa-gjbIvZlJY1VrHrXhKMD-udmIzM,5027
-statement_extractor/plugins/labelers/taxonomy.py,sha256=jQp5emgWf6XgmOx7arh-owF_-TjVxiPKSJ2OGkTPbBs,12427
-statement_extractor/plugins/labelers/taxonomy_embedding.py,sha256=grvC_R_sg05hR6l0DgaELy2wmf6OkbvV1pRuNU0FVk4,16027
-statement_extractor/plugins/qualifiers/__init__.py,sha256=kefjGunlVDKLy2NXmtr5ZXyYi-swyQdPLkB-tHV_0vk,495
-statement_extractor/plugins/qualifiers/base.py,sha256=Kx--OdIh77mnjSkTl1NvUeekItRiG8AnBUcuznOZeBI,179
-statement_extractor/plugins/qualifiers/companies_house.py,sha256=_6ExJCjD0V4eZNYXtfBY99obqLpRaSv-G-V7N6R1wLg,5376
-statement_extractor/plugins/qualifiers/gleif.py,sha256=WZqcNT_Yq4yVe4rdkWO59C9yZ4geV2ZTDk9wxLlOeTg,5645
-statement_extractor/plugins/qualifiers/person.py,sha256=si_9CLjHsH9jYFugej4t0HMnsivclh-Yi70U6NglfIU,7101
-statement_extractor/plugins/qualifiers/sec_edgar.py,sha256=3XDbizlR9YQgLrC7p-owV8Td-3TYaJlMb4B7saha3vw,6288
-statement_extractor/plugins/splitters/__init__.py,sha256=05CYeAEO0lZsapK5pjxZJbOCLI1kjeK6IQjftxqqg5g,224
-statement_extractor/plugins/splitters/base.py,sha256=GeIBchFTr8icRSfYR8bGSb4-GoEZ1N0IGN6Kl5W2mL0,175
-statement_extractor/plugins/splitters/t5_gemma.py,sha256=8joOzlMKXhSyJaq5c3F8t-gdPcZEDiVAzNcMlgJAqsE,6733
-statement_extractor/plugins/taxonomy/__init__.py,sha256=8N0tW7pm95DSCqM-s99ea0Tigbi9bZMyTkKblR1qmLw,307
-statement_extractor/plugins/taxonomy/embedding.py,sha256=QW1RR07JoE8Ah97gDZ_w_ATEe6-z2t2nl1zeTDAgFjM,11347
-statement_extractor/plugins/taxonomy/mnli.py,sha256=IzLjHXUFgVAgEvYI5EzOBs19UxvpcbJa8HjqI__tYII,8905
-corp_extractor-0.5.0.dist-info/METADATA,sha256=H4Z8ExZFdbknpHg-EZ1P9B137hCPwKXBezHSF7X9EOE,21567
-corp_extractor-0.5.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-corp_extractor-0.5.0.dist-info/entry_points.txt,sha256=i0iKFqPIusvb-QTQ1zNnFgAqatgVah-jIhahbs5TToQ,115
-corp_extractor-0.5.0.dist-info/RECORD,,
statement_extractor/plugins/canonicalizers/__init__.py DELETED

@@ -1,17 +0,0 @@
-"""
-Canonicalizer plugins for Stage 4 (Canonicalization).
-
-Resolves entities to their canonical forms.
-"""
-
-from .base import BaseCanonicalizerPlugin
-from .organization import OrganizationCanonicalizer
-from .person import PersonCanonicalizer
-from .location import LocationCanonicalizer
-
-__all__ = [
-    "BaseCanonicalizerPlugin",
-    "OrganizationCanonicalizer",
-    "PersonCanonicalizer",
-    "LocationCanonicalizer",
-]