PyPI - corp-extractor - Versions diffs - 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl - Mend

corp-extractor 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (53) hide show

{corp_extractor-0.4.0.dist-info → corp_extractor-0.5.0.dist-info}/METADATA +181 -64
corp_extractor-0.5.0.dist-info/RECORD +55 -0
statement_extractor/__init__.py +9 -0
statement_extractor/cli.py +446 -17
statement_extractor/data/default_predicates.json +368 -0
statement_extractor/data/statement_taxonomy.json +1182 -0
statement_extractor/extractor.py +1 -23
statement_extractor/gliner_extraction.py +4 -74
statement_extractor/llm.py +255 -0
statement_extractor/models/__init__.py +74 -0
statement_extractor/models/canonical.py +139 -0
statement_extractor/models/entity.py +102 -0
statement_extractor/models/labels.py +191 -0
statement_extractor/models/qualifiers.py +91 -0
statement_extractor/models/statement.py +75 -0
statement_extractor/models.py +4 -1
statement_extractor/pipeline/__init__.py +39 -0
statement_extractor/pipeline/config.py +134 -0
statement_extractor/pipeline/context.py +177 -0
statement_extractor/pipeline/orchestrator.py +447 -0
statement_extractor/pipeline/registry.py +297 -0
statement_extractor/plugins/__init__.py +43 -0
statement_extractor/plugins/base.py +446 -0
statement_extractor/plugins/canonicalizers/__init__.py +17 -0
statement_extractor/plugins/canonicalizers/base.py +9 -0
statement_extractor/plugins/canonicalizers/location.py +219 -0
statement_extractor/plugins/canonicalizers/organization.py +230 -0
statement_extractor/plugins/canonicalizers/person.py +242 -0
statement_extractor/plugins/extractors/__init__.py +13 -0
statement_extractor/plugins/extractors/base.py +9 -0
statement_extractor/plugins/extractors/gliner2.py +536 -0
statement_extractor/plugins/labelers/__init__.py +29 -0
statement_extractor/plugins/labelers/base.py +9 -0
statement_extractor/plugins/labelers/confidence.py +138 -0
statement_extractor/plugins/labelers/relation_type.py +87 -0
statement_extractor/plugins/labelers/sentiment.py +159 -0
statement_extractor/plugins/labelers/taxonomy.py +373 -0
statement_extractor/plugins/labelers/taxonomy_embedding.py +466 -0
statement_extractor/plugins/qualifiers/__init__.py +19 -0
statement_extractor/plugins/qualifiers/base.py +9 -0
statement_extractor/plugins/qualifiers/companies_house.py +174 -0
statement_extractor/plugins/qualifiers/gleif.py +186 -0
statement_extractor/plugins/qualifiers/person.py +221 -0
statement_extractor/plugins/qualifiers/sec_edgar.py +198 -0
statement_extractor/plugins/splitters/__init__.py +13 -0
statement_extractor/plugins/splitters/base.py +9 -0
statement_extractor/plugins/splitters/t5_gemma.py +188 -0
statement_extractor/plugins/taxonomy/__init__.py +13 -0
statement_extractor/plugins/taxonomy/embedding.py +337 -0
statement_extractor/plugins/taxonomy/mnli.py +279 -0
corp_extractor-0.4.0.dist-info/RECORD +0 -12
{corp_extractor-0.4.0.dist-info → corp_extractor-0.5.0.dist-info}/WHEEL +0 -0
{corp_extractor-0.4.0.dist-info → corp_extractor-0.5.0.dist-info}/entry_points.txt +0 -0

statement_extractor/plugins/qualifiers/gleif.py ADDED Viewed

@@ -0,0 +1,186 @@
+"""
+GLEIFQualifierPlugin - Qualifies ORG entities with LEI and related data.
+Uses the GLEIF (Global Legal Entity Identifier Foundation) API to:
+- Look up LEI by organization name
+- Retrieve legal name, jurisdiction, parent company info
+"""
+import logging
+from typing import Optional
+from urllib.parse import quote
+from ..base import BaseQualifierPlugin, PluginCapability
+from ...pipeline.context import PipelineContext
+from ...pipeline.registry import PluginRegistry
+from ...models import ExtractedEntity, EntityQualifiers, EntityType
+logger = logging.getLogger(__name__)
+# GLEIF API base URL
+GLEIF_API_BASE = "https://api.gleif.org/api/v1"
+@PluginRegistry.qualifier
+class GLEIFQualifierPlugin(BaseQualifierPlugin):
+    """
+    Qualifier plugin for ORG entities using GLEIF API.
+    Looks up Legal Entity Identifiers (LEI) and related corporate data.
+    """
+    def __init__(
+        self,
+        timeout: int = 10,
+        cache_results: bool = True,
+    ):
+        """
+        Initialize the GLEIF qualifier.
+        Args:
+            timeout: API request timeout in seconds
+            cache_results: Whether to cache API results
+        """
+        self._timeout = timeout
+        self._cache_results = cache_results
+        self._cache: dict[str, Optional[dict]] = {}
+    @property
+    def name(self) -> str:
+        return "gleif_qualifier"
+    @property
+    def priority(self) -> int:
+        return 10  # High priority for ORG entities
+    @property
+    def capabilities(self) -> PluginCapability:
+        return PluginCapability.EXTERNAL_API | PluginCapability.CACHING
+    @property
+    def description(self) -> str:
+        return "Looks up LEI and corporate data from GLEIF API"
+    @property
+    def supported_entity_types(self) -> set[EntityType]:
+        return {EntityType.ORG}
+    @property
+    def supported_identifier_types(self) -> list[str]:
+        return ["lei"]  # Can lookup by existing LEI
+    @property
+    def provided_identifier_types(self) -> list[str]:
+        return ["lei"]  # Provides LEI
+    def qualify(
+        self,
+        entity: ExtractedEntity,
+        context: PipelineContext,
+    ) -> Optional[EntityQualifiers]:
+        """
+        Qualify an ORG entity with GLEIF data.
+        Args:
+            entity: The ORG entity to qualify
+            context: Pipeline context
+        Returns:
+            EntityQualifiers with LEI and jurisdiction, or None if not found
+        """
+        if entity.type != EntityType.ORG:
+            return None
+        # Check cache first
+        cache_key = entity.text.lower().strip()
+        if self._cache_results and cache_key in self._cache:
+            cached = self._cache[cache_key]
+            if cached is None:
+                return None
+            return self._data_to_qualifiers(cached)
+        # Search GLEIF API
+        result = self._search_gleif(entity.text)
+        # Cache result
+        if self._cache_results:
+            self._cache[cache_key] = result
+        if result:
+            return self._data_to_qualifiers(result)
+        return None
+    def _search_gleif(self, org_name: str) -> Optional[dict]:
+        """Search GLEIF API for organization."""
+        try:
+            import requests
+            # Fuzzy name search
+            url = f"{GLEIF_API_BASE}/lei-records"
+            params = {
+                "filter[entity.legalName]": org_name,
+                "page[size]": 5,
+            }
+            response = requests.get(url, params=params, timeout=self._timeout)
+            response.raise_for_status()
+            data = response.json()
+            records = data.get("data", [])
+            if not records:
+                # Try fulltext search as fallback
+                params = {
+                    "filter[fulltext]": org_name,
+                    "page[size]": 5,
+                }
+                response = requests.get(url, params=params, timeout=self._timeout)
+                response.raise_for_status()
+                data = response.json()
+                records = data.get("data", [])
+            if records:
+                # Return first match
+                record = records[0]
+                return self._parse_lei_record(record)
+        except ImportError:
+            logger.warning("requests library not available for GLEIF API")
+        except Exception as e:
+            logger.debug(f"GLEIF API error: {e}")
+        return None
+    def _parse_lei_record(self, record: dict) -> dict:
+        """Parse a GLEIF LEI record into a simplified dict."""
+        attrs = record.get("attributes", {})
+        entity = attrs.get("entity", {})
+        legal_name = entity.get("legalName", {}).get("name", "")
+        legal_address = entity.get("legalAddress", {})
+        jurisdiction = entity.get("jurisdiction", "")
+        return {
+            "lei": record.get("id", ""),
+            "legal_name": legal_name,
+            "jurisdiction": jurisdiction,
+            "country": legal_address.get("country", ""),
+            "city": legal_address.get("city", ""),
+            "status": attrs.get("registration", {}).get("status", ""),
+        }
+    def _data_to_qualifiers(self, data: dict) -> EntityQualifiers:
+        """Convert GLEIF data to EntityQualifiers."""
+        identifiers = {}
+        if data.get("lei"):
+            identifiers["lei"] = data["lei"]
+        return EntityQualifiers(
+            jurisdiction=data.get("jurisdiction"),
+            country=data.get("country"),
+            city=data.get("city"),
+            identifiers=identifiers,
+        )
+# Allow importing without decorator for testing
+GLEIFQualifierPluginClass = GLEIFQualifierPlugin

statement_extractor/plugins/qualifiers/person.py ADDED Viewed

@@ -0,0 +1,221 @@
+"""
+PersonQualifierPlugin - Qualifies PERSON entities with role and organization.
+Uses Gemma3 12B (instruction-tuned) to extract:
+- role: Job title/position (e.g., "CEO", "President")
+- org: Organization/employer (e.g., "Apple Inc", "Microsoft")
+"""
+import json
+import logging
+import re
+from typing import Optional
+from ..base import BaseQualifierPlugin, PluginCapability
+from ...pipeline.context import PipelineContext
+from ...pipeline.registry import PluginRegistry
+from ...models import ExtractedEntity, EntityQualifiers, EntityType
+from ...llm import LLM
+logger = logging.getLogger(__name__)
+@PluginRegistry.qualifier
+class PersonQualifierPlugin(BaseQualifierPlugin):
+    """
+    Qualifier plugin for PERSON entities.
+    Uses Gemma3 12B to extract role and organization from context.
+    Falls back to pattern matching if model is not available.
+    """
+    # Common role patterns for fallback
+    ROLE_PATTERNS = [
+        r"\b(CEO|CFO|CTO|COO|CMO|CIO|CISO|CSO)\b",
+        r"\b(Chief\s+\w+\s+Officer)\b",
+        r"\b(President|Chairman|Director|Manager|Executive|Founder|Co-Founder)\b",
+        r"\b(Vice\s+President|VP)\b",
+        r"\b(Head\s+of\s+\w+)\b",
+        r"\b(Senior\s+\w+|Lead\s+\w+|Principal\s+\w+)\b",
+    ]
+    def __init__(
+        self,
+        model_id: str = "google/gemma-3-12b-it-qat-q4_0-gguf",
+        gguf_file: Optional[str] = None,
+        use_llm: bool = True,
+        use_4bit: bool = True,
+    ):
+        """
+        Initialize the person qualifier.
+        Args:
+            model_id: HuggingFace model ID for LLM qualification
+            gguf_file: GGUF filename for quantized models (auto-detected if model_id ends with -gguf)
+            use_llm: Whether to use LLM
+            use_4bit: Use 4-bit quantization (requires bitsandbytes, ignored for GGUF)
+        """
+        self._use_llm = use_llm
+        self._llm: Optional[LLM] = None
+        if use_llm:
+            self._llm = LLM(
+                model_id=model_id,
+                gguf_file=gguf_file,
+                use_4bit=use_4bit,
+            )
+    @property
+    def name(self) -> str:
+        return "person_qualifier"
+    @property
+    def priority(self) -> int:
+        return 10  # High priority for PERSON entities
+    @property
+    def capabilities(self) -> PluginCapability:
+        caps = PluginCapability.NONE
+        if self._use_llm:
+            caps |= PluginCapability.LLM_REQUIRED
+        return caps
+    @property
+    def description(self) -> str:
+        return "Extracts role and organization for PERSON entities using Gemma3"
+    @property
+    def supported_entity_types(self) -> set[EntityType]:
+        return {EntityType.PERSON}
+    @property
+    def provided_identifier_types(self) -> list[str]:
+        return []  # Provides qualifiers, not identifiers
+    def qualify(
+        self,
+        entity: ExtractedEntity,
+        context: PipelineContext,
+    ) -> Optional[EntityQualifiers]:
+        """
+        Qualify a PERSON entity with role and organization.
+        Args:
+            entity: The PERSON entity to qualify
+            context: Pipeline context for accessing source text
+        Returns:
+            EntityQualifiers with role and org, or None if nothing found
+        """
+        if entity.type != EntityType.PERSON:
+            return None
+        # Use the full source text for LLM qualification
+        # This provides maximum context for understanding the person's role/org
+        full_text = context.source_text
+        # Try LLM extraction first with full text
+        if self._llm is not None:
+            result = self._extract_with_llm(entity.text, full_text)
+            if result and (result.role or result.org):
+                return result
+        # Fallback to pattern matching with full text
+        return self._extract_with_patterns(entity.text, full_text)
+    def _extract_with_llm(
+        self,
+        person_name: str,
+        context_text: str,
+    ) -> Optional[EntityQualifiers]:
+        """Extract role and org using Gemma3."""
+        if self._llm is None:
+            return None
+        try:
+            prompt = f"""Extract qualifiers for a person from the given context.
+Instructions:
+- "role" = job title or position (e.g., "CEO", "President", "Director")
+- "org" = company or organization name (e.g., "Amazon", "Apple Inc", "Microsoft")
+- These are DIFFERENT things: role is a job title, org is a company name
+- Return null for fields not mentioned in the context
+Return ONLY valid JSON:
+E.g.
+<context>We interviewed Big Ducks Quacking Inc team. James is new in the role of the CEO</context>
+<person>James</person>
+Should return:
+{{"role": "CEO", "org": "Big Ducks Quacking Inc"}}
+---
+<context>{context_text}</context>
+<person>{person_name}</person>
+"""
+            logger.debug(f"LLM request: {prompt}")
+            response = self._llm.generate(prompt, max_tokens=100, stop=["\n\n", "</s>"])
+            logger.debug(f"LLM response: {response}")
+            # Extract JSON from response
+            json_match = re.search(r'\{[^}]+\}', response)
+            if json_match:
+                data = json.loads(json_match.group())
+                role = data.get("role")
+                org = data.get("org")
+                # Validate: role and org should be different (reject if same)
+                if role and org and role.lower() == org.lower():
+                    logger.debug(f"Rejected duplicate role/org: {role}")
+                    org = None  # Clear org if it's same as role
+                if role or org:
+                    return EntityQualifiers(role=role, org=org)
+        except Exception as e:
+            logger.exception(f"LLM extraction failed: {e}")
+            raise e
+        return None
+    def _extract_with_patterns(
+        self,
+        person_name: str,
+        context_text: str,
+    ) -> Optional[EntityQualifiers]:
+        """Extract role and org using pattern matching."""
+        role = None
+        org = None
+        # Look for role patterns
+        for pattern in self.ROLE_PATTERNS:
+            match = re.search(pattern, context_text, re.IGNORECASE)
+            if match:
+                role = match.group(1)
+                break
+        # Look for "of [Organization]" or "at [Organization]" patterns
+        org_patterns = [
+            rf'{re.escape(person_name)}[^.]*?\bof\s+([A-Z][A-Za-z\s&]+(?:Inc|Corp|Ltd|LLC|Company|Co)?\.?)',
+            rf'{re.escape(person_name)}[^.]*?\bat\s+([A-Z][A-Za-z\s&]+(?:Inc|Corp|Ltd|LLC|Company|Co)?\.?)',
+            rf'([A-Z][A-Za-z\s&]+(?:Inc|Corp|Ltd|LLC|Company|Co)?\.?)\s*(?:\'s|s)?\s*{re.escape(person_name)}',
+        ]
+        for pattern in org_patterns:
+            match = re.search(pattern, context_text)
+            if match:
+                org = match.group(1).strip()
+                # Clean up trailing punctuation
+                org = org.rstrip('.,;')
+                break
+        if role or org:
+            return EntityQualifiers(role=role, org=org)
+        return None
+# Allow importing without decorator for testing
+PersonQualifierPluginClass = PersonQualifierPlugin

statement_extractor/plugins/qualifiers/sec_edgar.py ADDED Viewed

@@ -0,0 +1,198 @@
+"""
+SECEdgarQualifierPlugin - Qualifies US ORG entities with SEC data.
+Uses the SEC EDGAR API to:
+- Look up CIK (Central Index Key) by company name
+- Retrieve ticker symbol, exchange, filing history
+"""
+import logging
+from typing import Optional
+from ..base import BaseQualifierPlugin, PluginCapability
+from ...pipeline.context import PipelineContext
+from ...pipeline.registry import PluginRegistry
+from ...models import ExtractedEntity, EntityQualifiers, EntityType
+logger = logging.getLogger(__name__)
+# SEC EDGAR API endpoints
+SEC_COMPANY_SEARCH = "https://efts.sec.gov/LATEST/search-index"
+SEC_COMPANY_TICKERS = "https://www.sec.gov/files/company_tickers.json"
+@PluginRegistry.qualifier
+class SECEdgarQualifierPlugin(BaseQualifierPlugin):
+    """
+    Qualifier plugin for US ORG entities using SEC EDGAR.
+    Provides CIK and ticker symbol for publicly traded US companies.
+    """
+    def __init__(
+        self,
+        timeout: int = 10,
+        cache_results: bool = True,
+    ):
+        """
+        Initialize the SEC EDGAR qualifier.
+        Args:
+            timeout: API request timeout in seconds
+            cache_results: Whether to cache API results
+        """
+        self._timeout = timeout
+        self._cache_results = cache_results
+        self._cache: dict[str, Optional[dict]] = {}
+        self._ticker_cache: Optional[dict] = None
+    @property
+    def name(self) -> str:
+        return "sec_edgar_qualifier"
+    @property
+    def priority(self) -> int:
+        return 30  # Run after GLEIF and Companies House
+    @property
+    def capabilities(self) -> PluginCapability:
+        return PluginCapability.EXTERNAL_API | PluginCapability.CACHING
+    @property
+    def description(self) -> str:
+        return "Looks up SEC CIK and ticker for US public companies"
+    @property
+    def supported_entity_types(self) -> set[EntityType]:
+        return {EntityType.ORG}
+    @property
+    def supported_identifier_types(self) -> list[str]:
+        return ["sec_cik", "ticker"]  # Can lookup by CIK or ticker
+    @property
+    def provided_identifier_types(self) -> list[str]:
+        return ["sec_cik", "ticker"]  # Provides CIK and ticker
+    def qualify(
+        self,
+        entity: ExtractedEntity,
+        context: PipelineContext,
+    ) -> Optional[EntityQualifiers]:
+        """
+        Qualify an ORG entity with SEC EDGAR data.
+        Args:
+            entity: The ORG entity to qualify
+            context: Pipeline context
+        Returns:
+            EntityQualifiers with CIK and ticker, or None if not found
+        """
+        if entity.type != EntityType.ORG:
+            return None
+        # Check cache first
+        cache_key = entity.text.lower().strip()
+        if self._cache_results and cache_key in self._cache:
+            cached = self._cache[cache_key]
+            if cached is None:
+                return None
+            return self._data_to_qualifiers(cached)
+        # Search SEC
+        result = self._search_sec(entity.text)
+        # Cache result
+        if self._cache_results:
+            self._cache[cache_key] = result
+        if result:
+            return self._data_to_qualifiers(result)
+        return None
+    def _load_ticker_cache(self) -> dict:
+        """Load the SEC company tickers JSON (cached)."""
+        if self._ticker_cache is not None:
+            return self._ticker_cache
+        try:
+            import requests
+            response = requests.get(SEC_COMPANY_TICKERS, timeout=self._timeout)
+            response.raise_for_status()
+            data = response.json()
+            # Build lookup by company name (lowercase)
+            self._ticker_cache = {}
+            for key, company in data.items():
+                name = company.get("title", "").lower()
+                if name:
+                    self._ticker_cache[name] = {
+                        "cik": str(company.get("cik_str", "")),
+                        "ticker": company.get("ticker", ""),
+                        "title": company.get("title", ""),
+                    }
+            logger.debug(f"Loaded {len(self._ticker_cache)} SEC company tickers")
+            return self._ticker_cache
+        except Exception as e:
+            logger.debug(f"Failed to load SEC ticker cache: {e}")
+            self._ticker_cache = {}
+            return self._ticker_cache
+    def _search_sec(self, org_name: str) -> Optional[dict]:
+        """Search SEC for company information."""
+        try:
+            # Load ticker cache
+            ticker_cache = self._load_ticker_cache()
+            # Try exact match first
+            org_lower = org_name.lower().strip()
+            if org_lower in ticker_cache:
+                return ticker_cache[org_lower]
+            # Try partial match
+            for name, data in ticker_cache.items():
+                if org_lower in name or name in org_lower:
+                    return data
+            # Try matching without common suffixes
+            clean_name = org_lower
+            for suffix in [" inc", " inc.", " corp", " corp.", " co", " co.", " ltd", " llc"]:
+                clean_name = clean_name.replace(suffix, "")
+            clean_name = clean_name.strip()
+            for name, data in ticker_cache.items():
+                clean_cached = name
+                for suffix in [" inc", " inc.", " corp", " corp.", " co", " co.", " ltd", " llc"]:
+                    clean_cached = clean_cached.replace(suffix, "")
+                clean_cached = clean_cached.strip()
+                if clean_name == clean_cached or clean_name in clean_cached or clean_cached in clean_name:
+                    return data
+        except Exception as e:
+            logger.debug(f"SEC search error: {e}")
+        return None
+    def _data_to_qualifiers(self, data: dict) -> EntityQualifiers:
+        """Convert SEC data to EntityQualifiers."""
+        identifiers = {}
+        if data.get("cik"):
+            identifiers["sec_cik"] = data["cik"]
+        if data.get("ticker"):
+            identifiers["ticker"] = data["ticker"]
+        return EntityQualifiers(
+            jurisdiction="US",
+            country="US",
+            identifiers=identifiers,
+        )
+# Allow importing without decorator for testing
+SECEdgarQualifierPluginClass = SECEdgarQualifierPlugin

statement_extractor/plugins/splitters/__init__.py ADDED Viewed

@@ -0,0 +1,13 @@
+"""
+Splitter plugins for Stage 1 (Splitting).
+Splits text into atomic triples.
+"""
+from .base import BaseSplitterPlugin
+from .t5_gemma import T5GemmaSplitter
+__all__ = [
+    "BaseSplitterPlugin",
+    "T5GemmaSplitter",
+]

statement_extractor/plugins/splitters/base.py ADDED Viewed

@@ -0,0 +1,9 @@
+"""
+Base class for splitter plugins.
+Re-exports BaseSplitterPlugin from the main plugins module.
+"""
+from ..base import BaseSplitterPlugin
+__all__ = ["BaseSplitterPlugin"]

corp-extractor 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

corp-extractor 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl