corp-extractor 0.4.0-py3-none-any.whl → 0.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. {corp_extractor-0.4.0.dist-info → corp_extractor-0.5.0.dist-info}/METADATA +181 -64
  2. corp_extractor-0.5.0.dist-info/RECORD +55 -0
  3. statement_extractor/__init__.py +9 -0
  4. statement_extractor/cli.py +446 -17
  5. statement_extractor/data/default_predicates.json +368 -0
  6. statement_extractor/data/statement_taxonomy.json +1182 -0
  7. statement_extractor/extractor.py +1 -23
  8. statement_extractor/gliner_extraction.py +4 -74
  9. statement_extractor/llm.py +255 -0
  10. statement_extractor/models/__init__.py +74 -0
  11. statement_extractor/models/canonical.py +139 -0
  12. statement_extractor/models/entity.py +102 -0
  13. statement_extractor/models/labels.py +191 -0
  14. statement_extractor/models/qualifiers.py +91 -0
  15. statement_extractor/models/statement.py +75 -0
  16. statement_extractor/models.py +4 -1
  17. statement_extractor/pipeline/__init__.py +39 -0
  18. statement_extractor/pipeline/config.py +134 -0
  19. statement_extractor/pipeline/context.py +177 -0
  20. statement_extractor/pipeline/orchestrator.py +447 -0
  21. statement_extractor/pipeline/registry.py +297 -0
  22. statement_extractor/plugins/__init__.py +43 -0
  23. statement_extractor/plugins/base.py +446 -0
  24. statement_extractor/plugins/canonicalizers/__init__.py +17 -0
  25. statement_extractor/plugins/canonicalizers/base.py +9 -0
  26. statement_extractor/plugins/canonicalizers/location.py +219 -0
  27. statement_extractor/plugins/canonicalizers/organization.py +230 -0
  28. statement_extractor/plugins/canonicalizers/person.py +242 -0
  29. statement_extractor/plugins/extractors/__init__.py +13 -0
  30. statement_extractor/plugins/extractors/base.py +9 -0
  31. statement_extractor/plugins/extractors/gliner2.py +536 -0
  32. statement_extractor/plugins/labelers/__init__.py +29 -0
  33. statement_extractor/plugins/labelers/base.py +9 -0
  34. statement_extractor/plugins/labelers/confidence.py +138 -0
  35. statement_extractor/plugins/labelers/relation_type.py +87 -0
  36. statement_extractor/plugins/labelers/sentiment.py +159 -0
  37. statement_extractor/plugins/labelers/taxonomy.py +373 -0
  38. statement_extractor/plugins/labelers/taxonomy_embedding.py +466 -0
  39. statement_extractor/plugins/qualifiers/__init__.py +19 -0
  40. statement_extractor/plugins/qualifiers/base.py +9 -0
  41. statement_extractor/plugins/qualifiers/companies_house.py +174 -0
  42. statement_extractor/plugins/qualifiers/gleif.py +186 -0
  43. statement_extractor/plugins/qualifiers/person.py +221 -0
  44. statement_extractor/plugins/qualifiers/sec_edgar.py +198 -0
  45. statement_extractor/plugins/splitters/__init__.py +13 -0
  46. statement_extractor/plugins/splitters/base.py +9 -0
  47. statement_extractor/plugins/splitters/t5_gemma.py +188 -0
  48. statement_extractor/plugins/taxonomy/__init__.py +13 -0
  49. statement_extractor/plugins/taxonomy/embedding.py +337 -0
  50. statement_extractor/plugins/taxonomy/mnli.py +279 -0
  51. corp_extractor-0.4.0.dist-info/RECORD +0 -12
  52. {corp_extractor-0.4.0.dist-info → corp_extractor-0.5.0.dist-info}/WHEEL +0 -0
  53. {corp_extractor-0.4.0.dist-info → corp_extractor-0.5.0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,230 @@
+ """
+ OrganizationCanonicalizer - Resolves ORG entities to canonical forms.
+
+ Uses a tiered matching approach:
+ 1. LEI exact match (confidence 1.0)
+ 2. Company number + jurisdiction (confidence 0.95)
+ 3. Trigram fuzzy name match (confidence 0.85+)
+ 4. LLM verification for uncertain matches (confidence 0.6-0.85)
+ """
+
+ import logging
+ import re
+ from typing import Optional
+
+ from ..base import BaseCanonicalizerPlugin, PluginCapability
+ from ...pipeline.context import PipelineContext
+ from ...pipeline.registry import PluginRegistry
+ from ...models import QualifiedEntity, CanonicalMatch, EntityType
+
+ logger = logging.getLogger(__name__)
+
+ # Common organization suffixes to normalize
+ ORG_SUFFIXES = [
+     r'\s+Inc\.?$', r'\s+Corp\.?$', r'\s+Corporation$',
+     r'\s+Ltd\.?$', r'\s+Limited$', r'\s+LLC$',
+     r'\s+LLP$', r'\s+PLC$', r'\s+Co\.?$',
+     r'\s+Company$', r'\s+Group$', r'\s+Holdings$',
+     r'\s+&\s+Co\.?$', r',\s+Inc\.?$',
+ ]
+
+
+ def normalize_org_name(name: str) -> str:
+     """Normalize organization name for matching."""
+     normalized = name.strip()
+     for pattern in ORG_SUFFIXES:
+         normalized = re.sub(pattern, '', normalized, flags=re.IGNORECASE)
+     # Strip trailing punctuation (commas, periods) that may be left after suffix removal
+     normalized = re.sub(r'[,.\s]+$', '', normalized)
+     return normalized.strip().lower()
+
+
+ def trigram_similarity(a: str, b: str) -> float:
+     """Calculate trigram similarity between two strings."""
+     if not a or not b:
+         return 0.0
+
+     def get_trigrams(s: str) -> set:
+         s = s.lower()
+         return {s[i:i+3] for i in range(len(s) - 2)} if len(s) >= 3 else {s}
+
+     trigrams_a = get_trigrams(a)
+     trigrams_b = get_trigrams(b)
+
+     if not trigrams_a or not trigrams_b:
+         return 0.0
+
+     intersection = len(trigrams_a & trigrams_b)
+     union = len(trigrams_a | trigrams_b)
+
+     return intersection / union if union > 0 else 0.0
+
+
+ @PluginRegistry.canonicalizer
+ class OrganizationCanonicalizer(BaseCanonicalizerPlugin):
+     """
+     Canonicalizer for ORG entities.
+
+     Uses tiered matching approach with identifier, name, and fuzzy matching.
+     """
+
+     def __init__(
+         self,
+         fuzzy_threshold: float = 0.75,
+         use_llm_verification: bool = False,
+     ):
+         """
+         Initialize the organization canonicalizer.
+
+         Args:
+             fuzzy_threshold: Minimum trigram similarity for fuzzy matches
+             use_llm_verification: Whether to use LLM for uncertain matches
+         """
+         self._fuzzy_threshold = fuzzy_threshold
+         self._use_llm_verification = use_llm_verification
+
+     @property
+     def name(self) -> str:
+         return "organization_canonicalizer"
+
+     @property
+     def priority(self) -> int:
+         return 10
+
+     @property
+     def capabilities(self) -> PluginCapability:
+         caps = PluginCapability.CACHING
+         if self._use_llm_verification:
+             caps |= PluginCapability.LLM_REQUIRED
+         return caps
+
+     @property
+     def description(self) -> str:
+         return "Resolves ORG entities to canonical forms using identifier and name matching"
+
+     @property
+     def supported_entity_types(self) -> set[EntityType]:
+         return {EntityType.ORG}
+
+     def find_canonical(
+         self,
+         entity: QualifiedEntity,
+         context: PipelineContext,
+     ) -> Optional[CanonicalMatch]:
+         """
+         Find canonical form for an ORG entity.
+
+         Args:
+             entity: Qualified entity to canonicalize
+             context: Pipeline context
+
+         Returns:
+             CanonicalMatch if found
+         """
+         qualifiers = entity.qualifiers
+         identifiers = qualifiers.identifiers
+
+         # Tier 1: LEI exact match
+         if "lei" in identifiers:
+             return CanonicalMatch(
+                 canonical_id=identifiers["lei"],
+                 canonical_name=entity.original_text,
+                 match_method="identifier",
+                 match_confidence=1.0,
+                 match_details={"identifier_type": "lei"},
+             )
+
+         # Tier 2: Company number + jurisdiction
+         if "ch_number" in identifiers and qualifiers.jurisdiction:
+             return CanonicalMatch(
+                 canonical_id=f"{qualifiers.jurisdiction}:{identifiers['ch_number']}",
+                 canonical_name=entity.original_text,
+                 match_method="identifier",
+                 match_confidence=0.95,
+                 match_details={"identifier_type": "ch_number", "jurisdiction": qualifiers.jurisdiction},
+             )
+
+         if "sec_cik" in identifiers:
+             ticker = identifiers.get("ticker", "")
+             canonical_name = f"{entity.original_text} ({ticker})" if ticker else entity.original_text
+             return CanonicalMatch(
+                 canonical_id=f"SEC:{identifiers['sec_cik']}",
+                 canonical_name=canonical_name,
+                 match_method="identifier",
+                 match_confidence=0.95,
+                 match_details={"identifier_type": "sec_cik", "ticker": ticker},
+             )
+
+         # Tier 3: Fuzzy name match against other ORG entities in context
+         best_match = self._find_fuzzy_match(entity, context)
+         if best_match:
+             return best_match
+
+         # No canonical match found
+         return None
+
+     def _find_fuzzy_match(
+         self,
+         entity: QualifiedEntity,
+         context: PipelineContext,
+     ) -> Optional[CanonicalMatch]:
+         """Find fuzzy matches against other ORG entities in context."""
+         normalized_name = normalize_org_name(entity.original_text)
+
+         best_match = None
+         best_similarity = 0.0
+
+         for other_ref, other_entity in context.qualified_entities.items():
+             if other_ref == entity.entity_ref:
+                 continue
+
+             if other_entity.entity_type != EntityType.ORG:
+                 continue
+
+             other_normalized = normalize_org_name(other_entity.original_text)
+             similarity = trigram_similarity(normalized_name, other_normalized)
+
+             if similarity > best_similarity and similarity >= self._fuzzy_threshold:
+                 best_similarity = similarity
+                 best_match = other_entity
+
+         if best_match and best_similarity >= self._fuzzy_threshold:
+             confidence = 0.85 + (best_similarity - self._fuzzy_threshold) * 0.1
+             confidence = min(confidence, 0.95)
+
+             return CanonicalMatch(
+                 canonical_id=None,
+                 canonical_name=best_match.original_text,
+                 match_method="name_fuzzy",
+                 match_confidence=confidence,
+                 match_details={"similarity": best_similarity, "matched_entity": best_match.entity_ref},
+             )
+
+         return None
+
+     def format_fqn(
+         self,
+         entity: QualifiedEntity,
+         match: Optional[CanonicalMatch],
+     ) -> str:
+         """Format FQN for an organization."""
+         base_name = match.canonical_name if match else entity.original_text
+
+         parts = []
+         identifiers = entity.qualifiers.identifiers
+
+         # Add ticker if available
+         if "ticker" in identifiers:
+             parts.append(identifiers["ticker"])
+
+         # Add jurisdiction
+         if entity.qualifiers.jurisdiction:
+             parts.append(entity.qualifiers.jurisdiction)
+
+         if parts:
+             return f"{base_name} ({', '.join(parts)})"
+         return base_name
+
+
+ # Allow importing without decorator for testing
+ OrganizationCanonicalizerClass = OrganizationCanonicalizer
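For reference, a minimal usage sketch (not part of the wheel) of the name-normalization helpers added in statement_extractor/plugins/canonicalizers/organization.py above. It assumes corp-extractor 0.5.0 is installed so the module can be imported; the example strings and printed values are illustrative only.

    from statement_extractor.plugins.canonicalizers.organization import (
        normalize_org_name,
        trigram_similarity,
    )

    # Suffix stripping maps both spellings to the same lowercase key
    a = normalize_org_name("Acme Corp.")          # "acme"
    b = normalize_org_name("Acme Corporation")    # "acme"
    print(trigram_similarity(a, b))               # 1.0 (identical trigram sets)

    # A near-miss still clears the default fuzzy_threshold of 0.75
    print(trigram_similarity("acme widgets", "acme widget"))  # 0.9

Note that find_canonical only falls through to this fuzzy tier when the entity carries no LEI, company-number, or SEC CIK identifier, mirroring the tiered approach described in the module docstring.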
@@ -0,0 +1,242 @@
+ """
+ PersonCanonicalizer - Resolves PERSON entities to canonical forms.
+
+ Uses:
+ 1. Name variants (Tim vs Timothy)
+ 2. Role + org context matching
+ 3. LLM identity verification for uncertain matches
+ """
+
+ import logging
+ import re
+ from typing import Optional
+
+ from ..base import BaseCanonicalizerPlugin, PluginCapability
+ from ...pipeline.context import PipelineContext
+ from ...pipeline.registry import PluginRegistry
+ from ...models import QualifiedEntity, CanonicalMatch, EntityType
+
+ logger = logging.getLogger(__name__)
+
+ # Common name variations
+ NAME_VARIANTS = {
+     "tim": ["timothy"],
+     "timothy": ["tim"],
+     "mike": ["michael"],
+     "michael": ["mike"],
+     "bob": ["robert"],
+     "robert": ["bob", "rob"],
+     "rob": ["robert"],
+     "bill": ["william"],
+     "william": ["bill", "will"],
+     "will": ["william"],
+     "jim": ["james"],
+     "james": ["jim", "jimmy"],
+     "jimmy": ["james"],
+     "tom": ["thomas"],
+     "thomas": ["tom", "tommy"],
+     "joe": ["joseph"],
+     "joseph": ["joe"],
+     "alex": ["alexander", "alexandra"],
+     "alexander": ["alex"],
+     "alexandra": ["alex"],
+     "dan": ["daniel"],
+     "daniel": ["dan", "danny"],
+     "dave": ["david"],
+     "david": ["dave"],
+     "ed": ["edward", "edwin"],
+     "edward": ["ed", "eddie"],
+     "jen": ["jennifer"],
+     "jennifer": ["jen", "jenny"],
+     "kate": ["katherine", "catherine"],
+     "katherine": ["kate", "kathy"],
+     "catherine": ["kate", "cathy"],
+     "chris": ["christopher", "christine"],
+     "christopher": ["chris"],
+     "christine": ["chris"],
+     "matt": ["matthew"],
+     "matthew": ["matt"],
+     "nick": ["nicholas"],
+     "nicholas": ["nick"],
+     "sam": ["samuel", "samantha"],
+     "samuel": ["sam"],
+     "samantha": ["sam"],
+     "steve": ["steven", "stephen"],
+     "steven": ["steve"],
+     "stephen": ["steve"],
+ }
+
+
+ def normalize_person_name(name: str) -> str:
+     """Normalize a person name for matching."""
+     # Remove titles
+     name = re.sub(r'^(Mr\.?|Mrs\.?|Ms\.?|Dr\.?|Prof\.?)\s+', '', name, flags=re.IGNORECASE)
+     # Remove suffixes
+     name = re.sub(r'\s+(Jr\.?|Sr\.?|III?|IV|V)$', '', name, flags=re.IGNORECASE)
+     return name.strip().lower()
+
+
+ def get_name_parts(name: str) -> tuple[str, str]:
+     """Split name into first and last name."""
+     normalized = normalize_person_name(name)
+     parts = normalized.split()
+     if len(parts) >= 2:
+         return parts[0], parts[-1]
+     elif len(parts) == 1:
+         return parts[0], ""
+     return "", ""
+
+
+ def names_match(name1: str, name2: str) -> tuple[bool, float, bool]:
+     """
+     Check if two names match, considering variants.
+
+     Returns (matches, confidence, is_variant).
+     """
+     first1, last1 = get_name_parts(name1)
+     first2, last2 = get_name_parts(name2)
+
+     # Last names must match (if both present)
+     if last1 and last2 and last1 != last2:
+         return False, 0.0, False
+
+     # Check first name match
+     if first1 == first2:
+         return True, 1.0, False
+
+     # Check variants
+     variants1 = NAME_VARIANTS.get(first1, [])
+     variants2 = NAME_VARIANTS.get(first2, [])
+
+     if first2 in variants1 or first1 in variants2:
+         return True, 0.9, True
+
+     return False, 0.0, False
+
+
+ @PluginRegistry.canonicalizer
+ class PersonCanonicalizer(BaseCanonicalizerPlugin):
+     """
+     Canonicalizer for PERSON entities.
+
+     Uses name variants and context matching.
+     """
+
+     def __init__(
+         self,
+         use_context_matching: bool = True,
+     ):
+         """
+         Initialize the person canonicalizer.
+
+         Args:
+             use_context_matching: Whether to use role+org for disambiguation
+         """
+         self._use_context_matching = use_context_matching
+
+     @property
+     def name(self) -> str:
+         return "person_canonicalizer"
+
+     @property
+     def priority(self) -> int:
+         return 10
+
+     @property
+     def capabilities(self) -> PluginCapability:
+         return PluginCapability.CACHING
+
+     @property
+     def description(self) -> str:
+         return "Resolves PERSON entities using name variants and context"
+
+     @property
+     def supported_entity_types(self) -> set[EntityType]:
+         return {EntityType.PERSON}
+
+     def find_canonical(
+         self,
+         entity: QualifiedEntity,
+         context: PipelineContext,
+     ) -> Optional[CanonicalMatch]:
+         """
+         Find canonical form for a PERSON entity.
+
+         Args:
+             entity: Qualified entity to canonicalize
+             context: Pipeline context
+
+         Returns:
+             CanonicalMatch if found
+         """
+         # Look for matching PERSON entities in context
+         best_match = None
+         best_confidence = 0.0
+         best_is_variant = False
+
+         for other_ref, other_entity in context.qualified_entities.items():
+             if other_ref == entity.entity_ref:
+                 continue
+
+             if other_entity.entity_type != EntityType.PERSON:
+                 continue
+
+             # Check name match
+             matches, confidence, is_variant = names_match(entity.original_text, other_entity.original_text)
+             if not matches:
+                 continue
+
+             # Boost confidence if role+org also match
+             if self._use_context_matching and confidence > 0:
+                 my_qualifiers = entity.qualifiers
+                 other_qualifiers = other_entity.qualifiers
+
+                 if my_qualifiers.role and other_qualifiers.role:
+                     if my_qualifiers.role.lower() == other_qualifiers.role.lower():
+                         confidence = min(confidence + 0.05, 1.0)
+
+                 if my_qualifiers.org and other_qualifiers.org:
+                     if my_qualifiers.org.lower() == other_qualifiers.org.lower():
+                         confidence = min(confidence + 0.05, 1.0)
+
+             if confidence > best_confidence:
+                 best_confidence = confidence
+                 best_match = other_entity
+                 best_is_variant = is_variant
+
+         if best_match and best_confidence >= 0.8:
+             return CanonicalMatch(
+                 canonical_id=None,
+                 canonical_name=best_match.original_text,
+                 match_method="name_variant" if best_is_variant else "name_exact",
+                 match_confidence=best_confidence,
+                 match_details={"matched_entity": best_match.entity_ref},
+             )
+
+         return None
+
+     def format_fqn(
+         self,
+         entity: QualifiedEntity,
+         match: Optional[CanonicalMatch],
+     ) -> str:
+         """Format FQN for a person."""
+         base_name = match.canonical_name if match else entity.original_text
+
+         parts = []
+         qualifiers = entity.qualifiers
+
+         if qualifiers.role:
+             parts.append(qualifiers.role)
+
+         if qualifiers.org:
+             parts.append(qualifiers.org)
+
+         if parts:
+             return f"{base_name} ({', '.join(parts)})"
+         return base_name
+
+
+ # Allow importing without decorator for testing
+ PersonCanonicalizerClass = PersonCanonicalizer
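Similarly, a small sketch (not shipped in the package) of how the variant table and names_match helper added in statement_extractor/plugins/canonicalizers/person.py behave, again assuming the installed module is importable; the names and printed tuples are illustrative only.

    from statement_extractor.plugins.canonicalizers.person import names_match

    # Identical first and last names: exact match at full confidence
    print(names_match("Michael Dell", "Michael Dell"))   # (True, 1.0, False)

    # Nickname resolved through NAME_VARIANTS ("tim" <-> "timothy")
    print(names_match("Tim Cook", "Timothy Cook"))       # (True, 0.9, True)

    # Different last names never match, regardless of first name
    print(names_match("Tim Cook", "Tim Berners-Lee"))    # (False, 0.0, False)

In find_canonical, the 0.9 variant score only clears the 0.8 acceptance threshold on its own; the optional role/org context boosts (+0.05 each) then push agreeing mentions toward 1.0.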
@@ -0,0 +1,13 @@
+ """
+ Extractor plugins for Stage 2 (Extraction).
+
+ Refines raw triples into statements with typed entities.
+ """
+
+ from .base import BaseExtractorPlugin
+ from .gliner2 import GLiNER2Extractor
+
+ __all__ = [
+     "BaseExtractorPlugin",
+     "GLiNER2Extractor",
+ ]
@@ -0,0 +1,9 @@
+ """
+ Base class for extractor plugins.
+
+ Re-exports BaseExtractorPlugin from the main plugins module.
+ """
+
+ from ..base import BaseExtractorPlugin
+
+ __all__ = ["BaseExtractorPlugin"]