PyPI - corp-extractor - Versions diffs - 0.5.0__py3-none-any.whl → 0.9.0__py3-none-any.whl - Mend

corp-extractor 0.5.0-py3-none-any.whl → 0.9.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (61) hide show

{corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +191 -24
corp_extractor-0.9.0.dist-info/RECORD +76 -0
statement_extractor/__init__.py +1 -1
statement_extractor/cli.py +1227 -10
statement_extractor/data/statement_taxonomy.json +6949 -1159
statement_extractor/database/__init__.py +52 -0
statement_extractor/database/embeddings.py +186 -0
statement_extractor/database/hub.py +520 -0
statement_extractor/database/importers/__init__.py +24 -0
statement_extractor/database/importers/companies_house.py +545 -0
statement_extractor/database/importers/gleif.py +538 -0
statement_extractor/database/importers/sec_edgar.py +375 -0
statement_extractor/database/importers/wikidata.py +1012 -0
statement_extractor/database/importers/wikidata_people.py +632 -0
statement_extractor/database/models.py +230 -0
statement_extractor/database/resolver.py +245 -0
statement_extractor/database/store.py +1609 -0
statement_extractor/document/__init__.py +62 -0
statement_extractor/document/chunker.py +410 -0
statement_extractor/document/context.py +171 -0
statement_extractor/document/deduplicator.py +173 -0
statement_extractor/document/html_extractor.py +246 -0
statement_extractor/document/loader.py +303 -0
statement_extractor/document/pipeline.py +388 -0
statement_extractor/document/summarizer.py +195 -0
statement_extractor/models/__init__.py +16 -1
statement_extractor/models/canonical.py +44 -1
statement_extractor/models/document.py +308 -0
statement_extractor/models/labels.py +47 -18
statement_extractor/models/qualifiers.py +51 -3
statement_extractor/models/statement.py +26 -0
statement_extractor/pipeline/config.py +6 -11
statement_extractor/pipeline/orchestrator.py +80 -111
statement_extractor/pipeline/registry.py +52 -46
statement_extractor/plugins/__init__.py +20 -8
statement_extractor/plugins/base.py +334 -64
statement_extractor/plugins/extractors/gliner2.py +10 -0
statement_extractor/plugins/labelers/taxonomy.py +18 -5
statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
statement_extractor/plugins/pdf/__init__.py +10 -0
statement_extractor/plugins/pdf/pypdf.py +291 -0
statement_extractor/plugins/qualifiers/__init__.py +11 -0
statement_extractor/plugins/qualifiers/companies_house.py +14 -3
statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
statement_extractor/plugins/qualifiers/gleif.py +14 -3
statement_extractor/plugins/qualifiers/person.py +578 -14
statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
statement_extractor/plugins/scrapers/__init__.py +10 -0
statement_extractor/plugins/scrapers/http.py +236 -0
statement_extractor/plugins/splitters/t5_gemma.py +158 -53
statement_extractor/plugins/taxonomy/embedding.py +193 -46
statement_extractor/plugins/taxonomy/mnli.py +16 -4
statement_extractor/scoring.py +8 -8
corp_extractor-0.5.0.dist-info/RECORD +0 -55
statement_extractor/plugins/canonicalizers/__init__.py +0 -17
statement_extractor/plugins/canonicalizers/base.py +0 -9
statement_extractor/plugins/canonicalizers/location.py +0 -219
statement_extractor/plugins/canonicalizers/organization.py +0 -230
statement_extractor/plugins/canonicalizers/person.py +0 -242
{corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
{corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0

statement_extractor/database/models.py ADDED Viewed

@@ -0,0 +1,230 @@
+"""
+Pydantic models for organization/entity database records.
+"""
+from enum import Enum
+from typing import Any, Literal, Optional
+from pydantic import BaseModel, Field
+# Allowed values for the ``source`` field of organization records and matches
+# (CompanyRecord.source and CompanyMatch.source are both typed with this alias).
+# NOTE(review): the resolver's SOURCE_PREFIX_MAP also lists "wikidata", which
+# this Literal does not admit - confirm which spelling the importers emit.
+SourceType = Literal["gleif", "sec_edgar", "companies_house", "wikipedia"]
+class EntityType(str, Enum):
+    """
+    Classification of organization type.
+    Used to distinguish between businesses, non-profits, government agencies, etc.
+    Members mix in ``str``, so each member compares equal to its lowercase
+    string value (for example ``EntityType.NGO == "ngo"``). ``UNKNOWN`` is the
+    default that CompanyRecord assigns when no type is supplied.
+    """
+    # Business entities
+    BUSINESS = "business"  # General business/company
+    FUND = "fund"  # Investment funds, ETFs, mutual funds
+    BRANCH = "branch"  # Branch offices of companies
+    # Non-profit/civil society
+    NONPROFIT = "nonprofit"  # Non-profit organizations
+    NGO = "ngo"  # Non-governmental organizations
+    FOUNDATION = "foundation"  # Charitable foundations
+    TRADE_UNION = "trade_union"  # Labor unions
+    # Government/public sector
+    GOVERNMENT = "government"  # Government agencies
+    INTERNATIONAL_ORG = "international_org"  # UN, WHO, IMF, etc.
+    POLITICAL_PARTY = "political_party"  # Political parties
+    # Education/research
+    EDUCATIONAL = "educational"  # Schools, universities
+    RESEARCH = "research"  # Research institutes
+    # Other organization types
+    RELIGIOUS = "religious"  # Religious organizations
+    SPORTS = "sports"  # Sports clubs/teams
+    MEDIA = "media"  # Media companies, studios
+    HEALTHCARE = "healthcare"  # Hospitals, healthcare orgs
+    # Unknown/unclassified
+    UNKNOWN = "unknown"  # Type not determined
+class PersonType(str, Enum):
+    """
+    Classification of notable person type.
+    Used for categorizing people in the person database.
+    Members mix in ``str``, so each member compares equal to its lowercase
+    string value. ``UNKNOWN`` is the default that PersonRecord assigns when no
+    type is supplied.
+    """
+    EXECUTIVE = "executive"  # CEOs, board members, C-suite
+    POLITICIAN = "politician"  # Elected officials, diplomats
+    ACADEMIC = "academic"  # Professors, researchers
+    ARTIST = "artist"  # Musicians, actors, directors, writers
+    ATHLETE = "athlete"  # Sports figures
+    ENTREPRENEUR = "entrepreneur"  # Founders, business owners
+    JOURNALIST = "journalist"  # Reporters, media personalities
+    ACTIVIST = "activist"  # Advocates, campaigners
+    SCIENTIST = "scientist"  # Scientists, inventors
+    UNKNOWN = "unknown"  # Type not determined
+class CompanyRecord(BaseModel):
+    """
+    One organization entry of the embedding database.
+    ``source`` together with ``source_id`` identifies the entry; ``record``
+    carries the original source payload as a plain dict.
+    Note: the class is still called CompanyRecord for API compatibility.
+    """
+    name: str = Field(..., description="Organization name (used for embedding and display)")
+    source: SourceType = Field(..., description="Data source")
+    source_id: str = Field(..., description="Unique identifier from source (LEI, CIK, CH number)")
+    region: str = Field(default="", description="Geographic region/country (e.g., 'UK', 'US', 'DE')")
+    entity_type: EntityType = Field(default=EntityType.UNKNOWN, description="Organization type classification")
+    record: dict[str, Any] = Field(default_factory=dict, description="Original record from source")
+    @property
+    def canonical_id(self) -> str:
+        """Return the identifier formatted as ``source:source_id``."""
+        return "{}:{}".format(self.source, self.source_id)
+    def model_dump_for_db(self) -> dict[str, Any]:
+        """
+        Build the dict that is written to the database.
+        The entity type is stored as its plain string value; every other
+        field is passed through unchanged.
+        """
+        row: dict[str, Any] = dict(
+            name=self.name,
+            source=self.source,
+            source_id=self.source_id,
+            region=self.region,
+            entity_type=self.entity_type.value,
+            record=self.record,
+        )
+        return row
+# Allowed values for the ``source`` field of person records and matches;
+# "wikidata" is the only source admitted here.
+PersonSourceType = Literal["wikidata"]
+class PersonRecord(BaseModel):
+    """
+    One notable-person entry of the embedding database.
+    Besides the name and source identifier, a record can carry the role and
+    organization the person is known for; both feed into the embedding text.
+    """
+    name: str = Field(..., description="Display name (used for embedding and display)")
+    source: PersonSourceType = Field(default="wikidata", description="Data source")
+    source_id: str = Field(..., description="Unique identifier from source (Wikidata QID)")
+    country: str = Field(default="", description="Country code or name (e.g., 'US', 'Germany')")
+    person_type: PersonType = Field(default=PersonType.UNKNOWN, description="Person type classification")
+    known_for_role: str = Field(default="", description="Primary role from Wikipedia (e.g., 'CEO', 'President')")
+    known_for_org: str = Field(default="", description="Primary org from Wikipedia (e.g., 'Apple Inc', 'Tesla')")
+    record: dict[str, Any] = Field(default_factory=dict, description="Original record from source")
+    @property
+    def canonical_id(self) -> str:
+        """Return the identifier formatted as ``source:source_id``."""
+        return "{}:{}".format(self.source, self.source_id)
+    def model_dump_for_db(self) -> dict[str, Any]:
+        """
+        Build the dict that is written to the database.
+        The person type is stored as its plain string value; every other
+        field is passed through unchanged.
+        """
+        row: dict[str, Any] = dict(
+            name=self.name,
+            source=self.source,
+            source_id=self.source_id,
+            country=self.country,
+            person_type=self.person_type.value,
+            known_for_role=self.known_for_role,
+            known_for_org=self.known_for_org,
+            record=self.record,
+        )
+        return row
+    def get_embedding_text(self) -> str:
+        """
+        Return the text to embed: the name, then role and org when set.
+        Segments are joined with " | "; an empty role or org is left out,
+        while the name is always the first segment.
+        """
+        context = (self.known_for_role, self.known_for_org)
+        return " | ".join([self.name, *filter(None, context)])
+class PersonMatch(BaseModel):
+    """
+    Result of matching an extracted name to a person record.
+    Pairs the query text with the matched PersonRecord and copies the
+    record's source identifiers onto the match alongside the similarity score.
+    """
+    query_name: str = Field(..., description="Name extracted from text (the search query)")
+    record: PersonRecord = Field(..., description="The matched person record")
+    source: PersonSourceType = Field(..., description="Data source of match")
+    source_id: str = Field(..., description="Source identifier of match")
+    canonical_id: str = Field(..., description="Canonical ID in format source:source_id")
+    similarity_score: float = Field(..., description="Embedding similarity score (0-1)")
+    llm_confirmed: bool = Field(default=False, description="Whether LLM confirmed this match")
+    @property
+    def name(self) -> str:
+        """Name of the matched person, read from the wrapped record."""
+        matched = self.record
+        return matched.name
+    @classmethod
+    def from_record(
+        cls,
+        query_name: str,
+        record: PersonRecord,
+        similarity_score: float,
+        llm_confirmed: bool = False,
+    ) -> "PersonMatch":
+        """
+        Alternate constructor that derives the identifier fields from ``record``.
+        Args:
+            query_name: Name that was searched for
+            record: The person record that matched
+            similarity_score: Similarity between query and record
+            llm_confirmed: Whether an LLM confirmed the match
+        Returns:
+            PersonMatch whose source, source_id and canonical_id come from ``record``
+        """
+        derived = dict(
+            source=record.source,
+            source_id=record.source_id,
+            canonical_id=record.canonical_id,
+        )
+        return cls(
+            query_name=query_name,
+            record=record,
+            similarity_score=similarity_score,
+            llm_confirmed=llm_confirmed,
+            **derived,
+        )
+class CompanyMatch(BaseModel):
+    """
+    Result of matching an extracted name to an organization record.
+    Pairs the query text with the matched CompanyRecord and copies the
+    record's source identifiers onto the match alongside the similarity score.
+    Note: the class is still called CompanyMatch for API compatibility.
+    """
+    query_name: str = Field(..., description="Name extracted from text (the search query)")
+    record: CompanyRecord = Field(..., description="The matched organization record")
+    source: SourceType = Field(..., description="Data source of match")
+    source_id: str = Field(..., description="Source identifier of match")
+    canonical_id: str = Field(..., description="Canonical ID in format source:source_id")
+    similarity_score: float = Field(..., description="Embedding similarity score (0-1)")
+    llm_confirmed: bool = Field(default=False, description="Whether LLM confirmed this match")
+    @property
+    def name(self) -> str:
+        """Name of the matched organization, read from the wrapped record."""
+        matched = self.record
+        return matched.name
+    @classmethod
+    def from_record(
+        cls,
+        query_name: str,
+        record: CompanyRecord,
+        similarity_score: float,
+        llm_confirmed: bool = False,
+    ) -> "CompanyMatch":
+        """
+        Alternate constructor that derives the identifier fields from ``record``.
+        Args:
+            query_name: Name that was searched for
+            record: The organization record that matched
+            similarity_score: Similarity between query and record
+            llm_confirmed: Whether an LLM confirmed the match
+        Returns:
+            CompanyMatch whose source, source_id and canonical_id come from ``record``
+        """
+        derived = dict(
+            source=record.source,
+            source_id=record.source_id,
+            canonical_id=record.canonical_id,
+        )
+        return cls(
+            query_name=query_name,
+            record=record,
+            similarity_score=similarity_score,
+            llm_confirmed=llm_confirmed,
+            **derived,
+        )
+class DatabaseStats(BaseModel):
+    """
+    Summary counters for the organization database.
+    Every field has a default (zero or an empty dict), so the model can be
+    created without arguments.
+    """
+    total_records: int = Field(default=0)
+    by_source: dict[str, int] = Field(default_factory=dict)
+    embedding_dimension: int = Field(default=0)
+    database_size_bytes: int = Field(default=0)

statement_extractor/database/resolver.py ADDED Viewed

@@ -0,0 +1,245 @@
+"""
+Entity resolver utilities for database lookups.
+Provides shared functionality for resolving entity names against
+the organization and person databases.
+"""
+import logging
+from typing import Optional
+from .models import CompanyRecord
+from ..models import ResolvedOrganization
+logger = logging.getLogger(__name__)
+# Canonical-ID prefix for each known data source name
+SOURCE_PREFIX_MAP = {
+    "gleif": "LEI",
+    "sec_edgar": "SEC-CIK",
+    "companies_house": "UK-CH",
+    "wikidata": "WIKIDATA",
+    "wikipedia": "WIKIDATA",
+}
+def get_source_prefix(source: str) -> str:
+    """
+    Return the canonical ID prefix for ``source``.
+    Sources listed in SOURCE_PREFIX_MAP use their mapped prefix; any other
+    source name falls back to its upper-cased form.
+    """
+    try:
+        return SOURCE_PREFIX_MAP[source]
+    except KeyError:
+        return source.upper()
+class OrganizationResolver:
+    """
+    Resolves organization names against the organization database.
+    Shared utility that can be used by both EmbeddingCompanyQualifier
+    and PersonQualifierPlugin for resolving organization references.
+    The database and the embedder are created lazily on first use. Results
+    of ``resolve`` are memoized per instance, keyed on the lower-cased,
+    stripped organization name.
+    """
+    def __init__(
+        self,
+        db_path: Optional[str] = None,
+        top_k: int = 5,
+        min_similarity: float = 0.7,
+        auto_download_db: bool = True,
+    ):
+        """
+        Initialize the organization resolver.
+        Args:
+            db_path: Path to database (auto-detects if None)
+            top_k: Number of candidates to retrieve
+            min_similarity: Minimum similarity threshold
+            auto_download_db: Whether to auto-download database
+        """
+        self._db_path = db_path
+        self._top_k = top_k
+        self._min_similarity = min_similarity
+        self._auto_download_db = auto_download_db
+        # Lazy-loaded components
+        self._database = None
+        self._embedder = None
+        self._cache: dict[str, Optional[ResolvedOrganization]] = {}
+    def _get_database(self):
+        """
+        Get or initialize the organization database.
+        Returns:
+            The database object, or None when no database path is available
+            or loading fails. A failed load is not remembered, so the next
+            call tries again.
+        """
+        if self._database is not None:
+            return self._database
+        try:
+            # Imported here so the store/hub modules load only when needed
+            from .store import get_database
+            from .hub import get_database_path
+            db_path = self._db_path
+            if db_path is None:
+                db_path = get_database_path(auto_download=self._auto_download_db)
+            if db_path is None:
+                logger.warning("Organization database not available.")
+                return None
+            self._database = get_database(db_path=db_path)
+            return self._database
+        except Exception as e:
+            # Best-effort: callers treat a missing database as "no match"
+            logger.warning("Failed to load organization database: %s", e)
+            return None
+    def _get_embedder(self):
+        """
+        Get or initialize the embedder.
+        Returns:
+            The embedder object, or None when it cannot be created. A failed
+            load is not remembered, so the next call tries again.
+        """
+        if self._embedder is not None:
+            return self._embedder
+        try:
+            from .embeddings import CompanyEmbedder
+            self._embedder = CompanyEmbedder()
+            return self._embedder
+        except Exception as e:
+            # Best-effort: callers treat a missing embedder as "no match"
+            logger.warning("Failed to load embedder: %s", e)
+            return None
+    def _search_candidates(
+        self,
+        org_name: str,
+        top_k: int,
+    ) -> Optional[list[tuple[CompanyRecord, float]]]:
+        """
+        Embed ``org_name``, search the database and apply the similarity threshold.
+        Shared by ``resolve`` and ``resolve_with_candidates`` so both use the
+        same search and filtering.
+        Args:
+            org_name: Organization name to search
+            top_k: Number of candidates to request from the database
+        Returns:
+            List of (CompanyRecord, similarity) tuples at or above the
+            threshold, or None when the database or embedder is unavailable.
+            Exceptions raised while embedding or searching propagate.
+        """
+        database = self._get_database()
+        if database is None:
+            return None
+        embedder = self._get_embedder()
+        if embedder is None:
+            return None
+        # Embed the org name
+        query_embedding = embedder.embed(org_name)
+        # Search with text pre-filtering
+        results = database.search(
+            query_embedding,
+            top_k=top_k,
+            query_text=org_name,
+        )
+        # Filter by similarity threshold
+        return [(r, s) for r, s in results if s >= self._min_similarity]
+    def resolve(self, org_name: str, use_cache: bool = True) -> Optional[ResolvedOrganization]:
+        """
+        Resolve an organization name against the database.
+        Args:
+            org_name: Organization name to resolve
+            use_cache: Whether to use cached results
+        Returns:
+            ResolvedOrganization if found, None otherwise. None is also
+            returned for empty or whitespace-only names, when the database
+            or embedder is unavailable, and when the lookup raises.
+        """
+        # Whitespace-only names carry nothing to match and would otherwise
+        # all share the empty cache key.
+        if not org_name or not org_name.strip():
+            return None
+        # Check cache
+        cache_key = org_name.lower().strip()
+        if use_cache and cache_key in self._cache:
+            return self._cache[cache_key]
+        resolved: Optional[ResolvedOrganization] = None
+        try:
+            candidates = self._search_candidates(org_name, self._top_k)
+            if candidates is None:
+                # Database or embedder unavailable: not cached, so a later
+                # call can retry once they load.
+                return None
+            if candidates:
+                # The first candidate is taken as the best match
+                # (assumes the database returns results best-first).
+                record, similarity = candidates[0]
+                resolved = self._build_resolved_organization(record, similarity)
+        except Exception as e:
+            logger.debug("Failed to resolve organization '%s': %s", org_name, e)
+            resolved = None
+        # Misses and failed lookups are cached as None alongside hits
+        if use_cache:
+            self._cache[cache_key] = resolved
+        return resolved
+    def resolve_with_candidates(
+        self,
+        org_name: str,
+        top_k: Optional[int] = None,
+    ) -> list[tuple[CompanyRecord, float]]:
+        """
+        Get organization candidates with similarity scores.
+        This method does not read or write the result cache.
+        Args:
+            org_name: Organization name to search
+            top_k: Number of candidates (uses instance default if None or 0)
+        Returns:
+            List of (CompanyRecord, similarity) tuples; empty for empty or
+            whitespace-only names, when the database or embedder is
+            unavailable, and when the lookup raises.
+        """
+        if not org_name or not org_name.strip():
+            return []
+        try:
+            candidates = self._search_candidates(org_name, top_k or self._top_k)
+        except Exception as e:
+            logger.debug("Failed to search for organization '%s': %s", org_name, e)
+            return []
+        return candidates or []
+    def _build_resolved_organization(
+        self,
+        record: CompanyRecord,
+        similarity: float,
+    ) -> ResolvedOrganization:
+        """
+        Build ResolvedOrganization from a database record.
+        Args:
+            record: The matched database record
+            similarity: Similarity score of the match
+        Returns:
+            ResolvedOrganization whose canonical_id uses the prefix from
+            get_source_prefix and whose match_confidence is the similarity
+            clamped to the range 0.0-1.0.
+        """
+        source_prefix = get_source_prefix(record.source)
+        return ResolvedOrganization(
+            canonical_name=record.name,
+            canonical_id=f"{source_prefix}:{record.source_id}",
+            source=record.source,
+            source_id=record.source_id,
+            # An empty region string is passed on as None
+            region=record.region or None,
+            match_confidence=min(max(similarity, 0.0), 1.0),
+            # The unclamped score is kept in the details
+            match_details={"similarity": similarity},
+        )
+# Module-level resolver handed out by get_organization_resolver()
+_default_resolver: Optional[OrganizationResolver] = None
+def get_organization_resolver(
+    db_path: Optional[str] = None,
+    auto_download_db: bool = True,
+) -> OrganizationResolver:
+    """
+    Return the shared OrganizationResolver, creating it on first use.
+    The arguments only take effect on the call that creates the instance;
+    later calls return the existing resolver whatever is passed.
+    Args:
+        db_path: Path to database
+        auto_download_db: Whether to auto-download database
+    Returns:
+        OrganizationResolver instance
+    """
+    global _default_resolver
+    resolver = _default_resolver
+    if resolver is None:
+        resolver = OrganizationResolver(
+            db_path=db_path,
+            auto_download_db=auto_download_db,
+        )
+        _default_resolver = resolver
+    return resolver

corp-extractor 0.5.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

corp-extractor 0.5.0-py3-none-any.whl → 0.9.0-py3-none-any.whl