corp-extractor 0.4.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +348 -64
  2. corp_extractor-0.9.0.dist-info/RECORD +76 -0
  3. statement_extractor/__init__.py +10 -1
  4. statement_extractor/cli.py +1663 -17
  5. statement_extractor/data/default_predicates.json +368 -0
  6. statement_extractor/data/statement_taxonomy.json +6972 -0
  7. statement_extractor/database/__init__.py +52 -0
  8. statement_extractor/database/embeddings.py +186 -0
  9. statement_extractor/database/hub.py +520 -0
  10. statement_extractor/database/importers/__init__.py +24 -0
  11. statement_extractor/database/importers/companies_house.py +545 -0
  12. statement_extractor/database/importers/gleif.py +538 -0
  13. statement_extractor/database/importers/sec_edgar.py +375 -0
  14. statement_extractor/database/importers/wikidata.py +1012 -0
  15. statement_extractor/database/importers/wikidata_people.py +632 -0
  16. statement_extractor/database/models.py +230 -0
  17. statement_extractor/database/resolver.py +245 -0
  18. statement_extractor/database/store.py +1609 -0
  19. statement_extractor/document/__init__.py +62 -0
  20. statement_extractor/document/chunker.py +410 -0
  21. statement_extractor/document/context.py +171 -0
  22. statement_extractor/document/deduplicator.py +173 -0
  23. statement_extractor/document/html_extractor.py +246 -0
  24. statement_extractor/document/loader.py +303 -0
  25. statement_extractor/document/pipeline.py +388 -0
  26. statement_extractor/document/summarizer.py +195 -0
  27. statement_extractor/extractor.py +1 -23
  28. statement_extractor/gliner_extraction.py +4 -74
  29. statement_extractor/llm.py +255 -0
  30. statement_extractor/models/__init__.py +89 -0
  31. statement_extractor/models/canonical.py +182 -0
  32. statement_extractor/models/document.py +308 -0
  33. statement_extractor/models/entity.py +102 -0
  34. statement_extractor/models/labels.py +220 -0
  35. statement_extractor/models/qualifiers.py +139 -0
  36. statement_extractor/models/statement.py +101 -0
  37. statement_extractor/models.py +4 -1
  38. statement_extractor/pipeline/__init__.py +39 -0
  39. statement_extractor/pipeline/config.py +129 -0
  40. statement_extractor/pipeline/context.py +177 -0
  41. statement_extractor/pipeline/orchestrator.py +416 -0
  42. statement_extractor/pipeline/registry.py +303 -0
  43. statement_extractor/plugins/__init__.py +55 -0
  44. statement_extractor/plugins/base.py +716 -0
  45. statement_extractor/plugins/extractors/__init__.py +13 -0
  46. statement_extractor/plugins/extractors/base.py +9 -0
  47. statement_extractor/plugins/extractors/gliner2.py +546 -0
  48. statement_extractor/plugins/labelers/__init__.py +29 -0
  49. statement_extractor/plugins/labelers/base.py +9 -0
  50. statement_extractor/plugins/labelers/confidence.py +138 -0
  51. statement_extractor/plugins/labelers/relation_type.py +87 -0
  52. statement_extractor/plugins/labelers/sentiment.py +159 -0
  53. statement_extractor/plugins/labelers/taxonomy.py +386 -0
  54. statement_extractor/plugins/labelers/taxonomy_embedding.py +477 -0
  55. statement_extractor/plugins/pdf/__init__.py +10 -0
  56. statement_extractor/plugins/pdf/pypdf.py +291 -0
  57. statement_extractor/plugins/qualifiers/__init__.py +30 -0
  58. statement_extractor/plugins/qualifiers/base.py +9 -0
  59. statement_extractor/plugins/qualifiers/companies_house.py +185 -0
  60. statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
  61. statement_extractor/plugins/qualifiers/gleif.py +197 -0
  62. statement_extractor/plugins/qualifiers/person.py +785 -0
  63. statement_extractor/plugins/qualifiers/sec_edgar.py +209 -0
  64. statement_extractor/plugins/scrapers/__init__.py +10 -0
  65. statement_extractor/plugins/scrapers/http.py +236 -0
  66. statement_extractor/plugins/splitters/__init__.py +13 -0
  67. statement_extractor/plugins/splitters/base.py +9 -0
  68. statement_extractor/plugins/splitters/t5_gemma.py +293 -0
  69. statement_extractor/plugins/taxonomy/__init__.py +13 -0
  70. statement_extractor/plugins/taxonomy/embedding.py +484 -0
  71. statement_extractor/plugins/taxonomy/mnli.py +291 -0
  72. statement_extractor/scoring.py +8 -8
  73. corp_extractor-0.4.0.dist-info/RECORD +0 -12
  74. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
  75. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,291 @@
1
+ """
2
+ PDF parser plugin using PyMuPDF (fitz) with optional OCR fallback.
3
+
4
+ Extracts text from PDFs page by page, with automatic detection of
5
+ image-heavy PDFs that may require OCR.
6
+ """
7
+
8
+ import io
9
+ import logging
10
+ import os
11
+ import tempfile
12
+ from typing import Any, Optional
13
+
14
+ from ..base import BasePDFParserPlugin, PDFParseResult
15
+ from ...pipeline.registry import PluginRegistry
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
@PluginRegistry.pdf_parser
class PyPDFParserPlugin(BasePDFParserPlugin):
    """
    PDF parser using PyMuPDF (fitz) with optional OCR fallback.

    NOTE: despite the "PyPDF" name, this plugin is backed by PyMuPDF (fitz),
    not the pypdf library.

    Features:
    - Fast text extraction using PyMuPDF
    - Automatic detection of image-heavy PDFs
    - Optional OCR fallback using Tesseract
    - Metadata extraction (title, author, etc.)
    """

    def __init__(
        self,
        image_threshold: float = 0.5,
        text_threshold: float = 0.4,
        use_ocr_fallback: bool = True,
    ):
        """
        Initialize the PDF parser.

        Args:
            image_threshold: Images per page threshold for OCR trigger
            text_threshold: Text density threshold (chars/1000 per page)
            use_ocr_fallback: Enable automatic OCR for image-heavy PDFs
        """
        self._image_threshold = image_threshold
        self._text_threshold = text_threshold
        self._use_ocr_fallback = use_ocr_fallback

    @property
    def name(self) -> str:
        return "pypdf_parser"

    @property
    def priority(self) -> int:
        return 100

    @property
    def description(self) -> str:
        return "PDF parser using PyMuPDF with optional OCR fallback"

    @property
    def supports_ocr(self) -> bool:
        return self._use_ocr_fallback

    def parse(
        self,
        pdf_bytes: bytes,
        max_pages: int = 500,
        use_ocr: bool = False,
    ) -> PDFParseResult:
        """
        Extract text from PDF bytes.

        Args:
            pdf_bytes: Raw PDF file content
            max_pages: Maximum number of pages to process
            use_ocr: Force OCR even for text-extractable PDFs

        Returns:
            PDFParseResult with extracted text for each page; on failure,
            a PDFParseResult with an ``error`` message and no pages.
        """
        try:
            import fitz  # PyMuPDF
        except ImportError:
            return PDFParseResult(
                pages=[],
                page_count=0,
                error="PyMuPDF (fitz) not installed. Install with: pip install PyMuPDF",
            )

        try:
            logger.info(f"Parsing PDF: {len(pdf_bytes)} bytes")

            # Open directly from memory: fitz supports in-memory streams,
            # so no temp-file round trip is needed.
            pdf_doc = fitz.open(stream=pdf_bytes, filetype="pdf")
            try:
                total_pages = len(pdf_doc)
                logger.info(f"PDF has {total_pages} pages")

                # Decide between direct extraction and OCR: caller can force
                # OCR, otherwise fall back only for image-heavy documents.
                should_ocr = use_ocr or (
                    self._use_ocr_fallback and self._is_mostly_images(pdf_doc)
                )

                if should_ocr:
                    logger.info("PDF appears image-heavy, using OCR")
                    return self._parse_with_ocr(pdf_doc, max_pages)

                logger.info("PDF has extractable text, using direct extraction")
                return self._parse_with_fitz(pdf_doc, max_pages)
            finally:
                # Always release the document, including on error paths
                # (the original leaked it when an exception was raised).
                pdf_doc.close()

        except Exception as e:
            logger.exception(f"Error parsing PDF: {e}")
            return PDFParseResult(
                pages=[],
                page_count=0,
                error=f"Failed to parse PDF: {e}",
            )

    def _is_mostly_images(self, pdf_doc) -> bool:
        """
        Check if PDF is mostly images (may need OCR).

        Samples up to the first 3 pages and compares image count and text
        density against the configured thresholds.

        Args:
            pdf_doc: PyMuPDF document object

        Returns:
            True if PDF appears to be image-heavy
        """
        total_pages = len(pdf_doc)
        if total_pages == 0:
            return False

        # Sample images and text from the first few pages in one pass.
        sample_pages = min(3, total_pages)
        image_count = 0
        sample_text = ""
        for i in range(sample_pages):
            page = pdf_doc[i]
            image_count += len(page.get_images())
            sample_text += page.get_text()

        avg_images_per_page = image_count / sample_pages
        # Text density is measured in thousands of characters per page.
        text_density = len(sample_text) / 1000 / sample_pages

        logger.debug(
            f"PDF analysis: {avg_images_per_page:.1f} images/page, "
            f"{text_density:.2f} text density"
        )

        # If text density is high, don't use OCR.
        if text_density > self._text_threshold:
            return False

        # If many images per page and low text, probably needs OCR.
        return avg_images_per_page > self._image_threshold

    def _parse_with_fitz(self, pdf_doc, max_pages: int) -> PDFParseResult:
        """
        Extract text using PyMuPDF (fast, direct extraction).

        Args:
            pdf_doc: PyMuPDF document object
            max_pages: Maximum pages to process

        Returns:
            PDFParseResult with extracted text. ``page_count`` reflects the
            full document even when extraction stops at max_pages.
        """
        pages = []
        total_pages = len(pdf_doc)
        page_limit = min(total_pages, max_pages)

        for i in range(page_limit):
            text = pdf_doc[i].get_text()
            pages.append(text.strip())

            if (i + 1) % 50 == 0:
                logger.debug(f"Processed {i + 1}/{page_limit} pages")

        return PDFParseResult(
            pages=pages,
            page_count=total_pages,
            metadata=self._extract_metadata(pdf_doc),
        )

    def _parse_with_ocr(self, pdf_doc, max_pages: int) -> PDFParseResult:
        """
        Extract text using OCR (Tesseract).

        Renders each page to a 150-DPI image and runs pytesseract on it.

        Args:
            pdf_doc: PyMuPDF document object
            max_pages: Maximum pages to process

        Returns:
            PDFParseResult with OCR-extracted text, or an error result if
            the OCR dependencies are not installed.
        """
        try:
            import pytesseract
            from PIL import Image
        except ImportError:
            return PDFParseResult(
                pages=[],
                page_count=len(pdf_doc),
                error="OCR dependencies not installed. Install with: pip install pytesseract Pillow",
            )

        pages = []
        total_pages = len(pdf_doc)
        page_limit = min(total_pages, max_pages)

        for i in range(page_limit):
            page = pdf_doc[i]

            # Render page to image (150 DPI is a good speed/quality balance).
            pix = page.get_pixmap(dpi=150)
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

            # Run OCR
            text = pytesseract.image_to_string(img)
            pages.append(text.strip())

            if (i + 1) % 10 == 0:
                logger.debug(f"OCR processed {i + 1}/{page_limit} pages")

        return PDFParseResult(
            pages=pages,
            page_count=total_pages,
            metadata=self._extract_metadata(pdf_doc),
        )

    @staticmethod
    def _extract_metadata(pdf_doc) -> dict[str, Any]:
        """
        Extract PDF metadata.

        Args:
            pdf_doc: PyMuPDF document object

        Returns:
            Dictionary of metadata fields (only non-empty string values),
            with PDF metadata keys mapped to our canonical names.
        """
        metadata = {}

        try:
            doc_metadata = pdf_doc.metadata
            if doc_metadata:
                # Map common PDF metadata fields
                field_map = {
                    "title": "title",
                    "author": "author",
                    "subject": "subject",
                    "keywords": "keywords",
                    "creator": "creator",
                    "producer": "producer",
                    "creationDate": "created",
                    "modDate": "modified",
                }

                for pdf_key, our_key in field_map.items():
                    value = doc_metadata.get(pdf_key)
                    if value and isinstance(value, str) and value.strip():
                        metadata[our_key] = value.strip()
        except Exception as e:
            # Metadata is best-effort: never fail the parse because of it.
            logger.debug(f"Error extracting metadata: {e}")

        return metadata
@@ -0,0 +1,30 @@
1
+ """
2
+ Qualifier plugins for Stage 3 (Qualification).
3
+
4
+ Adds qualifiers and identifiers to entities.
5
+ """
6
+
7
+ from .base import BaseQualifierPlugin
8
+ from .person import PersonQualifierPlugin
9
+
10
+ # Import embedding qualifier (may fail if database module not available)
11
+ try:
12
+ from .embedding_company import EmbeddingCompanyQualifier
13
+ except ImportError:
14
+ EmbeddingCompanyQualifier = None # type: ignore
15
+
16
+ # DEPRECATED: These API-based qualifiers are deprecated in favor of EmbeddingCompanyQualifier
17
+ # They are no longer auto-registered with the plugin registry.
18
+ from .gleif import GLEIFQualifierPlugin
19
+ from .companies_house import CompaniesHouseQualifierPlugin
20
+ from .sec_edgar import SECEdgarQualifierPlugin
21
+
22
+ __all__ = [
23
+ "BaseQualifierPlugin",
24
+ "PersonQualifierPlugin",
25
+ "EmbeddingCompanyQualifier",
26
+ # Deprecated - kept for backwards compatibility
27
+ "GLEIFQualifierPlugin",
28
+ "CompaniesHouseQualifierPlugin",
29
+ "SECEdgarQualifierPlugin",
30
+ ]
@@ -0,0 +1,9 @@
1
+ """
2
+ Base class for qualifier plugins.
3
+
4
+ Re-exports BaseQualifierPlugin from the main plugins module.
5
+ """
6
+
7
+ from ..base import BaseQualifierPlugin
8
+
9
+ __all__ = ["BaseQualifierPlugin"]
@@ -0,0 +1,185 @@
1
+ """
2
+ CompaniesHouseQualifierPlugin - Qualifies UK ORG entities.
3
+
4
+ DEPRECATED: Use EmbeddingCompanyQualifier instead, which uses a local
5
+ embedding database with pre-loaded Companies House data for faster, offline matching.
6
+
7
+ Uses the UK Companies House API to:
8
+ - Look up company number by name
9
+ - Retrieve company details, jurisdiction, officers
10
+ """
11
+
12
+ import logging
13
+ import os
14
+ import warnings
15
+ from typing import Optional
16
+
17
+ from ..base import BaseQualifierPlugin, PluginCapability
18
+ from ...pipeline.context import PipelineContext
19
+ from ...models import ExtractedEntity, EntityQualifiers, EntityType
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+ # Companies House API base URL
24
+ CH_API_BASE = "https://api.company-information.service.gov.uk"
25
+
26
+
27
+ # DEPRECATED: Not auto-registered. Use EmbeddingCompanyQualifier instead.
28
+ class CompaniesHouseQualifierPlugin(BaseQualifierPlugin):
29
+ """
30
+ DEPRECATED: Use EmbeddingCompanyQualifier instead.
31
+
32
+ Qualifier plugin for UK ORG entities using Companies House API.
33
+ Requires COMPANIES_HOUSE_API_KEY environment variable.
34
+ """
35
+
36
+ def __init__(
37
+ self,
38
+ api_key: Optional[str] = None,
39
+ timeout: int = 10,
40
+ cache_results: bool = True,
41
+ ):
42
+ """
43
+ Initialize the Companies House qualifier.
44
+
45
+ DEPRECATED: Use EmbeddingCompanyQualifier instead.
46
+
47
+ Args:
48
+ api_key: Companies House API key (or use COMPANIES_HOUSE_API_KEY env var)
49
+ timeout: API request timeout in seconds
50
+ cache_results: Whether to cache API results
51
+ """
52
+ warnings.warn(
53
+ "CompaniesHouseQualifierPlugin is deprecated. Use EmbeddingCompanyQualifier instead.",
54
+ DeprecationWarning,
55
+ stacklevel=2,
56
+ )
57
+ self._api_key = api_key or os.environ.get("COMPANIES_HOUSE_API_KEY")
58
+ self._timeout = timeout
59
+ self._cache_results = cache_results
60
+ self._cache: dict[str, Optional[dict]] = {}
61
+
62
+ @property
63
+ def name(self) -> str:
64
+ return "companies_house_qualifier"
65
+
66
+ @property
67
+ def priority(self) -> int:
68
+ return 20 # Run after GLEIF
69
+
70
+ @property
71
+ def capabilities(self) -> PluginCapability:
72
+ return PluginCapability.EXTERNAL_API | PluginCapability.CACHING
73
+
74
+ @property
75
+ def description(self) -> str:
76
+ return "Looks up UK company data from Companies House API"
77
+
78
+ @property
79
+ def supported_entity_types(self) -> set[EntityType]:
80
+ return {EntityType.ORG}
81
+
82
+ @property
83
+ def supported_identifier_types(self) -> list[str]:
84
+ return ["ch_number"] # Can lookup by company number
85
+
86
+ @property
87
+ def provided_identifier_types(self) -> list[str]:
88
+ return ["ch_number"] # Provides company number
89
+
90
+ def qualify(
91
+ self,
92
+ entity: ExtractedEntity,
93
+ context: PipelineContext,
94
+ ) -> Optional[EntityQualifiers]:
95
+ """
96
+ Qualify an ORG entity with Companies House data.
97
+
98
+ Args:
99
+ entity: The ORG entity to qualify
100
+ context: Pipeline context
101
+
102
+ Returns:
103
+ EntityQualifiers with company number, or None if not found
104
+ """
105
+ if entity.type != EntityType.ORG:
106
+ return None
107
+
108
+ if not self._api_key:
109
+ logger.debug("Companies House API key not configured")
110
+ return None
111
+
112
+ # Check cache first
113
+ cache_key = entity.text.lower().strip()
114
+ if self._cache_results and cache_key in self._cache:
115
+ cached = self._cache[cache_key]
116
+ if cached is None:
117
+ return None
118
+ return self._data_to_qualifiers(cached)
119
+
120
+ # Search Companies House API
121
+ result = self._search_companies_house(entity.text)
122
+
123
+ # Cache result
124
+ if self._cache_results:
125
+ self._cache[cache_key] = result
126
+
127
+ if result:
128
+ return self._data_to_qualifiers(result)
129
+
130
+ return None
131
+
132
+ def _search_companies_house(self, org_name: str) -> Optional[dict]:
133
+ """Search Companies House API for organization."""
134
+ try:
135
+ import requests
136
+ from requests.auth import HTTPBasicAuth
137
+
138
+ url = f"{CH_API_BASE}/search/companies"
139
+ params = {"q": org_name, "items_per_page": 5}
140
+
141
+ response = requests.get(
142
+ url,
143
+ params=params,
144
+ auth=HTTPBasicAuth(self._api_key, ""),
145
+ timeout=self._timeout,
146
+ )
147
+ response.raise_for_status()
148
+ data = response.json()
149
+
150
+ items = data.get("items", [])
151
+ if items:
152
+ # Return first match
153
+ company = items[0]
154
+ return {
155
+ "ch_number": company.get("company_number", ""),
156
+ "title": company.get("title", ""),
157
+ "company_status": company.get("company_status", ""),
158
+ "company_type": company.get("company_type", ""),
159
+ "jurisdiction": "UK",
160
+ "country": "GB",
161
+ "address": company.get("address_snippet", ""),
162
+ }
163
+
164
+ except ImportError:
165
+ logger.warning("requests library not available for Companies House API")
166
+ except Exception as e:
167
+ logger.debug(f"Companies House API error: {e}")
168
+
169
+ return None
170
+
171
+ def _data_to_qualifiers(self, data: dict) -> EntityQualifiers:
172
+ """Convert Companies House data to EntityQualifiers."""
173
+ identifiers = {}
174
+ if data.get("ch_number"):
175
+ identifiers["ch_number"] = data["ch_number"]
176
+
177
+ return EntityQualifiers(
178
+ jurisdiction=data.get("jurisdiction"),
179
+ country=data.get("country"),
180
+ identifiers=identifiers,
181
+ )
182
+
183
+
184
+ # Allow importing without decorator for testing
185
+ CompaniesHouseQualifierPluginClass = CompaniesHouseQualifierPlugin