PyPI - corp-extractor - Versions diffs - 0.5.0__py3-none-any.whl → 0.9.0__py3-none-any.whl - Mend

corp-extractor 0.5.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (61) hide show

{corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +191 -24
corp_extractor-0.9.0.dist-info/RECORD +76 -0
statement_extractor/__init__.py +1 -1
statement_extractor/cli.py +1227 -10
statement_extractor/data/statement_taxonomy.json +6949 -1159
statement_extractor/database/__init__.py +52 -0
statement_extractor/database/embeddings.py +186 -0
statement_extractor/database/hub.py +520 -0
statement_extractor/database/importers/__init__.py +24 -0
statement_extractor/database/importers/companies_house.py +545 -0
statement_extractor/database/importers/gleif.py +538 -0
statement_extractor/database/importers/sec_edgar.py +375 -0
statement_extractor/database/importers/wikidata.py +1012 -0
statement_extractor/database/importers/wikidata_people.py +632 -0
statement_extractor/database/models.py +230 -0
statement_extractor/database/resolver.py +245 -0
statement_extractor/database/store.py +1609 -0
statement_extractor/document/__init__.py +62 -0
statement_extractor/document/chunker.py +410 -0
statement_extractor/document/context.py +171 -0
statement_extractor/document/deduplicator.py +173 -0
statement_extractor/document/html_extractor.py +246 -0
statement_extractor/document/loader.py +303 -0
statement_extractor/document/pipeline.py +388 -0
statement_extractor/document/summarizer.py +195 -0
statement_extractor/models/__init__.py +16 -1
statement_extractor/models/canonical.py +44 -1
statement_extractor/models/document.py +308 -0
statement_extractor/models/labels.py +47 -18
statement_extractor/models/qualifiers.py +51 -3
statement_extractor/models/statement.py +26 -0
statement_extractor/pipeline/config.py +6 -11
statement_extractor/pipeline/orchestrator.py +80 -111
statement_extractor/pipeline/registry.py +52 -46
statement_extractor/plugins/__init__.py +20 -8
statement_extractor/plugins/base.py +334 -64
statement_extractor/plugins/extractors/gliner2.py +10 -0
statement_extractor/plugins/labelers/taxonomy.py +18 -5
statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
statement_extractor/plugins/pdf/__init__.py +10 -0
statement_extractor/plugins/pdf/pypdf.py +291 -0
statement_extractor/plugins/qualifiers/__init__.py +11 -0
statement_extractor/plugins/qualifiers/companies_house.py +14 -3
statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
statement_extractor/plugins/qualifiers/gleif.py +14 -3
statement_extractor/plugins/qualifiers/person.py +578 -14
statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
statement_extractor/plugins/scrapers/__init__.py +10 -0
statement_extractor/plugins/scrapers/http.py +236 -0
statement_extractor/plugins/splitters/t5_gemma.py +158 -53
statement_extractor/plugins/taxonomy/embedding.py +193 -46
statement_extractor/plugins/taxonomy/mnli.py +16 -4
statement_extractor/scoring.py +8 -8
corp_extractor-0.5.0.dist-info/RECORD +0 -55
statement_extractor/plugins/canonicalizers/__init__.py +0 -17
statement_extractor/plugins/canonicalizers/base.py +0 -9
statement_extractor/plugins/canonicalizers/location.py +0 -219
statement_extractor/plugins/canonicalizers/organization.py +0 -230
statement_extractor/plugins/canonicalizers/person.py +0 -242
{corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
{corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0

statement_extractor/plugins/pdf/pypdf.py ADDED Viewed

@@ -0,0 +1,291 @@
+"""
+PDF parser plugin using PyMuPDF (fitz) with optional OCR fallback.
+Extracts text from PDFs page by page, with automatic detection of
+image-heavy PDFs that may require OCR.
+"""
+import io
+import logging
+import os
+import tempfile
+from typing import Any, Optional
+from ..base import BasePDFParserPlugin, PDFParseResult
+from ...pipeline.registry import PluginRegistry
+logger = logging.getLogger(__name__)
+@PluginRegistry.pdf_parser
+class PyPDFParserPlugin(BasePDFParserPlugin):
+    """
+    PDF parser using PyMuPDF (fitz) with optional OCR fallback.
+    Features:
+    - Fast text extraction using PyMuPDF
+    - Automatic detection of image-heavy PDFs
+    - Optional OCR fallback using Tesseract
+    - Metadata extraction (title, author, etc.)
+    """
+    def __init__(
+        self,
+        image_threshold: float = 0.5,
+        text_threshold: float = 0.4,
+        use_ocr_fallback: bool = True,
+    ):
+        """
+        Initialize the PDF parser.
+        Args:
+            image_threshold: Images per page threshold for OCR trigger
+            text_threshold: Text density threshold (chars/1000 per page)
+            use_ocr_fallback: Enable automatic OCR for image-heavy PDFs
+        """
+        self._image_threshold = image_threshold
+        self._text_threshold = text_threshold
+        self._use_ocr_fallback = use_ocr_fallback
+    @property
+    def name(self) -> str:
+        """Name of this plugin: ``"pypdf_parser"``."""
+        return "pypdf_parser"
+    @property
+    def priority(self) -> int:
+        """Priority value of this plugin (fixed at 100)."""
+        # NOTE(review): whether a higher or lower number wins is decided by
+        # the plugin registry, which is not visible here - confirm.
+        return 100
+    @property
+    def description(self) -> str:
+        """One-line human-readable description of this parser."""
+        return "PDF parser using PyMuPDF with optional OCR fallback"
+    @property
+    def supports_ocr(self) -> bool:
+        """Whether the automatic OCR fallback was enabled at construction."""
+        # Reflects only the fallback switch; parse(use_ocr=True) still runs
+        # OCR even when this is False.
+        return self._use_ocr_fallback
+    def parse(
+        self,
+        pdf_bytes: bytes,
+        max_pages: int = 500,
+        use_ocr: bool = False,
+    ) -> PDFParseResult:
+        """
+        Extract text from PDF bytes.
+        Args:
+            pdf_bytes: Raw PDF file content
+            max_pages: Maximum number of pages to process
+            use_ocr: Force OCR even for text-extractable PDFs
+        Returns:
+            PDFParseResult with extracted text for each page
+        """
+        try:
+            import fitz  # PyMuPDF
+        except ImportError:
+            return PDFParseResult(
+                pages=[],
+                page_count=0,
+                error="PyMuPDF (fitz) not installed. Install with: pip install PyMuPDF",
+            )
+        temp_path: Optional[str] = None
+        try:
+            # Write bytes to temp file for fitz
+            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f:
+                f.write(pdf_bytes)
+                temp_path = f.name
+            logger.info(f"Parsing PDF: {len(pdf_bytes)} bytes")
+            # Open the PDF
+            pdf_doc = fitz.open(temp_path)
+            total_pages = len(pdf_doc)
+            logger.info(f"PDF has {total_pages} pages")
+            # Check if we should use OCR
+            should_ocr = use_ocr or (
+                self._use_ocr_fallback and self._is_mostly_images(pdf_doc)
+            )
+            if should_ocr:
+                logger.info("PDF appears image-heavy, using OCR")
+                result = self._parse_with_ocr(pdf_doc, max_pages)
+            else:
+                logger.info("PDF has extractable text, using direct extraction")
+                result = self._parse_with_fitz(pdf_doc, max_pages)
+            pdf_doc.close()
+            return result
+        except Exception as e:
+            logger.exception(f"Error parsing PDF: {e}")
+            return PDFParseResult(
+                pages=[],
+                page_count=0,
+                error=f"Failed to parse PDF: {e}",
+            )
+        finally:
+            # Clean up temp file
+            if temp_path and os.path.exists(temp_path):
+                try:
+                    os.unlink(temp_path)
+                except Exception:
+                    pass
+    def _is_mostly_images(self, pdf_doc) -> bool:
+        """
+        Check if PDF is mostly images (may need OCR).
+        Args:
+            pdf_doc: PyMuPDF document object
+        Returns:
+            True if PDF appears to be image-heavy
+        """
+        total_pages = len(pdf_doc)
+        if total_pages == 0:
+            return False
+        # Count images in first few pages
+        sample_pages = min(3, total_pages)
+        image_count = 0
+        for i in range(sample_pages):
+            image_count += len(pdf_doc[i].get_images())
+        avg_images_per_page = image_count / sample_pages
+        # Check text density in sample pages
+        sample_text = ""
+        for i in range(sample_pages):
+            sample_text += pdf_doc[i].get_text()
+        text_density = len(sample_text) / 1000 / sample_pages
+        logger.debug(
+            f"PDF analysis: {avg_images_per_page:.1f} images/page, "
+            f"{text_density:.2f} text density"
+        )
+        # If text density is high, don't use OCR
+        if text_density > self._text_threshold:
+            return False
+        # If many images per page and low text, probably needs OCR
+        return avg_images_per_page > self._image_threshold
+    def _parse_with_fitz(self, pdf_doc, max_pages: int) -> PDFParseResult:
+        """
+        Extract text using PyMuPDF (fast, direct extraction).
+        Args:
+            pdf_doc: PyMuPDF document object
+            max_pages: Maximum pages to process
+        Returns:
+            PDFParseResult with extracted text
+        """
+        pages = []
+        total_pages = len(pdf_doc)
+        for i in range(min(total_pages, max_pages)):
+            page = pdf_doc[i]
+            text = page.get_text()
+            pages.append(text.strip())
+            if (i + 1) % 50 == 0:
+                logger.debug(f"Processed {i + 1}/{min(total_pages, max_pages)} pages")
+        # Extract metadata
+        metadata = self._extract_metadata(pdf_doc)
+        return PDFParseResult(
+            pages=pages,
+            page_count=total_pages,
+            metadata=metadata,
+        )
+    def _parse_with_ocr(self, pdf_doc, max_pages: int) -> PDFParseResult:
+        """
+        Extract text using OCR (Tesseract).
+        Args:
+            pdf_doc: PyMuPDF document object
+            max_pages: Maximum pages to process
+        Returns:
+            PDFParseResult with OCR-extracted text
+        """
+        try:
+            import pytesseract
+            from PIL import Image
+        except ImportError:
+            return PDFParseResult(
+                pages=[],
+                page_count=len(pdf_doc),
+                error="OCR dependencies not installed. Install with: pip install pytesseract Pillow",
+            )
+        pages = []
+        total_pages = len(pdf_doc)
+        for i in range(min(total_pages, max_pages)):
+            page = pdf_doc[i]
+            # Render page to image
+            pix = page.get_pixmap(dpi=150)  # 150 DPI is good balance
+            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
+            # Run OCR
+            text = pytesseract.image_to_string(img)
+            pages.append(text.strip())
+            if (i + 1) % 10 == 0:
+                logger.debug(f"OCR processed {i + 1}/{min(total_pages, max_pages)} pages")
+        # Extract metadata
+        metadata = self._extract_metadata(pdf_doc)
+        return PDFParseResult(
+            pages=pages,
+            page_count=total_pages,
+            metadata=metadata,
+        )
+    @staticmethod
+    def _extract_metadata(pdf_doc) -> dict[str, Any]:
+        """
+        Extract PDF metadata.
+        Args:
+            pdf_doc: PyMuPDF document object
+        Returns:
+            Dictionary of metadata fields
+        """
+        metadata = {}
+        try:
+            doc_metadata = pdf_doc.metadata
+            if doc_metadata:
+                # Map common PDF metadata fields
+                field_map = {
+                    "title": "title",
+                    "author": "author",
+                    "subject": "subject",
+                    "keywords": "keywords",
+                    "creator": "creator",
+                    "producer": "producer",
+                    "creationDate": "created",
+                    "modDate": "modified",
+                }
+                for pdf_key, our_key in field_map.items():
+                    value = doc_metadata.get(pdf_key)
+                    if value and isinstance(value, str) and value.strip():
+                        metadata[our_key] = value.strip()
+        except Exception as e:
+            logger.debug(f"Error extracting metadata: {e}")
+        return metadata

statement_extractor/plugins/qualifiers/__init__.py CHANGED Viewed

@@ -6,6 +6,15 @@ Adds qualifiers and identifiers to entities.
 from .base import BaseQualifierPlugin
 from .person import PersonQualifierPlugin
+# Import embedding qualifier (may fail if database module not available)
+try:
+    from .embedding_company import EmbeddingCompanyQualifier
+except ImportError:
+    EmbeddingCompanyQualifier = None  # type: ignore
+# DEPRECATED: These API-based qualifiers are deprecated in favor of EmbeddingCompanyQualifier
+# They are no longer auto-registered with the plugin registry.
 from .gleif import GLEIFQualifierPlugin
 from .companies_house import CompaniesHouseQualifierPlugin
 from .sec_edgar import SECEdgarQualifierPlugin
@@ -13,6 +22,8 @@ from .sec_edgar import SECEdgarQualifierPlugin
 __all__ = [
     "BaseQualifierPlugin",
     "PersonQualifierPlugin",
+    "EmbeddingCompanyQualifier",
+    # Deprecated - kept for backwards compatibility
     "GLEIFQualifierPlugin",
     "CompaniesHouseQualifierPlugin",
     "SECEdgarQualifierPlugin",

statement_extractor/plugins/qualifiers/companies_house.py CHANGED Viewed

@@ -1,6 +1,9 @@
 """
 CompaniesHouseQualifierPlugin - Qualifies UK ORG entities.
+DEPRECATED: Use EmbeddingCompanyQualifier instead, which uses a local
+embedding database with pre-loaded Companies House data for faster, offline matching.
 Uses the UK Companies House API to:
 - Look up company number by name
 - Retrieve company details, jurisdiction, officers
@@ -8,11 +11,11 @@ Uses the UK Companies House API to:
 import logging
 import os
+import warnings
 from typing import Optional
 from ..base import BaseQualifierPlugin, PluginCapability
 from ...pipeline.context import PipelineContext
-from ...pipeline.registry import PluginRegistry
 from ...models import ExtractedEntity, EntityQualifiers, EntityType
 logger = logging.getLogger(__name__)
@@ -21,11 +24,12 @@ logger = logging.getLogger(__name__)
 CH_API_BASE = "https://api.company-information.service.gov.uk"
-@PluginRegistry.qualifier
+# DEPRECATED: Not auto-registered. Use EmbeddingCompanyQualifier instead.
 class CompaniesHouseQualifierPlugin(BaseQualifierPlugin):
     """
-    Qualifier plugin for UK ORG entities using Companies House API.
+    DEPRECATED: Use EmbeddingCompanyQualifier instead.
+    Qualifier plugin for UK ORG entities using Companies House API.
     Requires COMPANIES_HOUSE_API_KEY environment variable.
     """
@@ -38,11 +42,18 @@ class CompaniesHouseQualifierPlugin(BaseQualifierPlugin):
         """
         Initialize the Companies House qualifier.
+        DEPRECATED: Use EmbeddingCompanyQualifier instead.
         Args:
             api_key: Companies House API key (or use COMPANIES_HOUSE_API_KEY env var)
             timeout: API request timeout in seconds
             cache_results: Whether to cache API results
         """
+        warnings.warn(
+            "CompaniesHouseQualifierPlugin is deprecated. Use EmbeddingCompanyQualifier instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
         self._api_key = api_key or os.environ.get("COMPANIES_HOUSE_API_KEY")
         self._timeout = timeout
         self._cache_results = cache_results

corp-extractor 0.5.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

corp-extractor 0.5.0__py3-none-any.whl → 0.9.0__py3-none-any.whl