corp-extractor 0.5.0-py3-none-any.whl → 0.9.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/METADATA +228 -30
- corp_extractor-0.9.3.dist-info/RECORD +79 -0
- statement_extractor/__init__.py +1 -1
- statement_extractor/cli.py +2030 -24
- statement_extractor/data/statement_taxonomy.json +6949 -1159
- statement_extractor/database/__init__.py +52 -0
- statement_extractor/database/embeddings.py +186 -0
- statement_extractor/database/hub.py +428 -0
- statement_extractor/database/importers/__init__.py +32 -0
- statement_extractor/database/importers/companies_house.py +559 -0
- statement_extractor/database/importers/companies_house_officers.py +431 -0
- statement_extractor/database/importers/gleif.py +561 -0
- statement_extractor/database/importers/sec_edgar.py +392 -0
- statement_extractor/database/importers/sec_form4.py +512 -0
- statement_extractor/database/importers/wikidata.py +1120 -0
- statement_extractor/database/importers/wikidata_dump.py +1951 -0
- statement_extractor/database/importers/wikidata_people.py +1130 -0
- statement_extractor/database/models.py +254 -0
- statement_extractor/database/resolver.py +245 -0
- statement_extractor/database/store.py +3034 -0
- statement_extractor/document/__init__.py +62 -0
- statement_extractor/document/chunker.py +410 -0
- statement_extractor/document/context.py +171 -0
- statement_extractor/document/deduplicator.py +171 -0
- statement_extractor/document/html_extractor.py +246 -0
- statement_extractor/document/loader.py +303 -0
- statement_extractor/document/pipeline.py +388 -0
- statement_extractor/document/summarizer.py +195 -0
- statement_extractor/extractor.py +1 -1
- statement_extractor/models/__init__.py +19 -3
- statement_extractor/models/canonical.py +44 -1
- statement_extractor/models/document.py +308 -0
- statement_extractor/models/labels.py +47 -18
- statement_extractor/models/qualifiers.py +51 -3
- statement_extractor/models/statement.py +39 -15
- statement_extractor/models.py +1 -1
- statement_extractor/pipeline/config.py +6 -11
- statement_extractor/pipeline/context.py +5 -5
- statement_extractor/pipeline/orchestrator.py +90 -121
- statement_extractor/pipeline/registry.py +52 -46
- statement_extractor/plugins/__init__.py +20 -8
- statement_extractor/plugins/base.py +348 -78
- statement_extractor/plugins/extractors/gliner2.py +38 -28
- statement_extractor/plugins/labelers/taxonomy.py +18 -5
- statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
- statement_extractor/plugins/pdf/__init__.py +10 -0
- statement_extractor/plugins/pdf/pypdf.py +291 -0
- statement_extractor/plugins/qualifiers/__init__.py +11 -0
- statement_extractor/plugins/qualifiers/companies_house.py +14 -3
- statement_extractor/plugins/qualifiers/embedding_company.py +422 -0
- statement_extractor/plugins/qualifiers/gleif.py +14 -3
- statement_extractor/plugins/qualifiers/person.py +588 -14
- statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
- statement_extractor/plugins/scrapers/__init__.py +10 -0
- statement_extractor/plugins/scrapers/http.py +236 -0
- statement_extractor/plugins/splitters/t5_gemma.py +176 -75
- statement_extractor/plugins/taxonomy/embedding.py +193 -46
- statement_extractor/plugins/taxonomy/mnli.py +16 -4
- statement_extractor/scoring.py +8 -8
- corp_extractor-0.5.0.dist-info/RECORD +0 -55
- statement_extractor/plugins/canonicalizers/__init__.py +0 -17
- statement_extractor/plugins/canonicalizers/base.py +0 -9
- statement_extractor/plugins/canonicalizers/location.py +0 -219
- statement_extractor/plugins/canonicalizers/organization.py +0 -230
- statement_extractor/plugins/canonicalizers/person.py +0 -242
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/WHEEL +0 -0
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/entry_points.txt +0 -0
statement_extractor/plugins/extractors/gliner2.py
@@ -1,9 +1,9 @@
 """
-GLiNER2Extractor - Stage 2 plugin that
+GLiNER2Extractor - Stage 2 plugin that extracts triples from sentences.
 
 Uses GLiNER2 for:
-1. Entity extraction:
-2. Relation extraction:
+1. Entity extraction: Identify subject/object entities with types
+2. Relation extraction: Extract predicates using predicate list
 3. Entity scoring: Score how entity-like subjects/objects are
 4. Classification: Run labeler classification schemas in single pass
 """
@@ -16,7 +16,7 @@ from typing import Optional
 from ..base import BaseExtractorPlugin, ClassificationSchema, PluginCapability
 from ...pipeline.context import PipelineContext
 from ...pipeline.registry import PluginRegistry
-from ...models import
+from ...models import SplitSentence, PipelineStatement, ExtractedEntity, EntityType
 
 logger = logging.getLogger(__name__)
 
@@ -110,11 +110,11 @@ GLINER_TYPE_MAP = {
 @PluginRegistry.extractor
 class GLiNER2Extractor(BaseExtractorPlugin):
     """
-    Extractor plugin that uses GLiNER2 for entity and relation
+    Extractor plugin that uses GLiNER2 for entity and relation extraction.
 
-    Processes
-    objects with typed entities.
-    labeler plugins in a single pass.
+    Processes split sentences from Stage 1 and produces PipelineStatement
+    objects with subject-predicate-object triples and typed entities.
+    Also runs classification schemas from labeler plugins in a single pass.
     """
 
     def __init__(
@@ -180,6 +180,16 @@ class GLiNER2Extractor(BaseExtractorPlugin):
     def description(self) -> str:
         return "GLiNER2 model for entity and relation extraction"
 
+    @property
+    def model_vram_gb(self) -> float:
+        """GLiNER2 model weights ~0.8GB."""
+        return 0.8
+
+    @property
+    def per_item_vram_gb(self) -> float:
+        """Each triple during batch processing ~0.1GB."""
+        return 0.1
+
     def _get_model(self):
         """Lazy-load the GLiNER2 model."""
         if self._model is None:
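The two new properties above expose a simple VRAM budget: a fixed cost for the model weights plus a per-item cost during batching. As a rough sketch of how a caller could use them to size batches — the `gpu_budget_gb` figure and the `plan_batch_size` helper are hypothetical, not part of the package:

```python
def plan_batch_size(model_vram_gb: float, per_item_vram_gb: float,
                    gpu_budget_gb: float = 8.0) -> int:
    """Hypothetical helper: fit model weights plus N in-flight items into a VRAM budget."""
    headroom = gpu_budget_gb - model_vram_gb
    if headroom <= 0:
        return 1  # not enough room for weights plus a batch; process one item at a time
    return max(1, int(headroom / per_item_vram_gb))

# With the figures reported by GLiNER2Extractor (0.8 GB weights, 0.1 GB per triple),
# an 8 GB budget leaves room for roughly 70 items per batch.
print(plan_batch_size(0.8, 0.1))
```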
@@ -199,36 +209,36 @@ class GLiNER2Extractor(BaseExtractorPlugin):
 
     def extract(
         self,
-
+        split_sentences: list[SplitSentence],
         context: PipelineContext,
     ) -> list[PipelineStatement]:
         """
-        Extract
+        Extract subject-predicate-object triples from split sentences using GLiNER2.
 
         Returns ALL matching relations from GLiNER2 (not just the best one).
         Also runs any classification schemas and stores results in context.
 
         Args:
-
+            split_sentences: Atomic sentences from Stage 1
             context: Pipeline context
 
         Returns:
-            List of PipelineStatement objects (may contain multiple per
+            List of PipelineStatement objects (may contain multiple per sentence)
         """
         predicate_categories = self._get_predicate_categories()
-        logger.info(f"GLiNER2Extractor processing {len(
+        logger.info(f"GLiNER2Extractor processing {len(split_sentences)} sentences")
         logger.info(f"Using {len(predicate_categories)} predicate categories")
 
         statements = []
         model = self._get_model()
         classified_texts: set[str] = set()
 
-        for
+        for sentence in split_sentences:
             try:
                 if model:
                     # Use relation extraction iterating through categories
                     # Returns ALL matches, not just the best one
-                    extracted_stmts = self._extract_with_relations(
+                    extracted_stmts = self._extract_with_relations(sentence, model, predicate_categories)
                 else:
                     # No model available - skip
                     logger.warning("No GLiNER2 model available - skipping extraction")
@@ -243,10 +253,10 @@ class GLiNER2Extractor(BaseExtractorPlugin):
                     classified_texts.add(stmt.source_text)
 
             except Exception as e:
-                logger.warning(f"Error extracting
-                # No fallback - skip this
+                logger.warning(f"Error extracting from sentence: {e}")
+                # No fallback - skip this sentence
 
-        logger.info(f"GLiNER2Extractor produced {len(statements)} statements from {len(
+        logger.info(f"GLiNER2Extractor produced {len(statements)} statements from {len(split_sentences)} sentences")
         return statements
 
     def _run_classifications(
@@ -306,7 +316,7 @@ class GLiNER2Extractor(BaseExtractorPlugin):
 
     def _extract_with_relations(
         self,
-
+        sentence: SplitSentence,
         model,
         predicate_categories: dict[str, dict[str, PredicateConfig]],
     ) -> list[PipelineStatement]:
@@ -318,14 +328,14 @@ class GLiNER2Extractor(BaseExtractorPlugin):
         Returns ALL matching relations, not just the best one.
 
         Args:
-
+            sentence: Split sentence from Stage 1
             model: GLiNER2 model instance
             predicate_categories: Dict of category -> predicates to use
 
         Returns:
             List of PipelineStatements for all relations found
         """
-        logger.debug(f"Attempting relation extraction for: '{
+        logger.debug(f"Attempting relation extraction for: '{sentence.text[:80]}...'")
 
         # Iterate through each category separately to stay under GLiNER2's ~25 label limit
         # Use schema API with entities + relations together for better extraction
@@ -345,7 +355,7 @@ class GLiNER2Extractor(BaseExtractorPlugin):
                 .entities(self._get_entity_types())
                 .relations(relations_dict)
             )
-            result = model.extract(
+            result = model.extract(sentence.text, schema, include_confidence=True)
 
             # Get relations from this category
             relation_data = result.get("relations", result.get("relation_extraction", {}))
@@ -369,7 +379,7 @@ class GLiNER2Extractor(BaseExtractorPlugin):
         logger.debug(f"  GLiNER2 found {total_found} total relations across all categories")
 
         if not all_relations:
-            logger.debug(f"No GLiNER2 relation match in: '{
+            logger.debug(f"No GLiNER2 relation match in: '{sentence.text[:60]}...'")
             return []
 
         # Filter by confidence threshold and sort descending
@@ -392,8 +402,8 @@ class GLiNER2Extractor(BaseExtractorPlugin):
             )
 
             # Get entity types
-            subj_type = self._infer_entity_type(head, model,
-            obj_type = self._infer_entity_type(tail, model,
+            subj_type = self._infer_entity_type(head, model, sentence.text)
+            obj_type = self._infer_entity_type(tail, model, sentence.text)
             logger.debug(f"  Entity types: {subj_type.value}, {obj_type.value}")
 
             stmt = PipelineStatement(
@@ -409,7 +419,7 @@ class GLiNER2Extractor(BaseExtractorPlugin):
                     type=obj_type,
                     confidence=confidence,
                 ),
-                source_text=
+                source_text=sentence.text,
                 confidence_score=confidence,
                 extraction_method="gliner_relation",
             )
@@ -419,7 +429,7 @@ class GLiNER2Extractor(BaseExtractorPlugin):
 
     def _extract_with_entities(
         self,
-
+        sentence: SplitSentence,
         model,
     ) -> Optional[PipelineStatement]:
         """
@@ -428,7 +438,7 @@ class GLiNER2Extractor(BaseExtractorPlugin):
         This method is called when predicates are disabled. Without GLiNER2 relation
         extraction, we cannot form valid statements.
         """
-        logger.debug(f"Entity extraction mode (no predicates) - skipping: '{
+        logger.debug(f"Entity extraction mode (no predicates) - skipping: '{sentence.text[:60]}...'")
         return None
 
     def _parse_relation(self, rel) -> tuple[str, str, float]:
statement_extractor/plugins/labelers/taxonomy.py
@@ -8,9 +8,19 @@ there are too many possible values for simple multi-choice classification.
 import json
 import logging
 from pathlib import Path
-from typing import Optional
+from typing import Optional, TypedDict
 
 from ..base import BaseLabelerPlugin, TaxonomySchema, PluginCapability
+
+
+class TaxonomyEntry(TypedDict):
+    """Structure for each taxonomy label entry."""
+    description: str
+    id: int
+    mnli_label: str
+    embedding_label: str
+
+
 from ...pipeline.context import PipelineContext
 from ...models import (
     PipelineStatement,
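The `TaxonomyEntry` TypedDict implies that statement_taxonomy.json maps category names to label names to entries with these four fields. A hypothetical entry, with invented category, label, and values, purely to illustrate the shape:

```python
# Invented example data matching the TaxonomyEntry shape; not taken from statement_taxonomy.json.
taxonomy: dict[str, dict[str, TaxonomyEntry]] = {
    "governance": {
        "board_appointment": {
            "description": "A person is appointed to a company's board",
            "id": 42,
            "mnli_label": "This statement describes a board appointment",
            "embedding_label": "appointment to a board of directors",
        }
    }
}
label_id = taxonomy["governance"]["board_appointment"]["id"]  # -> 42
```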
@@ -214,7 +224,7 @@ class TaxonomyLabeler(BaseLabelerPlugin):
         self._top_k_categories = top_k_categories
         self._min_confidence = min_confidence
 
-        self._taxonomy: Optional[dict[str, dict[str,
+        self._taxonomy: Optional[dict[str, dict[str, TaxonomyEntry]]] = None
         self._classifier: Optional[TaxonomyClassifier] = None
 
     @property
@@ -250,7 +260,7 @@ class TaxonomyLabeler(BaseLabelerPlugin):
             scope="statement",
         )
 
-    def _load_taxonomy(self) -> dict[str, dict[str,
+    def _load_taxonomy(self) -> dict[str, dict[str, TaxonomyEntry]]:
         """Load taxonomy from JSON file."""
         if self._taxonomy is not None:
             return self._taxonomy
@@ -358,12 +368,15 @@ class TaxonomyLabeler(BaseLabelerPlugin):
         taxonomy = self._load_taxonomy()
 
         if category and category in taxonomy:
-
+            entry = taxonomy[category].get(label)
+            if entry:
+                return entry.get("id")
 
         # Search all categories for flat classification
         for cat_labels in taxonomy.values():
             if label in cat_labels:
-
+                entry = cat_labels[label]
+                return entry.get("id")
 
         return None
 
statement_extractor/plugins/labelers/taxonomy_embedding.py
@@ -11,10 +11,19 @@ import json
 import logging
 import time
 from pathlib import Path
-from typing import Optional
+from typing import Optional, TypedDict
 
 import numpy as np
 
+
+class TaxonomyEntry(TypedDict):
+    """Structure for each taxonomy label entry."""
+    description: str
+    id: int
+    mnli_label: str
+    embedding_label: str
+
+
 from ..base import BaseLabelerPlugin, TaxonomySchema, PluginCapability
 from ...pipeline.context import PipelineContext
 from ...models import (
@@ -106,14 +115,14 @@ class EmbeddingClassifier:
 
     def precompute_label_embeddings(
         self,
-        taxonomy: dict[str, dict[str,
+        taxonomy: dict[str, dict[str, TaxonomyEntry]],
         categories: Optional[list[str]] = None,
     ) -> None:
         """
         Pre-compute embeddings for all label names.
 
         Args:
-            taxonomy: Taxonomy dict {category: {label:
+            taxonomy: Taxonomy dict {category: {label: TaxonomyEntry, ...}, ...}
             categories: Categories to include (default: all)
         """
         self._load_model()
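Pre-computing one embedding per label means classification can reduce to a similarity search at query time. A minimal sketch of that idea with plain numpy; the function and its inputs are assumptions for illustration, not the plugin's actual API:

```python
import numpy as np

def rank_labels(statement_vec: np.ndarray,
                label_vecs: dict[str, np.ndarray],
                top_k: int = 3) -> list[tuple[str, float]]:
    """Rank labels by cosine similarity against precomputed label embeddings."""
    def cosine(a: np.ndarray, b: np.ndarray) -> float:
        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
    scores = [(label, cosine(statement_vec, vec)) for label, vec in label_vecs.items()]
    return sorted(scores, key=lambda pair: pair[1], reverse=True)[:top_k]
```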
@@ -314,7 +323,7 @@ class EmbeddingTaxonomyLabeler(BaseLabelerPlugin):
         self._top_k_categories = top_k_categories
         self._min_confidence = min_confidence
 
-        self._taxonomy: Optional[dict[str, dict[str,
+        self._taxonomy: Optional[dict[str, dict[str, TaxonomyEntry]]] = None
         self._classifier: Optional[EmbeddingClassifier] = None
         self._embeddings_computed = False
 
@@ -350,7 +359,7 @@ class EmbeddingTaxonomyLabeler(BaseLabelerPlugin):
             scope="statement",
         )
 
-    def _load_taxonomy(self) -> dict[str, dict[str,
+    def _load_taxonomy(self) -> dict[str, dict[str, TaxonomyEntry]]:
         """Load taxonomy from JSON file."""
         if self._taxonomy is not None:
             return self._taxonomy
@@ -456,7 +465,9 @@ class EmbeddingTaxonomyLabeler(BaseLabelerPlugin):
         taxonomy = self._load_taxonomy()
 
         if category in taxonomy:
-
+            entry = taxonomy[category].get(label)
+            if entry:
+                return entry.get("id")
 
         return None
 
statement_extractor/plugins/pdf/pypdf.py (new file)
@@ -0,0 +1,291 @@
+"""
+PDF parser plugin using PyMuPDF (fitz) with optional OCR fallback.
+
+Extracts text from PDFs page by page, with automatic detection of
+image-heavy PDFs that may require OCR.
+"""
+
+import io
+import logging
+import os
+import tempfile
+from typing import Any, Optional
+
+from ..base import BasePDFParserPlugin, PDFParseResult
+from ...pipeline.registry import PluginRegistry
+
+logger = logging.getLogger(__name__)
+
+
+@PluginRegistry.pdf_parser
+class PyPDFParserPlugin(BasePDFParserPlugin):
+    """
+    PDF parser using PyMuPDF (fitz) with optional OCR fallback.
+
+    Features:
+    - Fast text extraction using PyMuPDF
+    - Automatic detection of image-heavy PDFs
+    - Optional OCR fallback using Tesseract
+    - Metadata extraction (title, author, etc.)
+    """
+
+    def __init__(
+        self,
+        image_threshold: float = 0.5,
+        text_threshold: float = 0.4,
+        use_ocr_fallback: bool = True,
+    ):
+        """
+        Initialize the PDF parser.
+
+        Args:
+            image_threshold: Images per page threshold for OCR trigger
+            text_threshold: Text density threshold (chars/1000 per page)
+            use_ocr_fallback: Enable automatic OCR for image-heavy PDFs
+        """
+        self._image_threshold = image_threshold
+        self._text_threshold = text_threshold
+        self._use_ocr_fallback = use_ocr_fallback
+
+    @property
+    def name(self) -> str:
+        return "pypdf_parser"
+
+    @property
+    def priority(self) -> int:
+        return 100
+
+    @property
+    def description(self) -> str:
+        return "PDF parser using PyMuPDF with optional OCR fallback"
+
+    @property
+    def supports_ocr(self) -> bool:
+        return self._use_ocr_fallback
+
+    def parse(
+        self,
+        pdf_bytes: bytes,
+        max_pages: int = 500,
+        use_ocr: bool = False,
+    ) -> PDFParseResult:
+        """
+        Extract text from PDF bytes.
+
+        Args:
+            pdf_bytes: Raw PDF file content
+            max_pages: Maximum number of pages to process
+            use_ocr: Force OCR even for text-extractable PDFs
+
+        Returns:
+            PDFParseResult with extracted text for each page
+        """
+        try:
+            import fitz  # PyMuPDF
+        except ImportError:
+            return PDFParseResult(
+                pages=[],
+                page_count=0,
+                error="PyMuPDF (fitz) not installed. Install with: pip install PyMuPDF",
+            )
+
+        temp_path: Optional[str] = None
+
+        try:
+            # Write bytes to temp file for fitz
+            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f:
+                f.write(pdf_bytes)
+                temp_path = f.name
+
+            logger.info(f"Parsing PDF: {len(pdf_bytes)} bytes")
+
+            # Open the PDF
+            pdf_doc = fitz.open(temp_path)
+            total_pages = len(pdf_doc)
+            logger.info(f"PDF has {total_pages} pages")
+
+            # Check if we should use OCR
+            should_ocr = use_ocr or (
+                self._use_ocr_fallback and self._is_mostly_images(pdf_doc)
+            )
+
+            if should_ocr:
+                logger.info("PDF appears image-heavy, using OCR")
+                result = self._parse_with_ocr(pdf_doc, max_pages)
+            else:
+                logger.info("PDF has extractable text, using direct extraction")
+                result = self._parse_with_fitz(pdf_doc, max_pages)
+
+            pdf_doc.close()
+            return result
+
+        except Exception as e:
+            logger.exception(f"Error parsing PDF: {e}")
+            return PDFParseResult(
+                pages=[],
+                page_count=0,
+                error=f"Failed to parse PDF: {e}",
+            )
+        finally:
+            # Clean up temp file
+            if temp_path and os.path.exists(temp_path):
+                try:
+                    os.unlink(temp_path)
+                except Exception:
+                    pass
+
+    def _is_mostly_images(self, pdf_doc) -> bool:
+        """
+        Check if PDF is mostly images (may need OCR).
+
+        Args:
+            pdf_doc: PyMuPDF document object
+
+        Returns:
+            True if PDF appears to be image-heavy
+        """
+        total_pages = len(pdf_doc)
+        if total_pages == 0:
+            return False
+
+        # Count images in first few pages
+        sample_pages = min(3, total_pages)
+        image_count = 0
+        for i in range(sample_pages):
+            image_count += len(pdf_doc[i].get_images())
+
+        avg_images_per_page = image_count / sample_pages
+
+        # Check text density in sample pages
+        sample_text = ""
+        for i in range(sample_pages):
+            sample_text += pdf_doc[i].get_text()
+
+        text_density = len(sample_text) / 1000 / sample_pages
+
+        logger.debug(
+            f"PDF analysis: {avg_images_per_page:.1f} images/page, "
+            f"{text_density:.2f} text density"
+        )
+
+        # If text density is high, don't use OCR
+        if text_density > self._text_threshold:
+            return False
+
+        # If many images per page and low text, probably needs OCR
+        return avg_images_per_page > self._image_threshold
+
+    def _parse_with_fitz(self, pdf_doc, max_pages: int) -> PDFParseResult:
+        """
+        Extract text using PyMuPDF (fast, direct extraction).
+
+        Args:
+            pdf_doc: PyMuPDF document object
+            max_pages: Maximum pages to process
+
+        Returns:
+            PDFParseResult with extracted text
+        """
+        pages = []
+        total_pages = len(pdf_doc)
+
+        for i in range(min(total_pages, max_pages)):
+            page = pdf_doc[i]
+            text = page.get_text()
+            pages.append(text.strip())
+
+            if (i + 1) % 50 == 0:
+                logger.debug(f"Processed {i + 1}/{min(total_pages, max_pages)} pages")
+
+        # Extract metadata
+        metadata = self._extract_metadata(pdf_doc)
+
+        return PDFParseResult(
+            pages=pages,
+            page_count=total_pages,
+            metadata=metadata,
+        )
+
+    def _parse_with_ocr(self, pdf_doc, max_pages: int) -> PDFParseResult:
+        """
+        Extract text using OCR (Tesseract).
+
+        Args:
+            pdf_doc: PyMuPDF document object
+            max_pages: Maximum pages to process
+
+        Returns:
+            PDFParseResult with OCR-extracted text
+        """
+        try:
+            import pytesseract
+            from PIL import Image
+        except ImportError:
+            return PDFParseResult(
+                pages=[],
+                page_count=len(pdf_doc),
+                error="OCR dependencies not installed. Install with: pip install pytesseract Pillow",
+            )
+
+        pages = []
+        total_pages = len(pdf_doc)
+
+        for i in range(min(total_pages, max_pages)):
+            page = pdf_doc[i]
+
+            # Render page to image
+            pix = page.get_pixmap(dpi=150)  # 150 DPI is good balance
+            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
+
+            # Run OCR
+            text = pytesseract.image_to_string(img)
+            pages.append(text.strip())
+
+            if (i + 1) % 10 == 0:
+                logger.debug(f"OCR processed {i + 1}/{min(total_pages, max_pages)} pages")
+
+        # Extract metadata
+        metadata = self._extract_metadata(pdf_doc)
+
+        return PDFParseResult(
+            pages=pages,
+            page_count=total_pages,
+            metadata=metadata,
+        )
+
+    @staticmethod
+    def _extract_metadata(pdf_doc) -> dict[str, Any]:
+        """
+        Extract PDF metadata.
+
+        Args:
+            pdf_doc: PyMuPDF document object
+
+        Returns:
+            Dictionary of metadata fields
+        """
+        metadata = {}
+
+        try:
+            doc_metadata = pdf_doc.metadata
+            if doc_metadata:
+                # Map common PDF metadata fields
+                field_map = {
+                    "title": "title",
+                    "author": "author",
+                    "subject": "subject",
+                    "keywords": "keywords",
+                    "creator": "creator",
+                    "producer": "producer",
+                    "creationDate": "created",
+                    "modDate": "modified",
+                }
+
+                for pdf_key, our_key in field_map.items():
+                    value = doc_metadata.get(pdf_key)
+                    if value and isinstance(value, str) and value.strip():
+                        metadata[our_key] = value.strip()
+        except Exception as e:
+            logger.debug(f"Error extracting metadata: {e}")
+
+        return metadata
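Going by the constructor and `parse()` signature above (and assuming `PDFParseResult.error` defaults to None on success), using the new parser directly would look roughly like this; the file path is illustrative, and PyMuPDF (plus pytesseract/Pillow for OCR) must be installed:

```python
from pathlib import Path

from statement_extractor.plugins.pdf.pypdf import PyPDFParserPlugin

parser = PyPDFParserPlugin(use_ocr_fallback=True)
result = parser.parse(Path("annual_report.pdf").read_bytes(), max_pages=100)

if result.error:
    print(f"Parse failed: {result.error}")
else:
    print(f"Extracted {result.page_count} pages; first page begins: {result.pages[0][:80]!r}")
```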
statement_extractor/plugins/qualifiers/__init__.py
@@ -6,6 +6,15 @@ Adds qualifiers and identifiers to entities.
 
 from .base import BaseQualifierPlugin
 from .person import PersonQualifierPlugin
+
+# Import embedding qualifier (may fail if database module not available)
+try:
+    from .embedding_company import EmbeddingCompanyQualifier
+except ImportError:
+    EmbeddingCompanyQualifier = None  # type: ignore
+
+# DEPRECATED: These API-based qualifiers are deprecated in favor of EmbeddingCompanyQualifier
+# They are no longer auto-registered with the plugin registry.
 from .gleif import GLEIFQualifierPlugin
 from .companies_house import CompaniesHouseQualifierPlugin
 from .sec_edgar import SECEdgarQualifierPlugin
@@ -13,6 +22,8 @@ from .sec_edgar import SECEdgarQualifierPlugin
 __all__ = [
     "BaseQualifierPlugin",
     "PersonQualifierPlugin",
+    "EmbeddingCompanyQualifier",
+    # Deprecated - kept for backwards compatibility
     "GLEIFQualifierPlugin",
     "CompaniesHouseQualifierPlugin",
     "SECEdgarQualifierPlugin",