PyPI - corp-extractor - Versions diffs - 0.4.0__py3-none-any.whl → 0.9.0__py3-none-any.whl - Mend

corp-extractor 0.4.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (75) hide show

{corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +348 -64
corp_extractor-0.9.0.dist-info/RECORD +76 -0
statement_extractor/__init__.py +10 -1
statement_extractor/cli.py +1663 -17
statement_extractor/data/default_predicates.json +368 -0
statement_extractor/data/statement_taxonomy.json +6972 -0
statement_extractor/database/__init__.py +52 -0
statement_extractor/database/embeddings.py +186 -0
statement_extractor/database/hub.py +520 -0
statement_extractor/database/importers/__init__.py +24 -0
statement_extractor/database/importers/companies_house.py +545 -0
statement_extractor/database/importers/gleif.py +538 -0
statement_extractor/database/importers/sec_edgar.py +375 -0
statement_extractor/database/importers/wikidata.py +1012 -0
statement_extractor/database/importers/wikidata_people.py +632 -0
statement_extractor/database/models.py +230 -0
statement_extractor/database/resolver.py +245 -0
statement_extractor/database/store.py +1609 -0
statement_extractor/document/__init__.py +62 -0
statement_extractor/document/chunker.py +410 -0
statement_extractor/document/context.py +171 -0
statement_extractor/document/deduplicator.py +173 -0
statement_extractor/document/html_extractor.py +246 -0
statement_extractor/document/loader.py +303 -0
statement_extractor/document/pipeline.py +388 -0
statement_extractor/document/summarizer.py +195 -0
statement_extractor/extractor.py +1 -23
statement_extractor/gliner_extraction.py +4 -74
statement_extractor/llm.py +255 -0
statement_extractor/models/__init__.py +89 -0
statement_extractor/models/canonical.py +182 -0
statement_extractor/models/document.py +308 -0
statement_extractor/models/entity.py +102 -0
statement_extractor/models/labels.py +220 -0
statement_extractor/models/qualifiers.py +139 -0
statement_extractor/models/statement.py +101 -0
statement_extractor/models.py +4 -1
statement_extractor/pipeline/__init__.py +39 -0
statement_extractor/pipeline/config.py +129 -0
statement_extractor/pipeline/context.py +177 -0
statement_extractor/pipeline/orchestrator.py +416 -0
statement_extractor/pipeline/registry.py +303 -0
statement_extractor/plugins/__init__.py +55 -0
statement_extractor/plugins/base.py +716 -0
statement_extractor/plugins/extractors/__init__.py +13 -0
statement_extractor/plugins/extractors/base.py +9 -0
statement_extractor/plugins/extractors/gliner2.py +546 -0
statement_extractor/plugins/labelers/__init__.py +29 -0
statement_extractor/plugins/labelers/base.py +9 -0
statement_extractor/plugins/labelers/confidence.py +138 -0
statement_extractor/plugins/labelers/relation_type.py +87 -0
statement_extractor/plugins/labelers/sentiment.py +159 -0
statement_extractor/plugins/labelers/taxonomy.py +386 -0
statement_extractor/plugins/labelers/taxonomy_embedding.py +477 -0
statement_extractor/plugins/pdf/__init__.py +10 -0
statement_extractor/plugins/pdf/pypdf.py +291 -0
statement_extractor/plugins/qualifiers/__init__.py +30 -0
statement_extractor/plugins/qualifiers/base.py +9 -0
statement_extractor/plugins/qualifiers/companies_house.py +185 -0
statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
statement_extractor/plugins/qualifiers/gleif.py +197 -0
statement_extractor/plugins/qualifiers/person.py +785 -0
statement_extractor/plugins/qualifiers/sec_edgar.py +209 -0
statement_extractor/plugins/scrapers/__init__.py +10 -0
statement_extractor/plugins/scrapers/http.py +236 -0
statement_extractor/plugins/splitters/__init__.py +13 -0
statement_extractor/plugins/splitters/base.py +9 -0
statement_extractor/plugins/splitters/t5_gemma.py +293 -0
statement_extractor/plugins/taxonomy/__init__.py +13 -0
statement_extractor/plugins/taxonomy/embedding.py +484 -0
statement_extractor/plugins/taxonomy/mnli.py +291 -0
statement_extractor/scoring.py +8 -8
corp_extractor-0.4.0.dist-info/RECORD +0 -12
{corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
{corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0

statement_extractor/pipeline/registry.py ADDED Viewed

@@ -0,0 +1,303 @@
+"""
+PluginRegistry - Registration and discovery of plugins.
+Provides a central registry for all plugin types with decorator-based
+registration and discovery by entity type.
+"""
+import logging
+from typing import TYPE_CHECKING, Type, TypeVar
+if TYPE_CHECKING:
+    from ..plugins.base import (
+        BasePlugin,
+        BaseSplitterPlugin,
+        BaseExtractorPlugin,
+        BaseQualifierPlugin,
+        BaseLabelerPlugin,
+        BaseTaxonomyPlugin,
+        BaseScraperPlugin,
+        BasePDFParserPlugin,
+    )
+    from ..models import EntityType
+# Module-level logger; registration events are emitted at DEBUG level.
+logger = logging.getLogger(__name__)
+# Type variable bound to BasePlugin so the decorator methods below return the
+# same plugin class they were given.
+T = TypeVar("T", bound="BasePlugin")
+class PluginRegistry:
+    """
+    Central registry for all pipeline plugins.
+    Supports registration by decorator or explicit method call.
+    Plugins are sorted by priority (lower = higher priority).
+    """
+    # Class-level storage for registered plugins
+    _splitters: list["BaseSplitterPlugin"] = []
+    _extractors: list["BaseExtractorPlugin"] = []
+    _qualifiers: list["BaseQualifierPlugin"] = []
+    _labelers: list["BaseLabelerPlugin"] = []
+    _taxonomy_classifiers: list["BaseTaxonomyPlugin"] = []
+    # Content acquisition plugins
+    _scrapers: list["BaseScraperPlugin"] = []
+    _pdf_parsers: list["BasePDFParserPlugin"] = []
+    # Index by entity type for quick lookup
+    _qualifiers_by_type: dict["EntityType", list["BaseQualifierPlugin"]] = {}
+    # Index by name for CLI lookup
+    _all_plugins: dict[str, "BasePlugin"] = {}
+    @classmethod
+    def clear(cls) -> None:
+        """Clear all registered plugins (useful for testing)."""
+        cls._splitters = []
+        cls._extractors = []
+        cls._qualifiers = []
+        cls._labelers = []
+        cls._taxonomy_classifiers = []
+        cls._scrapers = []
+        cls._pdf_parsers = []
+        cls._qualifiers_by_type = {}
+        cls._all_plugins = {}
+    # =========================================================================
+    # Registration methods
+    # =========================================================================
+    @classmethod
+    def register_splitter(cls, plugin: "BaseSplitterPlugin") -> None:
+        """Register a splitter plugin."""
+        cls._splitters.append(plugin)
+        cls._splitters.sort(key=lambda p: p.priority)
+        cls._all_plugins[plugin.name] = plugin
+        logger.debug(f"Registered splitter: {plugin.name} (priority={plugin.priority})")
+    @classmethod
+    def register_extractor(cls, plugin: "BaseExtractorPlugin") -> None:
+        """Register an extractor plugin."""
+        cls._extractors.append(plugin)
+        cls._extractors.sort(key=lambda p: p.priority)
+        cls._all_plugins[plugin.name] = plugin
+        logger.debug(f"Registered extractor: {plugin.name} (priority={plugin.priority})")
+    @classmethod
+    def register_qualifier(cls, plugin: "BaseQualifierPlugin") -> None:
+        """Register a qualifier plugin."""
+        cls._qualifiers.append(plugin)
+        cls._qualifiers.sort(key=lambda p: p.priority)
+        cls._all_plugins[plugin.name] = plugin
+        # Index by entity type
+        for entity_type in plugin.supported_entity_types:
+            if entity_type not in cls._qualifiers_by_type:
+                cls._qualifiers_by_type[entity_type] = []
+            cls._qualifiers_by_type[entity_type].append(plugin)
+            cls._qualifiers_by_type[entity_type].sort(key=lambda p: p.priority)
+        logger.debug(
+            f"Registered qualifier: {plugin.name} "
+            f"(priority={plugin.priority}, types={[t.value for t in plugin.supported_entity_types]})"
+        )
+    @classmethod
+    def register_labeler(cls, plugin: "BaseLabelerPlugin") -> None:
+        """Register a labeler plugin."""
+        cls._labelers.append(plugin)
+        cls._labelers.sort(key=lambda p: p.priority)
+        cls._all_plugins[plugin.name] = plugin
+        logger.debug(f"Registered labeler: {plugin.name} (priority={plugin.priority})")
+    @classmethod
+    def register_taxonomy(cls, plugin: "BaseTaxonomyPlugin") -> None:
+        """Register a taxonomy classifier plugin."""
+        cls._taxonomy_classifiers.append(plugin)
+        cls._taxonomy_classifiers.sort(key=lambda p: p.priority)
+        cls._all_plugins[plugin.name] = plugin
+        logger.debug(f"Registered taxonomy: {plugin.name} (priority={plugin.priority})")
+    @classmethod
+    def register_scraper(cls, plugin: "BaseScraperPlugin") -> None:
+        """Register a scraper plugin."""
+        cls._scrapers.append(plugin)
+        cls._scrapers.sort(key=lambda p: p.priority)
+        cls._all_plugins[plugin.name] = plugin
+        logger.debug(f"Registered scraper: {plugin.name} (priority={plugin.priority})")
+    @classmethod
+    def register_pdf_parser(cls, plugin: "BasePDFParserPlugin") -> None:
+        """Register a PDF parser plugin."""
+        cls._pdf_parsers.append(plugin)
+        cls._pdf_parsers.sort(key=lambda p: p.priority)
+        cls._all_plugins[plugin.name] = plugin
+        logger.debug(f"Registered PDF parser: {plugin.name} (priority={plugin.priority})")
+    # =========================================================================
+    # Decorator registration
+    # =========================================================================
+    @classmethod
+    def splitter(cls, plugin_class: Type[T]) -> Type[T]:
+        """Decorator to register a splitter plugin class."""
+        cls.register_splitter(plugin_class())
+        return plugin_class
+    @classmethod
+    def extractor(cls, plugin_class: Type[T]) -> Type[T]:
+        """Decorator to register an extractor plugin class."""
+        cls.register_extractor(plugin_class())
+        return plugin_class
+    @classmethod
+    def qualifier(cls, plugin_class: Type[T]) -> Type[T]:
+        """Decorator to register a qualifier plugin class."""
+        cls.register_qualifier(plugin_class())
+        return plugin_class
+    @classmethod
+    def labeler(cls, plugin_class: Type[T]) -> Type[T]:
+        """Decorator to register a labeler plugin class."""
+        cls.register_labeler(plugin_class())
+        return plugin_class
+    @classmethod
+    def taxonomy(cls, plugin_class: Type[T]) -> Type[T]:
+        """Decorator to register a taxonomy classifier plugin class."""
+        cls.register_taxonomy(plugin_class())
+        return plugin_class
+    @classmethod
+    def scraper(cls, plugin_class: Type[T]) -> Type[T]:
+        """Decorator to register a scraper plugin class."""
+        cls.register_scraper(plugin_class())
+        return plugin_class
+    @classmethod
+    def pdf_parser(cls, plugin_class: Type[T]) -> Type[T]:
+        """Decorator to register a PDF parser plugin class."""
+        cls.register_pdf_parser(plugin_class())
+        return plugin_class
+    # =========================================================================
+    # Retrieval methods
+    # =========================================================================
+    @classmethod
+    def get_splitters(cls) -> list["BaseSplitterPlugin"]:
+        """Get all registered splitter plugins (sorted by priority)."""
+        return cls._splitters.copy()
+    @classmethod
+    def get_extractors(cls) -> list["BaseExtractorPlugin"]:
+        """Get all registered extractor plugins (sorted by priority)."""
+        return cls._extractors.copy()
+    @classmethod
+    def get_qualifiers(cls) -> list["BaseQualifierPlugin"]:
+        """Get all registered qualifier plugins (sorted by priority)."""
+        return cls._qualifiers.copy()
+    @classmethod
+    def get_qualifiers_for_type(cls, entity_type: "EntityType") -> list["BaseQualifierPlugin"]:
+        """Get qualifier plugins that support a specific entity type."""
+        return cls._qualifiers_by_type.get(entity_type, []).copy()
+    @classmethod
+    def get_labelers(cls) -> list["BaseLabelerPlugin"]:
+        """Get all registered labeler plugins (sorted by priority)."""
+        return cls._labelers.copy()
+    @classmethod
+    def get_taxonomy_classifiers(cls) -> list["BaseTaxonomyPlugin"]:
+        """Get all registered taxonomy classifier plugins (sorted by priority)."""
+        return cls._taxonomy_classifiers.copy()
+    @classmethod
+    def get_scrapers(cls) -> list["BaseScraperPlugin"]:
+        """Get all registered scraper plugins (sorted by priority)."""
+        return cls._scrapers.copy()
+    @classmethod
+    def get_pdf_parsers(cls) -> list["BasePDFParserPlugin"]:
+        """Get all registered PDF parser plugins (sorted by priority)."""
+        return cls._pdf_parsers.copy()
+    @classmethod
+    def get_plugin(cls, name: str) -> "BasePlugin | None":
+        """Get a plugin by name."""
+        return cls._all_plugins.get(name)
+    @classmethod
+    def get_all_plugins(cls) -> dict[str, "BasePlugin"]:
+        """Get all registered plugins by name."""
+        return cls._all_plugins.copy()
+    @classmethod
+    def get_plugins_for_stage(cls, stage: int) -> list["BasePlugin"]:
+        """Get all plugins for a specific stage number."""
+        if stage == 1:
+            return cls._splitters.copy()
+        elif stage == 2:
+            return cls._extractors.copy()
+        elif stage == 3:
+            return cls._qualifiers.copy()
+        elif stage == 4:
+            return cls._labelers.copy()
+        elif stage == 5:
+            return cls._taxonomy_classifiers.copy()
+        return []
+    # =========================================================================
+    # Info methods
+    # =========================================================================
+    @classmethod
+    def list_plugins(cls, stage: int | None = None) -> list[dict]:
+        """
+        List all plugins with their info.
+        Args:
+            stage: Optional stage number to filter by
+        Returns:
+            List of plugin info dicts with name, stage, priority, description
+        """
+        result = []
+        plugins_by_stage = [
+            (1, "splitting", cls._splitters),
+            (2, "extraction", cls._extractors),
+            (3, "qualification", cls._qualifiers),
+            (4, "labeling", cls._labelers),
+            (5, "taxonomy", cls._taxonomy_classifiers),
+            # Content acquisition plugins (stage 0)
+            (0, "scraper", cls._scrapers),
+            (-1, "pdf_parser", cls._pdf_parsers),
+        ]
+        for stage_num, stage_name, plugins in plugins_by_stage:
+            if stage is not None and stage != stage_num:
+                continue
+            for plugin in plugins:
+                info = {
+                    "name": plugin.name,
+                    "stage": stage_num,
+                    "stage_name": stage_name,
+                    "priority": plugin.priority,
+                    "capabilities": plugin.capabilities.name if plugin.capabilities else "NONE",
+                }
+                # Add entity types for qualifiers/canonicalizers
+                if hasattr(plugin, "supported_entity_types"):
+                    info["entity_types"] = [t.value for t in plugin.supported_entity_types]
+                # Add label type for labelers
+                if hasattr(plugin, "label_type"):
+                    info["label_type"] = plugin.label_type
+                # Add taxonomy name for taxonomy classifiers
+                if hasattr(plugin, "taxonomy_name"):
+                    info["taxonomy_name"] = plugin.taxonomy_name
+                result.append(info)
+        return result

statement_extractor/plugins/__init__.py ADDED Viewed

@@ -0,0 +1,55 @@
+"""
+Plugins module for the extraction pipeline.
+Contains all plugin implementations organized by stage:
+- splitters/: Stage 1 - Text to atomic triples
+- extractors/: Stage 2 - Refine entities and relations
+- qualifiers/: Stage 3 - Qualify entities (add identifiers, canonical names, FQN)
+- labelers/: Stage 4 - Classify statements
+- taxonomy/: Stage 5 - Taxonomy classification
+Content acquisition plugins (outside the numbered stages):
+- scrapers/: Scraper plugins
+- pdf/: PDF parser plugins
+"""
+from .base import (
+    PluginCapability,
+    BasePlugin,
+    BaseSplitterPlugin,
+    BaseExtractorPlugin,
+    BaseQualifierPlugin,
+    BaseLabelerPlugin,
+    BaseTaxonomyPlugin,
+    # Content acquisition plugins
+    ContentType,
+    ScraperResult,
+    PDFParseResult,
+    BaseScraperPlugin,
+    BasePDFParserPlugin,
+)
+# Import plugin modules for auto-registration
+from . import splitters, extractors, qualifiers, labelers, taxonomy
+# Content acquisition plugins
+from . import scrapers, pdf
+# Public API of the package: the base classes and result types re-exported
+# from .base, followed by the plugin subpackages imported above.
+__all__ = [
+    "PluginCapability",
+    "BasePlugin",
+    "BaseSplitterPlugin",
+    "BaseExtractorPlugin",
+    "BaseQualifierPlugin",
+    "BaseLabelerPlugin",
+    "BaseTaxonomyPlugin",
+    # Content acquisition plugins
+    "ContentType",
+    "ScraperResult",
+    "PDFParseResult",
+    "BaseScraperPlugin",
+    "BasePDFParserPlugin",
+    # Plugin modules
+    "splitters",
+    "extractors",
+    "qualifiers",
+    "labelers",
+    "taxonomy",
+    "scrapers",
+    "pdf",
+]

corp-extractor 0.4.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

corp-extractor 0.4.0__py3-none-any.whl → 0.9.0__py3-none-any.whl