corp-extractor 0.4.0-py3-none-any.whl → 0.9.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +348 -64
  2. corp_extractor-0.9.0.dist-info/RECORD +76 -0
  3. statement_extractor/__init__.py +10 -1
  4. statement_extractor/cli.py +1663 -17
  5. statement_extractor/data/default_predicates.json +368 -0
  6. statement_extractor/data/statement_taxonomy.json +6972 -0
  7. statement_extractor/database/__init__.py +52 -0
  8. statement_extractor/database/embeddings.py +186 -0
  9. statement_extractor/database/hub.py +520 -0
  10. statement_extractor/database/importers/__init__.py +24 -0
  11. statement_extractor/database/importers/companies_house.py +545 -0
  12. statement_extractor/database/importers/gleif.py +538 -0
  13. statement_extractor/database/importers/sec_edgar.py +375 -0
  14. statement_extractor/database/importers/wikidata.py +1012 -0
  15. statement_extractor/database/importers/wikidata_people.py +632 -0
  16. statement_extractor/database/models.py +230 -0
  17. statement_extractor/database/resolver.py +245 -0
  18. statement_extractor/database/store.py +1609 -0
  19. statement_extractor/document/__init__.py +62 -0
  20. statement_extractor/document/chunker.py +410 -0
  21. statement_extractor/document/context.py +171 -0
  22. statement_extractor/document/deduplicator.py +173 -0
  23. statement_extractor/document/html_extractor.py +246 -0
  24. statement_extractor/document/loader.py +303 -0
  25. statement_extractor/document/pipeline.py +388 -0
  26. statement_extractor/document/summarizer.py +195 -0
  27. statement_extractor/extractor.py +1 -23
  28. statement_extractor/gliner_extraction.py +4 -74
  29. statement_extractor/llm.py +255 -0
  30. statement_extractor/models/__init__.py +89 -0
  31. statement_extractor/models/canonical.py +182 -0
  32. statement_extractor/models/document.py +308 -0
  33. statement_extractor/models/entity.py +102 -0
  34. statement_extractor/models/labels.py +220 -0
  35. statement_extractor/models/qualifiers.py +139 -0
  36. statement_extractor/models/statement.py +101 -0
  37. statement_extractor/models.py +4 -1
  38. statement_extractor/pipeline/__init__.py +39 -0
  39. statement_extractor/pipeline/config.py +129 -0
  40. statement_extractor/pipeline/context.py +177 -0
  41. statement_extractor/pipeline/orchestrator.py +416 -0
  42. statement_extractor/pipeline/registry.py +303 -0
  43. statement_extractor/plugins/__init__.py +55 -0
  44. statement_extractor/plugins/base.py +716 -0
  45. statement_extractor/plugins/extractors/__init__.py +13 -0
  46. statement_extractor/plugins/extractors/base.py +9 -0
  47. statement_extractor/plugins/extractors/gliner2.py +546 -0
  48. statement_extractor/plugins/labelers/__init__.py +29 -0
  49. statement_extractor/plugins/labelers/base.py +9 -0
  50. statement_extractor/plugins/labelers/confidence.py +138 -0
  51. statement_extractor/plugins/labelers/relation_type.py +87 -0
  52. statement_extractor/plugins/labelers/sentiment.py +159 -0
  53. statement_extractor/plugins/labelers/taxonomy.py +386 -0
  54. statement_extractor/plugins/labelers/taxonomy_embedding.py +477 -0
  55. statement_extractor/plugins/pdf/__init__.py +10 -0
  56. statement_extractor/plugins/pdf/pypdf.py +291 -0
  57. statement_extractor/plugins/qualifiers/__init__.py +30 -0
  58. statement_extractor/plugins/qualifiers/base.py +9 -0
  59. statement_extractor/plugins/qualifiers/companies_house.py +185 -0
  60. statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
  61. statement_extractor/plugins/qualifiers/gleif.py +197 -0
  62. statement_extractor/plugins/qualifiers/person.py +785 -0
  63. statement_extractor/plugins/qualifiers/sec_edgar.py +209 -0
  64. statement_extractor/plugins/scrapers/__init__.py +10 -0
  65. statement_extractor/plugins/scrapers/http.py +236 -0
  66. statement_extractor/plugins/splitters/__init__.py +13 -0
  67. statement_extractor/plugins/splitters/base.py +9 -0
  68. statement_extractor/plugins/splitters/t5_gemma.py +293 -0
  69. statement_extractor/plugins/taxonomy/__init__.py +13 -0
  70. statement_extractor/plugins/taxonomy/embedding.py +484 -0
  71. statement_extractor/plugins/taxonomy/mnli.py +291 -0
  72. statement_extractor/scoring.py +8 -8
  73. corp_extractor-0.4.0.dist-info/RECORD +0 -12
  74. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
  75. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
statement_extractor/pipeline/context.py
@@ -0,0 +1,177 @@
+"""
+PipelineContext - Data container that flows through all pipeline stages.
+
+The context accumulates outputs from each stage:
+- Stage 1 (Splitting): raw_triples
+- Stage 2 (Extraction): statements
+- Stage 3 (Qualification): qualified_entities and canonical_entities
+- Stage 4 (Labeling): labeled_statements
+- Stage 5 (Taxonomy): taxonomy_results
+"""
+
+from typing import Any, Optional
+
+from pydantic import BaseModel, Field
+
+from ..models import (
+    RawTriple,
+    PipelineStatement,
+    QualifiedEntity,
+    CanonicalEntity,
+    LabeledStatement,
+    TaxonomyResult,
+)
+
+
+class PipelineContext(BaseModel):
+    """
+    Context object that flows through all pipeline stages.
+
+    Accumulates outputs from each stage and provides access to
+    source text, metadata, and intermediate results.
+    """
+    # Input
+    source_text: str = Field(..., description="Original input text")
+    source_metadata: dict[str, Any] = Field(
+        default_factory=dict,
+        description="Metadata about the source (e.g., document ID, URL, timestamp)"
+    )
+
+    # Stage 1 output: Raw triples from splitting
+    raw_triples: list[RawTriple] = Field(
+        default_factory=list,
+        description="Raw triples from Stage 1 (Splitting)"
+    )
+
+    # Stage 2 output: Statements with extracted entities
+    statements: list[PipelineStatement] = Field(
+        default_factory=list,
+        description="Statements from Stage 2 (Extraction)"
+    )
+
+    # Stage 3 output: Qualified entities (keyed by entity_ref)
+    qualified_entities: dict[str, QualifiedEntity] = Field(
+        default_factory=dict,
+        description="Qualified entities from Stage 3 (Qualification)"
+    )
+
+    # Stage 3 output: Canonical entities (keyed by entity_ref)
+    canonical_entities: dict[str, CanonicalEntity] = Field(
+        default_factory=dict,
+        description="Canonical entities from Stage 3 (Qualification)"
+    )
+
+    # Stage 4 output: Final labeled statements
+    labeled_statements: list[LabeledStatement] = Field(
+        default_factory=list,
+        description="Final labeled statements from Stage 4 (Labeling)"
+    )
+
+    # Classification results from extractor (populated by GLiNER2 or similar)
+    # Keyed by source_text -> label_type -> (label_value, confidence)
+    classification_results: dict[str, dict[str, tuple[str, float]]] = Field(
+        default_factory=dict,
+        description="Pre-computed classification results from Stage 2 extractor"
+    )
+
+    # Stage 5 output: Taxonomy classifications
+    # Keyed by (source_text, taxonomy_name) -> list of TaxonomyResult
+    # Multiple labels may match a single statement above threshold
+    taxonomy_results: dict[tuple[str, str], list[TaxonomyResult]] = Field(
+        default_factory=dict,
+        description="Taxonomy classifications from Stage 5 (multiple labels per statement)"
+    )
+
+    # Processing metadata
+    processing_errors: list[str] = Field(
+        default_factory=list,
+        description="Errors encountered during processing"
+    )
+    processing_warnings: list[str] = Field(
+        default_factory=list,
+        description="Warnings generated during processing"
+    )
+    stage_timings: dict[str, float] = Field(
+        default_factory=dict,
+        description="Timing information for each stage (stage_name -> seconds)"
+    )
+
+    def add_error(self, error: str) -> None:
+        """Add a processing error."""
+        self.processing_errors.append(error)
+
+    def add_warning(self, warning: str) -> None:
+        """Add a processing warning."""
+        self.processing_warnings.append(warning)
+
+    def record_timing(self, stage: str, duration: float) -> None:
+        """Record timing for a stage."""
+        self.stage_timings[stage] = duration
+
+    def get_entity_refs(self) -> set[str]:
+        """Get all unique entity refs from statements."""
+        refs = set()
+        for stmt in self.statements:
+            refs.add(stmt.subject.entity_ref)
+            refs.add(stmt.object.entity_ref)
+        return refs
+
+    def get_qualified_entity(self, entity_ref: str) -> Optional[QualifiedEntity]:
+        """Get qualified entity by ref, or None if not found."""
+        return self.qualified_entities.get(entity_ref)
+
+    def get_canonical_entity(self, entity_ref: str) -> Optional[CanonicalEntity]:
+        """Get canonical entity by ref, or None if not found."""
+        return self.canonical_entities.get(entity_ref)
+
+    def get_classification(
+        self,
+        source_text: str,
+        label_type: str,
+    ) -> Optional[tuple[str, float]]:
+        """
+        Get pre-computed classification result for a source text.
+
+        Args:
+            source_text: The source text that was classified
+            label_type: The type of label (e.g., "sentiment")
+
+        Returns:
+            Tuple of (label_value, confidence) or None if not found
+        """
+        if source_text in self.classification_results:
+            return self.classification_results[source_text].get(label_type)
+        return None
+
+    def set_classification(
+        self,
+        source_text: str,
+        label_type: str,
+        label_value: str,
+        confidence: float,
+    ) -> None:
+        """
+        Store a classification result for a source text.
+
+        Args:
+            source_text: The source text that was classified
+            label_type: The type of label (e.g., "sentiment")
+            label_value: The classification result (e.g., "positive")
+            confidence: Confidence score (0.0 to 1.0)
+        """
+        if source_text not in self.classification_results:
+            self.classification_results[source_text] = {}
+        self.classification_results[source_text][label_type] = (label_value, confidence)

+    @property
+    def has_errors(self) -> bool:
+        """Check if any errors occurred during processing."""
+        return len(self.processing_errors) > 0
+
+    @property
+    def statement_count(self) -> int:
+        """Get the number of statements in the final output."""
+        return len(self.labeled_statements) if self.labeled_statements else len(self.statements)
+
+    class Config:
+        arbitrary_types_allowed = True
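
For orientation, here is a minimal usage sketch of the PipelineContext added above. It is illustrative only: it exercises just the fields and methods visible in this diff, and the import path is inferred from the file layout in the change list.

    from statement_extractor.pipeline.context import PipelineContext

    ctx = PipelineContext(
        source_text="Acme Corp acquired Widget Ltd in 2023.",
        source_metadata={"document_id": "doc-1"},  # hypothetical metadata key
    )

    # Stage plugins write into the context as they run; for example, a labeler
    # can read back a classification the Stage 2 extractor already computed.
    ctx.set_classification(ctx.source_text, "sentiment", "positive", 0.91)
    assert ctx.get_classification(ctx.source_text, "sentiment") == ("positive", 0.91)

    ctx.record_timing("splitting", 0.42)
    print(ctx.has_errors)       # False - nothing has called add_error()
    print(ctx.statement_count)  # 0 - falls back to len(ctx.statements)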
statement_extractor/pipeline/orchestrator.py
@@ -0,0 +1,416 @@
+"""
+ExtractionPipeline - Main orchestrator for the 5-stage extraction pipeline.
+
+Coordinates the flow of data through all pipeline stages:
+1. Splitting: Text → RawTriple
+2. Extraction: RawTriple → PipelineStatement
+3. Qualification: Entity → CanonicalEntity
+4. Labeling: Statement → LabeledStatement
+5. Taxonomy: Statement → TaxonomyResult
+"""
+
+import logging
+import time
+from typing import Any, Optional
+
+from .context import PipelineContext
+from .config import PipelineConfig, get_stage_name
+from .registry import PluginRegistry
+from ..models import (
+    QualifiedEntity,
+    CanonicalEntity,
+    LabeledStatement,
+    TaxonomyResult,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class ExtractionPipeline:
+    """
+    Main pipeline orchestrator.
+
+    Coordinates the flow of data through all 5 stages:
+    1. Splitting: Text → RawTriple (using splitter plugins)
+    2. Extraction: RawTriple → PipelineStatement (using extractor plugins)
+    3. Qualification: Entity → CanonicalEntity (using qualifier + canonicalizer plugins)
+    4. Labeling: Statement → LabeledStatement (using labeler plugins)
+    5. Taxonomy: Statement → TaxonomyResult (using taxonomy plugins)
+    """
+
+    def __init__(self, config: Optional[PipelineConfig] = None):
+        """
+        Initialize the pipeline.
+
+        Args:
+            config: Pipeline configuration (uses defaults if not provided)
+        """
+        self.config = config or PipelineConfig.default()
+
+    def process(
+        self,
+        text: str,
+        metadata: Optional[dict[str, Any]] = None,
+    ) -> PipelineContext:
+        """
+        Process text through the extraction pipeline.
+
+        Args:
+            text: Input text to process
+            metadata: Optional metadata about the source
+
+        Returns:
+            PipelineContext with accumulated results from all stages
+        """
+        # Merge config options into metadata for plugins
+        combined_metadata = metadata.copy() if metadata else {}
+
+        # Pass extractor options from config to context
+        if self.config.extractor_options:
+            existing_extractor_opts = combined_metadata.get("extractor_options", {})
+            combined_metadata["extractor_options"] = {
+                **self.config.extractor_options,
+                **existing_extractor_opts,  # Allow explicit metadata to override config
+            }
+
+        ctx = PipelineContext(
+            source_text=text,
+            source_metadata=combined_metadata,
+        )
+
+        logger.info(f"Starting pipeline processing: {len(text)} chars")
+
+        try:
+            # Stage 1: Splitting
+            if self.config.is_stage_enabled(1):
+                ctx = self._run_splitting(ctx)
+
+            # Stage 2: Extraction
+            if self.config.is_stage_enabled(2):
+                ctx = self._run_extraction(ctx)
+
+            # Stage 3: Qualification (runs qualifiers + canonicalizers)
+            if self.config.is_stage_enabled(3):
+                ctx = self._run_qualification(ctx)
+
+            # Stage 4: Labeling
+            if self.config.is_stage_enabled(4):
+                ctx = self._run_labeling(ctx)
+
+            # Stage 5: Taxonomy classification
+            if self.config.is_stage_enabled(5):
+                ctx = self._run_taxonomy(ctx)
+
+        except Exception as e:
+            logger.exception("Pipeline processing failed")
+            ctx.add_error(f"Pipeline error: {str(e)}")
+            if self.config.fail_fast:
+                raise
+
+        logger.info(
+            f"Pipeline complete: {ctx.statement_count} statements, "
+            f"{len(ctx.processing_errors)} errors"
+        )
+
+        return ctx
+
+    def _run_splitting(self, ctx: PipelineContext) -> PipelineContext:
+        """Stage 1: Split text into raw triples."""
+        stage_name = get_stage_name(1)
+        logger.debug(f"Running {stage_name} stage")
+        start_time = time.time()
+
+        splitters = PluginRegistry.get_splitters()
+        if not splitters:
+            ctx.add_warning("No splitter plugins registered")
+            return ctx
+
+        # Use first enabled splitter (highest priority)
+        for splitter in splitters:
+            if not self.config.is_plugin_enabled(splitter.name):
+                continue
+
+            logger.debug(f"Using splitter: {splitter.name}")
+            try:
+                raw_triples = splitter.split(ctx.source_text, ctx)
+                ctx.raw_triples = raw_triples
+                logger.info(f"Splitting produced {len(raw_triples)} raw triples")
+                break
+            except Exception as e:
+                logger.exception(f"Splitter {splitter.name} failed")
+                ctx.add_error(f"Splitter {splitter.name} failed: {str(e)}")
+                if self.config.fail_fast:
+                    raise
+
+        ctx.record_timing(stage_name, time.time() - start_time)
+        return ctx
+
+    def _run_extraction(self, ctx: PipelineContext) -> PipelineContext:
+        """Stage 2: Extract statements with typed entities from raw triples."""
+        stage_name = get_stage_name(2)
+        logger.debug(f"Running {stage_name} stage")
+        start_time = time.time()
+
+        if not ctx.raw_triples:
+            logger.debug("No raw triples to extract from")
+            return ctx
+
+        extractors = PluginRegistry.get_extractors()
+        if not extractors:
+            ctx.add_warning("No extractor plugins registered")
+            return ctx
+
+        # Collect classification schemas from labelers for the extractor
+        classification_schemas = self._collect_classification_schemas()
+        if classification_schemas:
+            logger.debug(f"Collected {len(classification_schemas)} classification schemas from labelers")
+
+        # Use first enabled extractor (highest priority)
+        for extractor in extractors:
+            if not self.config.is_plugin_enabled(extractor.name):
+                continue
+
+            # Pass classification schemas to extractor if it supports them
+            if classification_schemas and hasattr(extractor, 'add_classification_schema'):
+                for schema in classification_schemas:
+                    extractor.add_classification_schema(schema)
+
+            logger.debug(f"Using extractor: {extractor.name}")
+            try:
+                statements = extractor.extract(ctx.raw_triples, ctx)
+                ctx.statements = statements
+                logger.info(f"Extraction produced {len(statements)} statements")
+                break
+            except Exception as e:
+                logger.exception(f"Extractor {extractor.name} failed")
+                ctx.add_error(f"Extractor {extractor.name} failed: {str(e)}")
+                if self.config.fail_fast:
+                    raise
+
+        ctx.record_timing(stage_name, time.time() - start_time)
+        return ctx
+
+    def _collect_classification_schemas(self) -> list:
+        """Collect classification schemas from enabled labelers."""
+        schemas = []
+        labelers = PluginRegistry.get_labelers()
+
+        for labeler in labelers:
+            if not self.config.is_plugin_enabled(labeler.name):
+                continue
+
+            # Check for classification schema (simple multi-choice)
+            if hasattr(labeler, 'classification_schema') and labeler.classification_schema:
+                schemas.append(labeler.classification_schema)
+                logger.debug(
+                    f"Labeler {labeler.name} provides classification schema: "
+                    f"{labeler.classification_schema}"
+                )
+
+        return schemas
+
+    def _run_qualification(self, ctx: PipelineContext) -> PipelineContext:
+        """
+        Stage 3: Qualify entities with identifiers, canonical names, and FQNs.
+
+        Runs qualifier plugins for each entity type. Qualifier plugins return
+        CanonicalEntity directly (with qualifiers, canonical match, and FQN).
+        """
+        stage_name = get_stage_name(3)
+        logger.debug(f"Running {stage_name} stage")
+        start_time = time.time()
+
+        if not ctx.statements:
+            logger.debug("No statements to qualify")
+            return ctx
+
+        # Collect all unique entities from statements
+        entities_to_qualify = {}
+        for stmt in ctx.statements:
+            for entity in [stmt.subject, stmt.object]:
+                if entity.entity_ref not in entities_to_qualify:
+                    entities_to_qualify[entity.entity_ref] = entity
+
+        logger.info(f"Stage 3: Qualifying {len(entities_to_qualify)} unique entities")
+
+        # Process each entity through qualifier plugins
+        entities_list = list(entities_to_qualify.items())
+        for idx, (entity_ref, entity) in enumerate(entities_list, 1):
+            logger.info(f"  [{idx}/{len(entities_list)}] Qualifying '{entity.text}' ({entity.type.value})")
+
+            # Run qualifier plugins - first one to return a result wins
+            canonical = None
+            type_qualifiers = PluginRegistry.get_qualifiers_for_type(entity.type)
+
+            for qualifier_plugin in type_qualifiers:
+                if not self.config.is_plugin_enabled(qualifier_plugin.name):
+                    continue
+
+                try:
+                    result = qualifier_plugin.qualify(entity, ctx)
+                    if result is not None:
+                        canonical = result
+                        logger.info(f"    Qualified by {qualifier_plugin.name}: {canonical.fqn}")
+                        break  # Use first successful match
+                except Exception as e:
+                    logger.error(f"Qualifier {qualifier_plugin.name} failed for {entity.text}: {e}")
+                    ctx.add_error(f"Qualifier {qualifier_plugin.name} failed: {str(e)}")
+                    if self.config.fail_fast:
+                        raise
+
+            # Create fallback CanonicalEntity if no plugin matched
+            if canonical is None:
+                qualified = QualifiedEntity(
+                    entity_ref=entity_ref,
+                    original_text=entity.text,
+                    entity_type=entity.type,
+                )
+                canonical = CanonicalEntity.from_qualified(qualified=qualified)
+                logger.debug("    No qualification found, using original text")
+
+            ctx.canonical_entities[entity_ref] = canonical
+
+        logger.info(f"Qualified {len(ctx.canonical_entities)} entities")
+        ctx.record_timing(stage_name, time.time() - start_time)
+        return ctx
+
+    def _run_labeling(self, ctx: PipelineContext) -> PipelineContext:
+        """Stage 4: Apply labels to statements."""
+        stage_name = get_stage_name(4)
+        logger.debug(f"Running {stage_name} stage")
+        start_time = time.time()
+
+        if not ctx.statements:
+            logger.debug("No statements to label")
+            return ctx
+
+        # Ensure canonical entities exist (run qualification if skipped)
+        if not ctx.canonical_entities:
+            self._run_qualification(ctx)
+
+        labelers = PluginRegistry.get_labelers()
+
+        for stmt in ctx.statements:
+            # Get canonical entities
+            subj_canonical = ctx.canonical_entities.get(stmt.subject.entity_ref)
+            obj_canonical = ctx.canonical_entities.get(stmt.object.entity_ref)
+
+            if not subj_canonical or not obj_canonical:
+                # Create fallback canonical entities
+                if not subj_canonical:
+                    subj_qualified = ctx.qualified_entities.get(
+                        stmt.subject.entity_ref,
+                        QualifiedEntity(
+                            entity_ref=stmt.subject.entity_ref,
+                            original_text=stmt.subject.text,
+                            entity_type=stmt.subject.type,
+                        )
+                    )
+                    subj_canonical = CanonicalEntity.from_qualified(subj_qualified)
+
+                if not obj_canonical:
+                    obj_qualified = ctx.qualified_entities.get(
+                        stmt.object.entity_ref,
+                        QualifiedEntity(
+                            entity_ref=stmt.object.entity_ref,
+                            original_text=stmt.object.text,
+                            entity_type=stmt.object.type,
+                        )
+                    )
+                    obj_canonical = CanonicalEntity.from_qualified(obj_qualified)
+
+            # Create labeled statement
+            labeled = LabeledStatement(
+                statement=stmt,
+                subject_canonical=subj_canonical,
+                object_canonical=obj_canonical,
+            )
+
+            # Apply all labelers
+            for labeler in labelers:
+                if not self.config.is_plugin_enabled(labeler.name):
+                    continue
+
+                try:
+                    label = labeler.label(stmt, subj_canonical, obj_canonical, ctx)
+                    if label:
+                        labeled.add_label(label)
+                except Exception as e:
+                    logger.error(f"Labeler {labeler.name} failed: {e}")
+                    ctx.add_error(f"Labeler {labeler.name} failed: {str(e)}")
+                    if self.config.fail_fast:
+                        raise
+
+            ctx.labeled_statements.append(labeled)
+
+        logger.info(f"Labeled {len(ctx.labeled_statements)} statements")
+        ctx.record_timing(stage_name, time.time() - start_time)
+        return ctx
+
+    def _run_taxonomy(self, ctx: PipelineContext) -> PipelineContext:
+        """Stage 5: Classify statements against taxonomies."""
+        from ..plugins.base import PluginCapability
+
+        stage_name = get_stage_name(5)
+        logger.debug(f"Running {stage_name} stage")
+        start_time = time.time()
+
+        if not ctx.labeled_statements:
+            logger.debug("No labeled statements to classify")
+            return ctx
+
+        taxonomy_classifiers = PluginRegistry.get_taxonomy_classifiers()
+        if not taxonomy_classifiers:
+            logger.debug("No taxonomy classifiers registered")
+            return ctx
+
+        total_results = 0
+
+        # Prepare batch items: list of (statement, subject_canonical, object_canonical)
+        batch_items = [
+            (labeled_stmt.statement, labeled_stmt.subject_canonical, labeled_stmt.object_canonical)
+            for labeled_stmt in ctx.labeled_statements
+        ]
+
+        # Apply all taxonomy classifiers
+        for classifier in taxonomy_classifiers:
+            if not self.config.is_plugin_enabled(classifier.name):
+                continue
+
+            try:
+                # Require batch processing capability
+                if PluginCapability.BATCH_PROCESSING not in classifier.capabilities:
+                    raise RuntimeError(
+                        f"Taxonomy classifier '{classifier.name}' does not support batch processing. "
+                        "Pipeline requires BATCH_PROCESSING capability for efficient GPU utilization."
+                    )
+
+                logger.debug(f"Using batch classification for {classifier.name} ({len(batch_items)} items)")
+                batch_results = classifier.classify_batch(batch_items, ctx)
+
+                # Apply results to each labeled statement
+                for labeled_stmt, results in zip(ctx.labeled_statements, batch_results):
+                    if results:
+                        stmt = labeled_stmt.statement
+                        key = (stmt.source_text, classifier.taxonomy_name)
+                        if key not in ctx.taxonomy_results:
+                            ctx.taxonomy_results[key] = []
+                        ctx.taxonomy_results[key].extend(results)
+                        total_results += len(results)
+                        labeled_stmt.taxonomy_results.extend(results)
+
+                        for result in results:
+                            logger.debug(
+                                f"Taxonomy {classifier.name}: {result.full_label} "
+                                f"(confidence={result.confidence:.2f})"
+                            )
+
+            except Exception as e:
+                logger.error(f"Taxonomy classifier {classifier.name} failed: {e}")
+                ctx.add_error(f"Taxonomy classifier {classifier.name} failed: {str(e)}")
+                if self.config.fail_fast:
+                    raise
+
+        logger.info(f"Taxonomy produced {total_results} labels across {len(ctx.taxonomy_results)} statement-taxonomy pairs")
+        ctx.record_timing(stage_name, time.time() - start_time)
+        return ctx
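
A hedged end-to-end sketch of driving the orchestrator above. It assumes the built-in plugins register themselves with PluginRegistry at import time (registry.py is listed in this diff but not shown) and infers the import paths from the file layout; the sample text and metadata are illustrative only.

    from statement_extractor.pipeline.config import PipelineConfig
    from statement_extractor.pipeline.orchestrator import ExtractionPipeline

    # PipelineConfig.default() supplies the default configuration; stage and
    # plugin toggles live in pipeline/config.py (listed above, not shown).
    pipeline = ExtractionPipeline(config=PipelineConfig.default())

    ctx = pipeline.process(
        "Acme Corp acquired Widget Ltd in 2023.",
        metadata={"url": "https://example.com/news"},  # hypothetical metadata
    )

    # process() returns the accumulated PipelineContext rather than a bare list.
    for labeled in ctx.labeled_statements:
        print(labeled.statement, labeled.subject_canonical.fqn)
    if ctx.has_errors:
        print(ctx.processing_errors)

Note that errors are accumulated on the context (with fail_fast as the opt-in escape hatch), so a single failing plugin degrades the result rather than aborting the whole run.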