corp-extractor 0.9.0__py3-none-any.whl → 0.9.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/METADATA +72 -11
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/RECORD +34 -27
- statement_extractor/cli.py +1317 -101
- statement_extractor/database/embeddings.py +45 -0
- statement_extractor/database/hub.py +86 -136
- statement_extractor/database/importers/__init__.py +10 -2
- statement_extractor/database/importers/companies_house.py +16 -2
- statement_extractor/database/importers/companies_house_officers.py +431 -0
- statement_extractor/database/importers/gleif.py +23 -0
- statement_extractor/database/importers/import_utils.py +264 -0
- statement_extractor/database/importers/sec_edgar.py +17 -0
- statement_extractor/database/importers/sec_form4.py +512 -0
- statement_extractor/database/importers/wikidata.py +151 -43
- statement_extractor/database/importers/wikidata_dump.py +2282 -0
- statement_extractor/database/importers/wikidata_people.py +867 -325
- statement_extractor/database/migrate_v2.py +852 -0
- statement_extractor/database/models.py +155 -7
- statement_extractor/database/schema_v2.py +409 -0
- statement_extractor/database/seed_data.py +359 -0
- statement_extractor/database/store.py +3449 -233
- statement_extractor/document/deduplicator.py +10 -12
- statement_extractor/extractor.py +1 -1
- statement_extractor/models/__init__.py +3 -2
- statement_extractor/models/statement.py +15 -17
- statement_extractor/models.py +1 -1
- statement_extractor/pipeline/context.py +5 -5
- statement_extractor/pipeline/orchestrator.py +12 -12
- statement_extractor/plugins/base.py +17 -17
- statement_extractor/plugins/extractors/gliner2.py +28 -28
- statement_extractor/plugins/qualifiers/embedding_company.py +7 -5
- statement_extractor/plugins/qualifiers/person.py +120 -53
- statement_extractor/plugins/splitters/t5_gemma.py +35 -39
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/WHEEL +0 -0
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/entry_points.txt +0 -0
statement_extractor/document/deduplicator.py
CHANGED

@@ -2,6 +2,8 @@
 StatementDeduplicator - Hash-based deduplication for statements.
 
 Removes duplicate statements across chunks using normalized hashing.
+Works with Stage 2+ output (PipelineStatement, LabeledStatement) which
+have subject-predicate-object structure.
 """
 
 import hashlib
@@ -9,12 +11,12 @@ import logging
 from typing import TypeVar, Union
 
 from ..models.labels import LabeledStatement
-from ..models.statement import PipelineStatement
+from ..models.statement import PipelineStatement
 
 logger = logging.getLogger(__name__)
 
 # Type variable for generic deduplication
-T = TypeVar("T",
+T = TypeVar("T", PipelineStatement, LabeledStatement)
 
 
 class StatementDeduplicator:
@@ -23,6 +25,8 @@ class StatementDeduplicator:
 
     Uses a hash of normalized (subject, predicate, object) to identify
     duplicates. Keeps the first occurrence of each unique statement.
+
+    Works with PipelineStatement (Stage 2) and LabeledStatement (Stage 4).
     """
 
     def __init__(self):
@@ -46,20 +50,14 @@ class StatementDeduplicator:
 
     def _get_triple_parts(
         self,
-        stmt: Union[
+        stmt: Union[PipelineStatement, LabeledStatement],
     ) -> tuple[str, str, str]:
         """
        Extract (subject, predicate, object) from a statement.
 
        Handles different statement types consistently.
        """
-        if isinstance(stmt,
-            return (
-                stmt.subject_text,
-                stmt.predicate_text,
-                stmt.object_text,
-            )
-        elif isinstance(stmt, LabeledStatement):
+        if isinstance(stmt, LabeledStatement):
             return (
                 stmt.statement.subject.text,
                 stmt.statement.predicate,
@@ -75,7 +73,7 @@ class StatementDeduplicator:
 
     def _hash_triple(
         self,
-        stmt: Union[
+        stmt: Union[PipelineStatement, LabeledStatement],
     ) -> str:
         """
         Generate a hash for a statement triple.
@@ -96,7 +94,7 @@ class StatementDeduplicator:
 
     def is_duplicate(
         self,
-        stmt: Union[
+        stmt: Union[PipelineStatement, LabeledStatement],
     ) -> bool:
         """
         Check if a statement is a duplicate.
statement_extractor/extractor.py
CHANGED
@@ -392,7 +392,7 @@ class StatementExtractor:
         This is the new extraction pipeline that:
         1. Generates multiple candidates via DBS
         2. Parses each to statements
-        3. Scores each triple for
+        3. Scores each triple for quality (semantic + entity)
         4. Merges top beams or selects best beam
         5. Deduplicates using embeddings (if enabled)
         """
statement_extractor/models/__init__.py
CHANGED

@@ -43,7 +43,7 @@ else:
 
     # New pipeline models
     from .entity import ExtractedEntity
-    from .statement import RawTriple, PipelineStatement
+    from .statement import SplitSentence, RawTriple, PipelineStatement
     from .qualifiers import EntityQualifiers, QualifiedEntity, ResolvedRole, ResolvedOrganization
     from .canonical import CanonicalMatch, CanonicalEntity
     from .labels import StatementLabel, LabeledStatement, TaxonomyResult
@@ -69,7 +69,8 @@ __all__ = [
     "ExtractionOptions",
     # New pipeline models
     "ExtractedEntity",
-    "
+    "SplitSentence",
+    "RawTriple",  # Backwards compatibility alias for SplitSentence
     "PipelineStatement",
     "EntityQualifiers",
     "QualifiedEntity",
statement_extractor/models/statement.py
CHANGED

@@ -1,8 +1,8 @@
 """
 Statement models for the extraction pipeline.
 
-
-PipelineStatement: Output of Stage 2 (Extraction) with
+SplitSentence: Output of Stage 1 (Splitting) - atomic sentences/statements
+PipelineStatement: Output of Stage 2 (Extraction) with subject-predicate-object triples
 """
 
 from typing import Optional
@@ -12,22 +12,20 @@ from pydantic import BaseModel, Field
 from .entity import ExtractedEntity
 
 
-class
+class SplitSentence(BaseModel):
     """
-
+    An atomic sentence from Stage 1 (Splitting).
 
-
-    Generated by T5-Gemma
+    Stage 1 splits text into atomic sentences that can each be converted
+    to subject-predicate-object triples in Stage 2. Generated by T5-Gemma
+    or other splitting plugins.
     """
-
-    predicate_text: str = Field(..., description="Raw predicate text")
-    object_text: str = Field(..., description="Raw object text")
-    source_sentence: str = Field(..., description="The source sentence this triple was extracted from")
+    text: str = Field(..., description="The atomic sentence text")
     confidence: float = Field(
         default=1.0,
         ge=0.0,
         le=1.0,
-        description="
+        description="Confidence that this is a valid atomic statement"
     )
     # Document tracking fields
     document_id: Optional[str] = Field(
@@ -36,19 +34,19 @@ class RawTriple(BaseModel):
     )
     page_number: Optional[int] = Field(
         None,
-        description="Page number where this
+        description="Page number where this sentence was extracted (1-indexed)"
     )
     chunk_index: Optional[int] = Field(
         None,
-        description="Index of the chunk this
+        description="Index of the chunk this sentence was extracted from (0-indexed)"
    )
 
     def __str__(self) -> str:
-        return
+        return self.text
 
-
-
-
+
+# Backwards compatibility alias
+RawTriple = SplitSentence
 
 
 class PipelineStatement(BaseModel):
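Going by the fields visible in this diff, constructing the new model looks roughly like the following (a hedged sketch; the example values are invented, and the import path is inferred from the package layout):

```python
from statement_extractor.models import RawTriple, SplitSentence

sentence = SplitSentence(
    text="Acme Corp acquired Widget Inc in 2023.",
    confidence=0.95,
    page_number=3,
    chunk_index=0,
)

print(str(sentence))               # prints the sentence text itself
assert RawTriple is SplitSentence  # the old name still resolves via the alias
```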
statement_extractor/models.py
CHANGED
@@ -217,7 +217,7 @@ class ScoringConfig(BaseModel):
     quality_weight: float = Field(
         default=1.0,
         ge=0.0,
-        description="Weight for
+        description="Weight for confidence scores in beam selection"
     )
     coverage_weight: float = Field(
         default=0.5,
statement_extractor/pipeline/context.py
CHANGED

@@ -2,7 +2,7 @@
 PipelineContext - Data container that flows through all pipeline stages.
 
 The context accumulates outputs from each stage:
-- Stage 1 (Splitting):
+- Stage 1 (Splitting): split_sentences
 - Stage 2 (Extraction): statements
 - Stage 3 (Qualification): qualified_entities
 - Stage 4 (Canonicalization): canonical_entities
@@ -14,7 +14,7 @@ from typing import Any, Optional
 from pydantic import BaseModel, Field
 
 from ..models import (
-
+    SplitSentence,
     PipelineStatement,
     QualifiedEntity,
     CanonicalEntity,
@@ -37,10 +37,10 @@ class PipelineContext(BaseModel):
         description="Metadata about the source (e.g., document ID, URL, timestamp)"
     )
 
-    # Stage 1 output:
-
+    # Stage 1 output: Split sentences
+    split_sentences: list[SplitSentence] = Field(
         default_factory=list,
-        description="
+        description="Atomic sentences from Stage 1 (Splitting)"
     )
 
     # Stage 2 output: Statements with extracted entities
statement_extractor/pipeline/orchestrator.py
CHANGED

@@ -2,8 +2,8 @@
 ExtractionPipeline - Main orchestrator for the 5-stage extraction pipeline.
 
 Coordinates the flow of data through all pipeline stages:
-1. Splitting: Text →
-2. Extraction:
+1. Splitting: Text → SplitSentence (atomic sentences)
+2. Extraction: SplitSentence → PipelineStatement (subject-predicate-object triples)
 3. Qualification: Entity → CanonicalEntity
 4. Labeling: Statement → LabeledStatement
 5. Taxonomy: Statement → TaxonomyResult
@@ -31,8 +31,8 @@ class ExtractionPipeline:
     Main pipeline orchestrator.
 
     Coordinates the flow of data through all 5 stages:
-    1. Splitting: Text →
-    2. Extraction:
+    1. Splitting: Text → SplitSentence (using splitter plugins)
+    2. Extraction: SplitSentence → PipelineStatement (using extractor plugins)
     3. Qualification: Entity → CanonicalEntity (using qualifier + canonicalizer plugins)
     4. Labeling: Statement → LabeledStatement (using labeler plugins)
     5. Taxonomy: Statement → TaxonomyResult (using taxonomy plugins)
@@ -115,7 +115,7 @@ class ExtractionPipeline:
         return ctx
 
     def _run_splitting(self, ctx: PipelineContext) -> PipelineContext:
-        """Stage 1: Split text into
+        """Stage 1: Split text into atomic sentences."""
         stage_name = get_stage_name(1)
         logger.debug(f"Running {stage_name} stage")
         start_time = time.time()
@@ -132,9 +132,9 @@ class ExtractionPipeline:
 
             logger.debug(f"Using splitter: {splitter.name}")
             try:
-
-                ctx.
-                logger.info(f"Splitting produced {len(
+                split_sentences = splitter.split(ctx.source_text, ctx)
+                ctx.split_sentences = split_sentences
+                logger.info(f"Splitting produced {len(split_sentences)} sentences")
                 break
             except Exception as e:
                 logger.exception(f"Splitter {splitter.name} failed")
@@ -146,13 +146,13 @@ class ExtractionPipeline:
         return ctx
 
     def _run_extraction(self, ctx: PipelineContext) -> PipelineContext:
-        """Stage 2: Extract
+        """Stage 2: Extract subject-predicate-object triples from split sentences."""
         stage_name = get_stage_name(2)
         logger.debug(f"Running {stage_name} stage")
         start_time = time.time()
 
-        if not ctx.
-            logger.debug("No
+        if not ctx.split_sentences:
+            logger.debug("No split sentences to extract from")
             return ctx
 
         extractors = PluginRegistry.get_extractors()
@@ -177,7 +177,7 @@ class ExtractionPipeline:
 
             logger.debug(f"Using extractor: {extractor.name}")
             try:
-                statements = extractor.extract(ctx.
+                statements = extractor.extract(ctx.split_sentences, ctx)
                 ctx.statements = statements
                 logger.info(f"Extraction produced {len(statements)} statements")
                 break
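Stripped of logging, timing, and plugin-failure handling, the Stage 1 → Stage 2 hand-off shown above boils down to the following (a simplified sketch, not the orchestrator's literal code):

```python
def run_stages_1_and_2(ctx, splitter, extractor):
    """Sketch: splitter output feeds the extractor via the pipeline context."""
    # Stage 1: split raw text into atomic sentences
    ctx.split_sentences = splitter.split(ctx.source_text, ctx)

    # Stage 2: turn split sentences into subject-predicate-object statements
    if ctx.split_sentences:
        ctx.statements = extractor.extract(ctx.split_sentences, ctx)
    return ctx
```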
statement_extractor/plugins/base.py
CHANGED

@@ -2,8 +2,8 @@
 Base plugin classes for the extraction pipeline.
 
 Defines the abstract interfaces for each pipeline stage:
-- BaseSplitterPlugin: Stage 1 - Text →
-- BaseExtractorPlugin: Stage 2 -
+- BaseSplitterPlugin: Stage 1 - Text → SplitSentence (atomic sentences)
+- BaseExtractorPlugin: Stage 2 - SplitSentence → PipelineStatement (triples)
 - BaseQualifierPlugin: Stage 3 - Entity → CanonicalEntity
 - BaseLabelerPlugin: Stage 4 - Statement → StatementLabel
 - BaseTaxonomyPlugin: Stage 5 - Statement → TaxonomyResult
@@ -22,7 +22,7 @@ from pydantic import BaseModel, Field
 if TYPE_CHECKING:
     from ..pipeline.context import PipelineContext
     from ..models import (
-
+        SplitSentence,
         PipelineStatement,
         ExtractedEntity,
         CanonicalEntity,
@@ -173,10 +173,10 @@ class BasePlugin(ABC):
 
 class BaseSplitterPlugin(BasePlugin):
     """
-    Stage 1 plugin: Split text into atomic
+    Stage 1 plugin: Split text into atomic sentences.
 
-    Takes raw text and produces
-
+    Takes raw text and produces SplitSentence objects containing
+    atomic statements that can be converted to triples in Stage 2.
     """
 
     @abstractmethod
@@ -184,16 +184,16 @@ class BaseSplitterPlugin(BasePlugin):
         self,
         text: str,
         context: "PipelineContext",
-    ) -> list["
+    ) -> list["SplitSentence"]:
         """
-        Split text into atomic
+        Split text into atomic sentences.
 
         Args:
             text: Input text to split
             context: Pipeline context for accessing metadata and config
 
         Returns:
-            List of
+            List of SplitSentence objects
         """
         ...
 
@@ -201,9 +201,9 @@ class BaseSplitterPlugin(BasePlugin):
         self,
         texts: list[str],
         context: "PipelineContext",
-    ) -> list[list["
+    ) -> list[list["SplitSentence"]]:
         """
-        Split multiple texts into atomic
+        Split multiple texts into atomic sentences in a single batch.
 
         Default implementation calls split() for each text sequentially.
         Plugins with BATCH_PROCESSING capability should override this
@@ -214,16 +214,16 @@ class BaseSplitterPlugin(BasePlugin):
             context: Pipeline context for accessing metadata and config
 
         Returns:
-            List of
+            List of SplitSentence lists, one per input text
         """
         return [self.split(text, context) for text in texts]
 
 
 class BaseExtractorPlugin(BasePlugin):
     """
-    Stage 2 plugin:
+    Stage 2 plugin: Extract subject-predicate-object triples from sentences.
 
-    Takes
+    Takes SplitSentence objects and produces PipelineStatement objects
     with ExtractedEntity subjects/objects that have types, spans,
     and confidence scores.
     """
@@ -231,14 +231,14 @@ class BaseExtractorPlugin(BasePlugin):
     @abstractmethod
     def extract(
         self,
-
+        split_sentences: list["SplitSentence"],
         context: "PipelineContext",
     ) -> list["PipelineStatement"]:
         """
-        Extract
+        Extract triples from split sentences.
 
         Args:
-
+            split_sentences: Atomic sentences from Stage 1
             context: Pipeline context
 
         Returns:
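As an illustration of the Stage 1 interface above, a toy splitter plugin could look like this. The class name and regex are mine, and it assumes split() is the only method you must provide; real plugins would also register with PluginRegistry and satisfy any other BasePlugin requirements:

```python
import re

from statement_extractor.models import SplitSentence
from statement_extractor.plugins.base import BaseSplitterPlugin


class NaiveSentenceSplitter(BaseSplitterPlugin):
    """Toy Stage 1 plugin: split on sentence-ending punctuation."""

    def split(self, text, context):
        # Break on whitespace that follows ., !, or ? and drop empty pieces
        pieces = [p.strip() for p in re.split(r"(?<=[.!?])\s+", text) if p.strip()]
        return [SplitSentence(text=piece) for piece in pieces]
```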
statement_extractor/plugins/extractors/gliner2.py
CHANGED

@@ -1,9 +1,9 @@
 """
-GLiNER2Extractor - Stage 2 plugin that
+GLiNER2Extractor - Stage 2 plugin that extracts triples from sentences.
 
 Uses GLiNER2 for:
-1. Entity extraction:
-2. Relation extraction:
+1. Entity extraction: Identify subject/object entities with types
+2. Relation extraction: Extract predicates using predicate list
 3. Entity scoring: Score how entity-like subjects/objects are
 4. Classification: Run labeler classification schemas in single pass
 """
@@ -16,7 +16,7 @@ from typing import Optional
 from ..base import BaseExtractorPlugin, ClassificationSchema, PluginCapability
 from ...pipeline.context import PipelineContext
 from ...pipeline.registry import PluginRegistry
-from ...models import
+from ...models import SplitSentence, PipelineStatement, ExtractedEntity, EntityType
 
 logger = logging.getLogger(__name__)
 
@@ -110,11 +110,11 @@ GLINER_TYPE_MAP = {
 @PluginRegistry.extractor
 class GLiNER2Extractor(BaseExtractorPlugin):
     """
-    Extractor plugin that uses GLiNER2 for entity and relation
+    Extractor plugin that uses GLiNER2 for entity and relation extraction.
 
-    Processes
-    objects with typed entities.
-    labeler plugins in a single pass.
+    Processes split sentences from Stage 1 and produces PipelineStatement
+    objects with subject-predicate-object triples and typed entities.
+    Also runs classification schemas from labeler plugins in a single pass.
     """
 
     def __init__(
@@ -209,36 +209,36 @@ class GLiNER2Extractor(BaseExtractorPlugin):
 
     def extract(
         self,
-
+        split_sentences: list[SplitSentence],
         context: PipelineContext,
     ) -> list[PipelineStatement]:
         """
-        Extract
+        Extract subject-predicate-object triples from split sentences using GLiNER2.
 
         Returns ALL matching relations from GLiNER2 (not just the best one).
         Also runs any classification schemas and stores results in context.
 
         Args:
-
+            split_sentences: Atomic sentences from Stage 1
             context: Pipeline context
 
         Returns:
-            List of PipelineStatement objects (may contain multiple per
+            List of PipelineStatement objects (may contain multiple per sentence)
         """
         predicate_categories = self._get_predicate_categories()
-        logger.info(f"GLiNER2Extractor processing {len(
+        logger.info(f"GLiNER2Extractor processing {len(split_sentences)} sentences")
         logger.info(f"Using {len(predicate_categories)} predicate categories")
 
         statements = []
         model = self._get_model()
         classified_texts: set[str] = set()
 
-        for
+        for sentence in split_sentences:
             try:
                 if model:
                     # Use relation extraction iterating through categories
                     # Returns ALL matches, not just the best one
-                    extracted_stmts = self._extract_with_relations(
+                    extracted_stmts = self._extract_with_relations(sentence, model, predicate_categories)
                 else:
                     # No model available - skip
                     logger.warning("No GLiNER2 model available - skipping extraction")
@@ -253,10 +253,10 @@ class GLiNER2Extractor(BaseExtractorPlugin):
                     classified_texts.add(stmt.source_text)
 
             except Exception as e:
-                logger.warning(f"Error extracting
-                # No fallback - skip this
+                logger.warning(f"Error extracting from sentence: {e}")
+                # No fallback - skip this sentence
 
-        logger.info(f"GLiNER2Extractor produced {len(statements)} statements from {len(
+        logger.info(f"GLiNER2Extractor produced {len(statements)} statements from {len(split_sentences)} sentences")
         return statements
 
     def _run_classifications(
@@ -316,7 +316,7 @@ class GLiNER2Extractor(BaseExtractorPlugin):
 
     def _extract_with_relations(
         self,
-
+        sentence: SplitSentence,
         model,
         predicate_categories: dict[str, dict[str, PredicateConfig]],
     ) -> list[PipelineStatement]:
@@ -328,14 +328,14 @@ class GLiNER2Extractor(BaseExtractorPlugin):
         Returns ALL matching relations, not just the best one.
 
         Args:
-
+            sentence: Split sentence from Stage 1
             model: GLiNER2 model instance
             predicate_categories: Dict of category -> predicates to use
 
         Returns:
             List of PipelineStatements for all relations found
         """
-        logger.debug(f"Attempting relation extraction for: '{
+        logger.debug(f"Attempting relation extraction for: '{sentence.text[:80]}...'")
 
         # Iterate through each category separately to stay under GLiNER2's ~25 label limit
         # Use schema API with entities + relations together for better extraction
@@ -355,7 +355,7 @@ class GLiNER2Extractor(BaseExtractorPlugin):
                 .entities(self._get_entity_types())
                 .relations(relations_dict)
             )
-            result = model.extract(
+            result = model.extract(sentence.text, schema, include_confidence=True)
 
             # Get relations from this category
             relation_data = result.get("relations", result.get("relation_extraction", {}))
@@ -379,7 +379,7 @@ class GLiNER2Extractor(BaseExtractorPlugin):
         logger.debug(f"  GLiNER2 found {total_found} total relations across all categories")
 
         if not all_relations:
-            logger.debug(f"No GLiNER2 relation match in: '{
+            logger.debug(f"No GLiNER2 relation match in: '{sentence.text[:60]}...'")
             return []
 
         # Filter by confidence threshold and sort descending
@@ -402,8 +402,8 @@ class GLiNER2Extractor(BaseExtractorPlugin):
             )
 
             # Get entity types
-            subj_type = self._infer_entity_type(head, model,
-            obj_type = self._infer_entity_type(tail, model,
+            subj_type = self._infer_entity_type(head, model, sentence.text)
+            obj_type = self._infer_entity_type(tail, model, sentence.text)
             logger.debug(f"  Entity types: {subj_type.value}, {obj_type.value}")
 
             stmt = PipelineStatement(
@@ -419,7 +419,7 @@ class GLiNER2Extractor(BaseExtractorPlugin):
                     type=obj_type,
                     confidence=confidence,
                 ),
-                source_text=
+                source_text=sentence.text,
                 confidence_score=confidence,
                 extraction_method="gliner_relation",
             )
@@ -429,7 +429,7 @@ class GLiNER2Extractor(BaseExtractorPlugin):
 
     def _extract_with_entities(
         self,
-
+        sentence: SplitSentence,
         model,
     ) -> Optional[PipelineStatement]:
         """
@@ -438,7 +438,7 @@ class GLiNER2Extractor(BaseExtractorPlugin):
         This method is called when predicates are disabled. Without GLiNER2 relation
         extraction, we cannot form valid statements.
         """
-        logger.debug(f"Entity extraction mode (no predicates) - skipping: '{
+        logger.debug(f"Entity extraction mode (no predicates) - skipping: '{sentence.text[:60]}...'")
         return None
 
     def _parse_relation(self, rel) -> tuple[str, str, float]:
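The control flow of the rewritten extract() loop reduces to roughly this shape; extract_one stands in for the GLiNER2 relation-extraction call, so this is a sketch of the loop rather than the plugin itself:

```python
def extract_per_sentence(split_sentences, extract_one):
    """Run extraction sentence by sentence, skipping sentences that fail."""
    statements = []
    for sentence in split_sentences:
        try:
            # extract_one returns zero or more statements for a single sentence
            statements.extend(extract_one(sentence))
        except Exception:
            # No fallback: a sentence that fails extraction is simply skipped
            continue
    return statements
```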
statement_extractor/plugins/qualifiers/embedding_company.py
CHANGED

@@ -60,7 +60,7 @@ class EmbeddingCompanyQualifier(BaseQualifierPlugin):
         self,
         db_path: Optional[str] = None,
         top_k: int = 20,
-        min_similarity: float = 0.
+        min_similarity: float = 0.3,
         use_llm_confirmation: bool = True,
         auto_download_db: bool = True,
     ):
@@ -215,11 +215,13 @@ class EmbeddingCompanyQualifier(BaseQualifierPlugin):
             self._cache[cache_key] = None
             return None
 
-        # Log all candidates
-        logger.info(f"  Found {len(results)} candidates for '{entity.text}':")
-        for i, (record,
+        # Log all candidates (scores are prominence-adjusted)
+        logger.info(f"  Found {len(results)} candidates for '{entity.text}' (prominence-adjusted):")
+        for i, (record, score) in enumerate(results[:10], 1):
             region_str = f" [{record.region}]" if record.region else ""
-
+            ticker = record.record.get("ticker", "")
+            ticker_str = f" ticker={ticker}" if ticker else ""
+            logger.info(f"    {i}. {record.name}{region_str} (score={score:.3f}, source={record.source}{ticker_str})")
 
         # Get best match (optionally with LLM confirmation)
         logger.info(f"  Selecting best match (LLM={self._use_llm_confirmation})...")