corp_extractor-0.9.0-py3-none-any.whl → corp_extractor-0.9.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/METADATA +72 -11
  2. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/RECORD +34 -27
  3. statement_extractor/cli.py +1317 -101
  4. statement_extractor/database/embeddings.py +45 -0
  5. statement_extractor/database/hub.py +86 -136
  6. statement_extractor/database/importers/__init__.py +10 -2
  7. statement_extractor/database/importers/companies_house.py +16 -2
  8. statement_extractor/database/importers/companies_house_officers.py +431 -0
  9. statement_extractor/database/importers/gleif.py +23 -0
  10. statement_extractor/database/importers/import_utils.py +264 -0
  11. statement_extractor/database/importers/sec_edgar.py +17 -0
  12. statement_extractor/database/importers/sec_form4.py +512 -0
  13. statement_extractor/database/importers/wikidata.py +151 -43
  14. statement_extractor/database/importers/wikidata_dump.py +2282 -0
  15. statement_extractor/database/importers/wikidata_people.py +867 -325
  16. statement_extractor/database/migrate_v2.py +852 -0
  17. statement_extractor/database/models.py +155 -7
  18. statement_extractor/database/schema_v2.py +409 -0
  19. statement_extractor/database/seed_data.py +359 -0
  20. statement_extractor/database/store.py +3449 -233
  21. statement_extractor/document/deduplicator.py +10 -12
  22. statement_extractor/extractor.py +1 -1
  23. statement_extractor/models/__init__.py +3 -2
  24. statement_extractor/models/statement.py +15 -17
  25. statement_extractor/models.py +1 -1
  26. statement_extractor/pipeline/context.py +5 -5
  27. statement_extractor/pipeline/orchestrator.py +12 -12
  28. statement_extractor/plugins/base.py +17 -17
  29. statement_extractor/plugins/extractors/gliner2.py +28 -28
  30. statement_extractor/plugins/qualifiers/embedding_company.py +7 -5
  31. statement_extractor/plugins/qualifiers/person.py +120 -53
  32. statement_extractor/plugins/splitters/t5_gemma.py +35 -39
  33. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/WHEEL +0 -0
  34. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/entry_points.txt +0 -0

statement_extractor/plugins/qualifiers/person.py
@@ -9,7 +9,6 @@ Then searches the person database to find canonical matches for notable people
  (those in Wikipedia/Wikidata), using extracted role/org to help disambiguate.
  """
 
- import json
  import logging
  import re
  from typing import Optional
@@ -44,11 +43,12 @@ Candidates from database (with Wikipedia info):
 
  Task: Select the BEST match, or respond "NONE" if no candidate is a good match.
 
- Rules:
- - The match should refer to the same person
- - Consider whether the role and organization from the text match the Wikipedia info
- - Different people with similar names should NOT match
- - If the extracted name is too generic or ambiguous, respond "NONE"
+ IMPORTANT RULES:
+ 1. The candidate name must closely match the extracted name "{query_name}"
+ 2. Similar-sounding names are NOT matches (e.g., "Andy Vassies" does NOT match "Andy Jassy")
+ 3. If no candidate has a name that matches "{query_name}", respond "NONE"
+ 4. Consider role and organization context only AFTER confirming name match
+ 5. When in doubt, prefer "NONE" over a wrong match
 
  Respond with ONLY the number of the best match (1, 2, 3, etc.) or "NONE".
  """
@@ -260,7 +260,7 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
  if result and (result.role or result.org):
  qualifiers = result
 
- # Fallback to pattern matching
+ # Fallback to pattern matching (only if LLM extraction returned nothing)
  if qualifiers is None:
  qualifiers = self._extract_with_patterns(entity.text, full_text)
 
@@ -313,42 +313,79 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
  if embedder is None:
  return None
 
- # Embed the person name
- logger.debug(f" Embedding person name: '{person_name}'")
- query_embedding = embedder.embed(person_name)
+ # Log extracted context
+ logger.debug(f" Person search context:")
+ logger.debug(f" Name: '{person_name}'")
+ logger.debug(f" Extracted role: {extracted_role or '(none)'}")
+ logger.debug(f" Extracted org: {extracted_org or '(none)'}")
 
- # Search database with text pre-filtering
+ # Build query text with context for better embedding match
+ # This matches how PersonRecord.get_embedding_text() builds embedding text
+ query_parts = [person_name]
+ if extracted_role:
+ query_parts.append(extracted_role)
+ if extracted_org:
+ query_parts.append(extracted_org)
+ query_text = " | ".join(query_parts)
+
+ logger.debug(f" Embedding query: '{query_text}'")
+ query_embedding = embedder.embed(query_text)
+
+ # Search database with text pre-filtering on name only
  logger.debug(f" Searching person database...")
  results = database.search(
  query_embedding,
- top_k=self._top_k,
+ top_k=self._top_k * 3, # Fetch more to allow for org filtering
  query_text=person_name,
  )
 
+ logger.debug(f" Database returned {len(results)} raw results")
+
+ # If org was extracted, boost candidates that match the org
+ if extracted_org:
+ # Re-score with org preference
+ org_matched = []
+ org_unmatched = []
+ for record, sim in results:
+ if record.known_for_org and self._org_matches(extracted_org, record.known_for_org):
+ logger.debug(f" Org match: {record.name} at {record.known_for_org}")
+ org_matched.append((record, sim))
+ else:
+ org_unmatched.append((record, sim))
+ # Prioritize org matches
+ if org_matched:
+ logger.info(f" Found {len(org_matched)} candidates matching org '{extracted_org}'")
+ results = org_matched + org_unmatched
+ else:
+ logger.debug(f" No candidates match org '{extracted_org}'")
+
  # Filter by minimum similarity
  results = [(r, s) for r, s in results if s >= self._min_similarity]
+ logger.debug(f" After min_similarity filter ({self._min_similarity}): {len(results)} results")
 
  if not results:
  logger.debug(f" No person matches found above threshold {self._min_similarity}")
  return None
 
- # Boost scores based on role/org matching
+ # Boost scores based on name/role/org matching
  scored_results = []
  for record, similarity in results:
  boosted_score = self._compute_match_score(
- record, similarity, extracted_role, extracted_org
+ record, similarity, extracted_role, extracted_org, query_name=person_name
  )
  scored_results.append((record, similarity, boosted_score))
 
  # Sort by boosted score
  scored_results.sort(key=lambda x: x[2], reverse=True)
 
- # Log top candidates
+ # Log top candidates with detailed context
  logger.info(f" Found {len(scored_results)} candidates for '{person_name}':")
  for i, (record, sim, boosted) in enumerate(scored_results[:5], 1):
  role_str = f" ({record.known_for_role})" if record.known_for_role else ""
  org_str = f" at {record.known_for_org}" if record.known_for_org else ""
- logger.info(f" {i}. {record.name}{role_str}{org_str} (sim={sim:.3f}, boosted={boosted:.3f})")
+ boost_delta = boosted - sim
+ boost_info = f" [+{boost_delta:.3f} boost]" if boost_delta > 0 else ""
+ logger.info(f" {i}. {record.name}{role_str}{org_str} (sim={sim:.3f}, boosted={boosted:.3f}{boost_info})")
 
  # Select best match using LLM if available
  logger.info(f" Selecting best match (LLM={self._llm is not None})...")
@@ -373,6 +410,9 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
  "similarity": similarity,
  "known_for_role": record.known_for_role,
  "known_for_org": record.known_for_org,
+ "birth_date": record.birth_date,
+ "death_date": record.death_date,
+ "is_historic": record.is_historic,
  },
  )
 
@@ -382,6 +422,7 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
  embedding_similarity: float,
  extracted_role: Optional[str],
  extracted_org: Optional[str],
+ query_name: Optional[str] = None,
  ) -> float:
  """
  Compute boosted match score using role/org context.
@@ -390,6 +431,14 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
  """
  score = embedding_similarity
 
+ # Major boost for exact name match (normalized)
+ if query_name:
+ query_norm = self._normalize_person_name(query_name)
+ record_norm = self._normalize_person_name(record.name)
+ if query_norm == record_norm:
+ score += 0.25 # +25% boost for exact name match
+ logger.debug(f" Exact name match boost: '{query_name}' == '{record.name}'")
+
  # Boost if role matches (fuzzy)
  if extracted_role and record.known_for_role:
  if self._role_matches(extracted_role, record.known_for_role):
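
Note: the name boost is additive, not multiplicative. Worked example: a candidate whose normalized name equals the query at sim=0.62 ends up at 0.87 and overtakes a near-miss name at sim=0.80 — presumably the intent, so that an exact name beats a slightly better embedding match.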
@@ -455,6 +504,18 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
 
  return False
 
+ def _normalize_person_name(self, name: str) -> str:
+ """Normalize person name for comparison."""
+ # Lowercase and strip
+ normalized = name.lower().strip()
+ # Remove common titles
+ for title in ["dr.", "dr ", "mr.", "mr ", "mrs.", "mrs ", "ms.", "ms ", "prof.", "prof "]:
+ if normalized.startswith(title):
+ normalized = normalized[len(title):]
+ # Remove extra whitespace
+ normalized = " ".join(normalized.split())
+ return normalized
+
  def _normalize_org_name(self, name: str) -> str:
  """Simple org name normalization."""
  # Lowercase
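
Note: a quick check of the new normalizer's behavior (assuming qualifier is a PersonQualifierPlugin instance; results worked out by hand from the code above):

    qualifier._normalize_person_name("Dr. Jane   Smith")    # -> 'jane smith'
    qualifier._normalize_person_name("  MR WERNER VOGELS")  # -> 'werner vogels'
    qualifier._normalize_person_name("Tim Cook")            # -> 'tim cook'

Titles are stripped only from the start of the string, so a suffix such as "Jane Smith, M.D." passes through intact.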
@@ -524,8 +585,15 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
  role_str = f", {record.known_for_role}" if record.known_for_role else ""
  org_str = f" at {record.known_for_org}" if record.known_for_org else ""
  country_str = f", {record.country}" if record.country else ""
+ # Include life dates for context (helps identify historic figures)
+ dates_parts = []
+ if record.birth_date:
+ dates_parts.append(f"b. {record.birth_date[:4]}") # Just year
+ if record.death_date:
+ dates_parts.append(f"d. {record.death_date[:4]}") # Just year
+ dates_str = f" [{' - '.join(dates_parts)}]" if dates_parts else ""
  candidate_lines.append(
- f"{i}. {record.name}{role_str}{org_str}{country_str}{dates_str} (score: {boosted:.2f})"
+ f"{i}. {record.name}{role_str}{org_str}{country_str}{dates_str} (score: {boosted:.2f})"
  )
 
  # Build context info from extracted role/org
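
Note: with these additions, a candidate line shown to the selection LLM looks like the following (illustrative record, not real data; the [:4] slice assumes birth_date/death_date are ISO-format date strings, so it yields the year):

    3. Henry Ford, Industrialist at Ford Motor Company, US [b. 1863 - d. 1947] (score: 0.81)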
@@ -582,52 +650,51 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
  person_name: str,
  context_text: str,
  ) -> Optional[EntityQualifiers]:
- """Extract role and org using Gemma3."""
+ """Extract role and org using Gemma3 with simple line-based output."""
  if self._llm is None:
  return None
 
  try:
- prompt = f"""Extract qualifiers for a person from the given context.
- Instructions:
- - "role" = job title or position (e.g., "CEO", "President", "Director")
- - "org" = company or organization name (e.g., "Amazon", "Apple Inc", "Microsoft")
- - These are DIFFERENT things: role is a job title, org is a company name
- - Return null for fields not mentioned in the context
-
- Return ONLY valid JSON:
-
- E.g.
- <context>We interviewed Big Ducks Quacking Inc team. James is new in the role of the CEO</context>
- <person>James</person>
-
- Should return:
+ prompt = f"""Extract info about "{person_name}" from the text below.
+ Reply with exactly 3 lines:
+ NAME: the person's full name
+ ROLE: their job title (CEO, President, etc.) or NONE
+ ORG: the company/organization name or NONE
 
- {{"role": "CEO", "org": "Big Ducks Quacking Inc"}}
+ Text: {context_text[:500]}
 
- ---
+ NAME:"""
 
- <context>{context_text}</context>
- <person>{person_name}</person>
- """
-
- logger.debug(f"LLM request: {prompt}")
+ logger.debug(f"LLM extraction prompt for '{person_name}'")
  response = self._llm.generate(prompt, max_tokens=100, stop=["\n\n", "</s>"])
  logger.debug(f"LLM response: {response}")
 
- # Extract JSON from response
- json_match = re.search(r'\{[^}]+\}', response)
- if json_match:
- data = json.loads(json_match.group())
- role = data.get("role")
- org = data.get("org")
-
- # Validate: role and org should be different (reject if same)
- if role and org and role.lower() == org.lower():
- logger.debug(f"Rejected duplicate role/org: {role}")
- org = None # Clear org if it's same as role
-
- if role or org:
- return EntityQualifiers(role=role, org=org)
+ # Parse line-based response
+ lines = response.strip().split("\n")
+ name = None
+ role = None
+ org = None
+
+ for line in lines:
+ line = line.strip()
+ if line.startswith("NAME:"):
+ name = line[5:].strip()
+ elif line.startswith("ROLE:"):
+ val = line[5:].strip()
+ if val.upper() != "NONE":
+ role = val
+ elif line.startswith("ORG:"):
+ val = line[4:].strip()
+ if val.upper() != "NONE":
+ org = val
+ # Handle case where first line is just the name (after our "NAME:" in prompt)
+ elif not name and line and not line.startswith(("ROLE", "ORG")):
+ name = line
+
+ logger.debug(f"LLM extracted: name={name!r}, role={role!r}, org={org!r}")
+
+ if role or org:
+ return EntityQualifiers(role=role, org=org)
 
  except Exception as e:
  logger.exception(f"LLM extraction failed: {e}")

statement_extractor/plugins/splitters/t5_gemma.py
@@ -1,8 +1,8 @@
  """
- T5GemmaSplitter - Stage 1 plugin that wraps the existing StatementExtractor.
+ T5GemmaSplitter - Stage 1 plugin that splits text into atomic sentences.
 
- Uses T5-Gemma2 model with Diverse Beam Search to generate high-quality
- subject-predicate-object triples from text.
+ Uses T5-Gemma2 model with Diverse Beam Search to split unstructured text
+ into atomic statements that can be converted to triples in Stage 2.
  """
 
  import logging
@@ -12,7 +12,7 @@ from typing import Optional
  from ..base import BaseSplitterPlugin, PluginCapability
  from ...pipeline.context import PipelineContext
  from ...pipeline.registry import PluginRegistry
- from ...models import RawTriple
+ from ...models import SplitSentence
 
  logger = logging.getLogger(__name__)
 
@@ -20,10 +20,11 @@ logger = logging.getLogger(__name__)
  @PluginRegistry.splitter
  class T5GemmaSplitter(BaseSplitterPlugin):
  """
- Splitter plugin that uses T5-Gemma2 for triple extraction.
+ Splitter plugin that uses T5-Gemma2 to split text into atomic sentences.
 
- Wraps the existing StatementExtractor from extractor.py to produce
- RawTriple objects for the pipeline.
+ Uses the T5-Gemma2 model to identify and extract atomic statements
+ from unstructured text. Each sentence can be converted to a
+ subject-predicate-object triple in Stage 2.
  """
 
  def __init__(
@@ -65,7 +66,7 @@ class T5GemmaSplitter(BaseSplitterPlugin):
 
  @property
  def description(self) -> str:
- return "T5-Gemma2 model for extracting triples using Diverse Beam Search"
+ return "T5-Gemma2 model for splitting text into atomic sentences"
 
  @property
  def model_vram_gb(self) -> float:
@@ -94,16 +95,16 @@ class T5GemmaSplitter(BaseSplitterPlugin):
  self,
  text: str,
  context: PipelineContext,
- ) -> list[RawTriple]:
+ ) -> list[SplitSentence]:
  """
- Split text into raw triples using T5-Gemma2.
+ Split text into atomic sentences using T5-Gemma2.
 
  Args:
  text: Input text to split
  context: Pipeline context
 
  Returns:
- List of RawTriple objects
+ List of SplitSentence objects
  """
  logger.debug(f"T5GemmaSplitter processing {len(text)} chars")
 
@@ -129,19 +130,19 @@ class T5GemmaSplitter(BaseSplitterPlugin):
  extractor = self._get_extractor()
  xml_output = extractor.extract_as_xml(text, options)
 
- # Parse XML to RawTriple objects
- raw_triples = self._parse_xml_to_raw_triples(xml_output)
+ # Parse XML to SplitSentence objects
+ sentences = self._parse_xml_to_sentences(xml_output)
 
- logger.info(f"T5GemmaSplitter produced {len(raw_triples)} raw triples")
- return raw_triples
+ logger.info(f"T5GemmaSplitter produced {len(sentences)} sentences")
+ return sentences
 
  def split_batch(
  self,
  texts: list[str],
  context: PipelineContext,
- ) -> list[list[RawTriple]]:
+ ) -> list[list[SplitSentence]]:
  """
- Split multiple texts into atomic triples using batch processing.
+ Split multiple texts into atomic sentences using batch processing.
 
  Processes all texts through the T5-Gemma2 model in batches
  sized for optimal GPU utilization.
@@ -151,7 +152,7 @@ class T5GemmaSplitter(BaseSplitterPlugin):
  context: Pipeline context
 
  Returns:
- List of RawTriple lists, one per input text
+ List of SplitSentence lists, one per input text
  """
  if not texts:
  return []
@@ -177,7 +178,7 @@ class T5GemmaSplitter(BaseSplitterPlugin):
  )
 
  extractor = self._get_extractor()
- all_results: list[list[RawTriple]] = []
+ all_results: list[list[SplitSentence]] = []
 
  # Process in batches
  for i in range(0, len(texts), batch_size):
@@ -187,8 +188,8 @@ class T5GemmaSplitter(BaseSplitterPlugin):
  batch_results = self._process_batch(batch_texts, extractor, options)
  all_results.extend(batch_results)
 
- total_triples = sum(len(r) for r in all_results)
- logger.info(f"T5GemmaSplitter batch produced {total_triples} total triples from {len(texts)} texts")
+ total_sentences = sum(len(r) for r in all_results)
+ logger.info(f"T5GemmaSplitter batch produced {total_sentences} total sentences from {len(texts)} texts")
  return all_results
 
  def _process_batch(
  def _process_batch(
@@ -196,7 +197,7 @@ class T5GemmaSplitter(BaseSplitterPlugin):
196
197
  texts: list[str],
197
198
  extractor,
198
199
  options,
199
- ) -> list[list[RawTriple]]:
200
+ ) -> list[list[SplitSentence]]:
200
201
  """
201
202
  Process a batch of texts through the model.
202
203
 
@@ -249,7 +250,7 @@ class T5GemmaSplitter(BaseSplitterPlugin):
  )
 
  # Decode and parse each output
- results: list[list[RawTriple]] = []
+ results: list[list[SplitSentence]] = []
  end_tag = "</statements>"
 
  for output in outputs:
@@ -260,33 +261,28 @@
  end_pos = decoded.find(end_tag) + len(end_tag)
  decoded = decoded[:end_pos]
 
- triples = self._parse_xml_to_raw_triples(decoded)
- results.append(triples)
+ sentences = self._parse_xml_to_sentences(decoded)
+ results.append(sentences)
 
  return results
 
  # Regex pattern to extract <text> content from <stmt> blocks
  _STMT_TEXT_PATTERN = re.compile(r'<stmt>.*?<text>(.*?)</text>.*?</stmt>', re.DOTALL)
 
- def _parse_xml_to_raw_triples(self, xml_output: str) -> list[RawTriple]:
- """Extract source sentences from <stmt><text>...</text></stmt> blocks."""
- raw_triples = []
+ def _parse_xml_to_sentences(self, xml_output: str) -> list[SplitSentence]:
+ """Extract atomic sentences from <stmt><text>...</text></stmt> blocks."""
+ sentences = []
 
  # Find all <text> content within <stmt> blocks
  text_matches = self._STMT_TEXT_PATTERN.findall(xml_output)
  logger.debug(f"Found {len(text_matches)} stmt text blocks via regex")
 
- for source_text in text_matches:
- source_text = source_text.strip()
- if source_text:
- raw_triples.append(RawTriple(
- subject_text="",
- predicate_text="",
- object_text="",
- source_sentence=source_text,
- ))
-
- return raw_triples
+ for sentence_text in text_matches:
+ sentence_text = sentence_text.strip()
+ if sentence_text:
+ sentences.append(SplitSentence(text=sentence_text))
+
+ return sentences
 
 
  # Allow importing without decorator for testing
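
Note: _STMT_TEXT_PATTERN is unchanged by this release; only the objects built from its matches changed. For reference, its behavior on a typical <statements> payload (illustrative XML, not captured model output):

    import re

    _STMT_TEXT_PATTERN = re.compile(r'<stmt>.*?<text>(.*?)</text>.*?</stmt>', re.DOTALL)

    xml = (
        "<statements>"
        "<stmt><text>Amazon acquired Whole Foods in 2017.</text></stmt>"
        "<stmt><text>Andy Jassy is the CEO of Amazon.</text></stmt>"
        "</statements>"
    )
    print(_STMT_TEXT_PATTERN.findall(xml))
    # ['Amazon acquired Whole Foods in 2017.', 'Andy Jassy is the CEO of Amazon.']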