PyPI - corp-extractor - Versions diffs - 0.5.0__py3-none-any.whl → 0.9.3__py3-none-any.whl - Mend

corp-extractor 0.5.0py3-none-any.whl → 0.9.3py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (67) hide show

{corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/METADATA +228 -30
corp_extractor-0.9.3.dist-info/RECORD +79 -0
statement_extractor/__init__.py +1 -1
statement_extractor/cli.py +2030 -24
statement_extractor/data/statement_taxonomy.json +6949 -1159
statement_extractor/database/__init__.py +52 -0
statement_extractor/database/embeddings.py +186 -0
statement_extractor/database/hub.py +428 -0
statement_extractor/database/importers/__init__.py +32 -0
statement_extractor/database/importers/companies_house.py +559 -0
statement_extractor/database/importers/companies_house_officers.py +431 -0
statement_extractor/database/importers/gleif.py +561 -0
statement_extractor/database/importers/sec_edgar.py +392 -0
statement_extractor/database/importers/sec_form4.py +512 -0
statement_extractor/database/importers/wikidata.py +1120 -0
statement_extractor/database/importers/wikidata_dump.py +1951 -0
statement_extractor/database/importers/wikidata_people.py +1130 -0
statement_extractor/database/models.py +254 -0
statement_extractor/database/resolver.py +245 -0
statement_extractor/database/store.py +3034 -0
statement_extractor/document/__init__.py +62 -0
statement_extractor/document/chunker.py +410 -0
statement_extractor/document/context.py +171 -0
statement_extractor/document/deduplicator.py +171 -0
statement_extractor/document/html_extractor.py +246 -0
statement_extractor/document/loader.py +303 -0
statement_extractor/document/pipeline.py +388 -0
statement_extractor/document/summarizer.py +195 -0
statement_extractor/extractor.py +1 -1
statement_extractor/models/__init__.py +19 -3
statement_extractor/models/canonical.py +44 -1
statement_extractor/models/document.py +308 -0
statement_extractor/models/labels.py +47 -18
statement_extractor/models/qualifiers.py +51 -3
statement_extractor/models/statement.py +39 -15
statement_extractor/models.py +1 -1
statement_extractor/pipeline/config.py +6 -11
statement_extractor/pipeline/context.py +5 -5
statement_extractor/pipeline/orchestrator.py +90 -121
statement_extractor/pipeline/registry.py +52 -46
statement_extractor/plugins/__init__.py +20 -8
statement_extractor/plugins/base.py +348 -78
statement_extractor/plugins/extractors/gliner2.py +38 -28
statement_extractor/plugins/labelers/taxonomy.py +18 -5
statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
statement_extractor/plugins/pdf/__init__.py +10 -0
statement_extractor/plugins/pdf/pypdf.py +291 -0
statement_extractor/plugins/qualifiers/__init__.py +11 -0
statement_extractor/plugins/qualifiers/companies_house.py +14 -3
statement_extractor/plugins/qualifiers/embedding_company.py +422 -0
statement_extractor/plugins/qualifiers/gleif.py +14 -3
statement_extractor/plugins/qualifiers/person.py +588 -14
statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
statement_extractor/plugins/scrapers/__init__.py +10 -0
statement_extractor/plugins/scrapers/http.py +236 -0
statement_extractor/plugins/splitters/t5_gemma.py +176 -75
statement_extractor/plugins/taxonomy/embedding.py +193 -46
statement_extractor/plugins/taxonomy/mnli.py +16 -4
statement_extractor/scoring.py +8 -8
corp_extractor-0.5.0.dist-info/RECORD +0 -55
statement_extractor/plugins/canonicalizers/__init__.py +0 -17
statement_extractor/plugins/canonicalizers/base.py +0 -9
statement_extractor/plugins/canonicalizers/location.py +0 -219
statement_extractor/plugins/canonicalizers/organization.py +0 -230
statement_extractor/plugins/canonicalizers/person.py +0 -242
{corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/WHEEL +0 -0
{corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/entry_points.txt +0 -0

statement_extractor/plugins/qualifiers/person.py CHANGED Viewed

@@ -1,9 +1,12 @@
 """
-PersonQualifierPlugin - Qualifies PERSON entities with role and organization.
+PersonQualifierPlugin - Qualifies PERSON entities with role, organization, and canonical ID.
 Uses Gemma3 12B (instruction-tuned) to extract:
 - role: Job title/position (e.g., "CEO", "President")
 - org: Organization/employer (e.g., "Apple Inc", "Microsoft")
+Then searches the person database to find canonical matches for notable people
+(those in Wikipedia/Wikidata), using extracted role/org to help disambiguate.
 """
 import json
@@ -14,19 +17,51 @@ from typing import Optional
 from ..base import BaseQualifierPlugin, PluginCapability
 from ...pipeline.context import PipelineContext
 from ...pipeline.registry import PluginRegistry
-from ...models import ExtractedEntity, EntityQualifiers, EntityType
+from ...models import (
+    ExtractedEntity,
+    EntityQualifiers,
+    EntityType,
+    QualifiedEntity,
+    CanonicalEntity,
+    CanonicalMatch,
+    ResolvedRole,
+    ResolvedOrganization,
+)
 from ...llm import LLM
 logger = logging.getLogger(__name__)
+# LLM prompt template for person matching confirmation
+PERSON_MATCH_PROMPT = """You are matching a person name extracted from text to a database of notable people.
+Extracted name: "{query_name}"
+Context from text: {context_info}
+Source text: "{source_preview}"
+Candidates from database (with Wikipedia info):
+{candidates}
+Task: Select the BEST match, or respond "NONE" if no candidate is a good match.
+Rules:
+- The match should refer to the same person
+- Consider whether the role and organization from the text match the Wikipedia info
+- Different people with similar names should NOT match
+- If the extracted name is too generic or ambiguous, respond "NONE"
+Respond with ONLY the number of the best match (1, 2, 3, etc.) or "NONE".
+"""
 @PluginRegistry.qualifier
 class PersonQualifierPlugin(BaseQualifierPlugin):
     """
     Qualifier plugin for PERSON entities.
     Uses Gemma3 12B to extract role and organization from context.
-    Falls back to pattern matching if model is not available.
+    Then searches the person database to find canonical matches for notable people.
+    Falls back to pattern matching if LLM is not available.
     """
     # Common role patterns for fallback
@@ -45,6 +80,11 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
         gguf_file: Optional[str] = None,
         use_llm: bool = True,
         use_4bit: bool = True,
+        use_database: bool = True,
+        db_path: Optional[str] = None,
+        top_k: int = 10,
+        min_similarity: float = 0.5,
+        auto_download_db: bool = True,
     ):
         """
         Initialize the person qualifier.
@@ -54,8 +94,19 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
             gguf_file: GGUF filename for quantized models (auto-detected if model_id ends with -gguf)
             use_llm: Whether to use LLM
             use_4bit: Use 4-bit quantization (requires bitsandbytes, ignored for GGUF)
+            use_database: Whether to use person database for canonical matching
+            db_path: Path to database (auto-detects if None)
+            top_k: Number of candidates to retrieve from database
+            min_similarity: Minimum similarity threshold for database matches
+            auto_download_db: Whether to auto-download database from HuggingFace
         """
         self._use_llm = use_llm
+        self._use_database = use_database
+        self._db_path = db_path
+        self._top_k = top_k
+        self._min_similarity = min_similarity
+        self._auto_download_db = auto_download_db
         self._llm: Optional[LLM] = None
         if use_llm:
             self._llm = LLM(
@@ -64,6 +115,11 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
                 use_4bit=use_4bit,
             )
+        # Lazy-loaded components
+        self._database = None
+        self._embedder = None
+        self._cache: dict[str, Optional[CanonicalEntity]] = {}
     @property
     def name(self) -> str:
         return "person_qualifier"
@@ -74,14 +130,14 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
     @property
     def capabilities(self) -> PluginCapability:
-        caps = PluginCapability.NONE
+        caps = PluginCapability.CACHING
         if self._use_llm:
             caps |= PluginCapability.LLM_REQUIRED
         return caps
     @property
     def description(self) -> str:
-        return "Extracts role and organization for PERSON entities using Gemma3"
+        return "Extracts role and organization for PERSON entities, with optional database lookup for notable people"
     @property
     def supported_entity_types(self) -> set[EntityType]:
@@ -89,38 +145,447 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
     @property
     def provided_identifier_types(self) -> list[str]:
-        return []  # Provides qualifiers, not identifiers
+        return ["wikidata_id"]
+    def _get_database(self):
+        """Get or initialize the person database."""
+        if self._database is not None:
+            return self._database
+        if not self._use_database:
+            return None
+        try:
+            from ...database.store import get_person_database
+            from ...database.hub import get_database_path
+            # Find database path
+            db_path = self._db_path
+            if db_path is None:
+                db_path = get_database_path(auto_download=self._auto_download_db)
+            if db_path is None:
+                logger.warning("Person database not available. Skipping database qualification.")
+                return None
+            # Use singleton to ensure database is only loaded once
+            self._database = get_person_database(db_path=db_path)
+            logger.info(f"Loaded person database from {db_path}")
+            return self._database
+        except Exception as e:
+            logger.warning(f"Failed to load person database: {e}")
+            return None
+    def _get_embedder(self):
+        """Get or initialize the embedder."""
+        if self._embedder is not None:
+            return self._embedder
+        try:
+            from ...database import CompanyEmbedder
+            self._embedder = CompanyEmbedder()
+            return self._embedder
+        except Exception as e:
+            logger.warning(f"Failed to load embedder: {e}")
+            return None
+    def _get_org_resolver(self):
+        """Get or initialize the organization resolver."""
+        if not hasattr(self, '_org_resolver'):
+            self._org_resolver = None
+        if self._org_resolver is not None:
+            return self._org_resolver
+        try:
+            from ...database.resolver import get_organization_resolver
+            self._org_resolver = get_organization_resolver(
+                db_path=self._db_path,
+                auto_download_db=self._auto_download_db,
+            )
+            return self._org_resolver
+        except Exception as e:
+            logger.warning(f"Failed to initialize organization resolver: {e}")
+            return None
+    def _resolve_organization(self, org_name: str) -> Optional[ResolvedOrganization]:
+        """
+        Resolve an organization name against the organization database.
+        Uses the shared OrganizationResolver utility.
+        Args:
+            org_name: Organization name to resolve
+        Returns:
+            ResolvedOrganization if found, None otherwise
+        """
+        resolver = self._get_org_resolver()
+        if resolver is None:
+            return None
+        return resolver.resolve(org_name)
     def qualify(
         self,
         entity: ExtractedEntity,
         context: PipelineContext,
-    ) -> Optional[EntityQualifiers]:
+    ) -> Optional[CanonicalEntity]:
         """
-        Qualify a PERSON entity with role and organization.
+        Qualify a PERSON entity with role, organization, and optionally canonical ID.
         Args:
             entity: The PERSON entity to qualify
             context: Pipeline context for accessing source text
         Returns:
-            EntityQualifiers with role and org, or None if nothing found
+            CanonicalEntity with role/org qualifiers and FQN, or None if nothing found
         """
         if entity.type != EntityType.PERSON:
             return None
+        # Check cache
+        cache_key = entity.text.lower().strip()
+        if cache_key in self._cache:
+            return self._cache[cache_key]
         # Use the full source text for LLM qualification
-        # This provides maximum context for understanding the person's role/org
         full_text = context.source_text
-        # Try LLM extraction first with full text
+        # Step 1: Extract role and org using LLM or patterns
+        qualifiers: Optional[EntityQualifiers] = None
         if self._llm is not None:
             result = self._extract_with_llm(entity.text, full_text)
             if result and (result.role or result.org):
-                return result
+                qualifiers = result
+        # Fallback to pattern matching
+        if qualifiers is None:
+            qualifiers = self._extract_with_patterns(entity.text, full_text)
+        # Step 2: Search database for canonical match (if database is available)
+        canonical_match = None
+        if self._use_database:
+            canonical_match = self._search_database(
+                entity.text,
+                qualifiers.role if qualifiers else None,
+                qualifiers.org if qualifiers else None,
+                context,
+            )
-        # Fallback to pattern matching with full text
-        return self._extract_with_patterns(entity.text, full_text)
+        # If no qualifiers found and no database match, return None
+        if qualifiers is None and canonical_match is None:
+            self._cache[cache_key] = None
+            return None
+        # Step 3: Build CanonicalEntity
+        result = self._build_canonical_entity(entity, qualifiers, canonical_match)
+        self._cache[cache_key] = result
+        return result
+    def _search_database(
+        self,
+        person_name: str,
+        extracted_role: Optional[str],
+        extracted_org: Optional[str],
+        context: PipelineContext,
+    ) -> Optional[CanonicalMatch]:
+        """
+        Search the person database for a canonical match.
+        Uses embedding similarity + role/org matching for disambiguation.
+        Args:
+            person_name: Name of the person
+            extracted_role: Role extracted from text (e.g., "CEO")
+            extracted_org: Organization extracted from text (e.g., "Apple Inc")
+            context: Pipeline context
+        Returns:
+            CanonicalMatch if a confident match is found, None otherwise
+        """
+        database = self._get_database()
+        if database is None:
+            return None
+        embedder = self._get_embedder()
+        if embedder is None:
+            return None
+        # Embed the person name
+        logger.debug(f"    Embedding person name: '{person_name}'")
+        query_embedding = embedder.embed(person_name)
+        # Search database with text pre-filtering
+        logger.debug(f"    Searching person database...")
+        results = database.search(
+            query_embedding,
+            top_k=self._top_k,
+            query_text=person_name,
+        )
+        # Filter by minimum similarity
+        results = [(r, s) for r, s in results if s >= self._min_similarity]
+        if not results:
+            logger.debug(f"    No person matches found above threshold {self._min_similarity}")
+            return None
+        # Boost scores based on role/org matching
+        scored_results = []
+        for record, similarity in results:
+            boosted_score = self._compute_match_score(
+                record, similarity, extracted_role, extracted_org
+            )
+            scored_results.append((record, similarity, boosted_score))
+        # Sort by boosted score
+        scored_results.sort(key=lambda x: x[2], reverse=True)
+        # Log top candidates
+        logger.info(f"    Found {len(scored_results)} candidates for '{person_name}':")
+        for i, (record, sim, boosted) in enumerate(scored_results[:5], 1):
+            role_str = f" ({record.known_for_role})" if record.known_for_role else ""
+            org_str = f" at {record.known_for_org}" if record.known_for_org else ""
+            logger.info(f"      {i}. {record.name}{role_str}{org_str} (sim={sim:.3f}, boosted={boosted:.3f})")
+        # Select best match using LLM if available
+        logger.info(f"    Selecting best match (LLM={self._llm is not None})...")
+        best_match = self._select_best_match(person_name, scored_results, extracted_role, extracted_org, context)
+        if best_match is None:
+            logger.info(f"    No confident match for '{person_name}'")
+            return None
+        record, similarity, boosted = best_match
+        logger.info(f"    Matched: '{record.name}' (wikidata:{record.source_id}, similarity={similarity:.3f})")
+        # Build canonical match
+        return CanonicalMatch(
+            canonical_id=f"wikidata:{record.source_id}",
+            canonical_name=record.name,
+            match_method="embedding",
+            match_confidence=min(max(boosted, 0.0), 1.0),
+            match_details={
+                "source": "wikidata",
+                "source_id": record.source_id,
+                "similarity": similarity,
+                "known_for_role": record.known_for_role,
+                "known_for_org": record.known_for_org,
+                "birth_date": record.birth_date,
+                "death_date": record.death_date,
+                "is_historic": record.is_historic,
+            },
+        )
+    def _compute_match_score(
+        self,
+        record,
+        embedding_similarity: float,
+        extracted_role: Optional[str],
+        extracted_org: Optional[str],
+    ) -> float:
+        """
+        Compute boosted match score using role/org context.
+        Boosts similarity score if extracted role/org matches database record.
+        """
+        score = embedding_similarity
+        # Boost if role matches (fuzzy)
+        if extracted_role and record.known_for_role:
+            if self._role_matches(extracted_role, record.known_for_role):
+                score += 0.1  # +10% boost
+                logger.debug(f"      Role match boost: {extracted_role} ~ {record.known_for_role}")
+        # Boost if org matches (fuzzy)
+        if extracted_org and record.known_for_org:
+            if self._org_matches(extracted_org, record.known_for_org):
+                score += 0.15  # +15% boost (org is stronger signal)
+                logger.debug(f"      Org match boost: {extracted_org} ~ {record.known_for_org}")
+        return min(score, 1.0)  # Cap at 1.0
+    def _role_matches(self, extracted: str, known: str) -> bool:
+        """Fuzzy role matching."""
+        extracted_lower = extracted.lower().strip()
+        known_lower = known.lower().strip()
+        # Exact match
+        if extracted_lower == known_lower:
+            return True
+        # CEO variants
+        ceo_variants = {"ceo", "chief executive", "chief executive officer"}
+        if extracted_lower in ceo_variants and known_lower in ceo_variants:
+            return True
+        # CFO variants
+        cfo_variants = {"cfo", "chief financial officer"}
+        if extracted_lower in cfo_variants and known_lower in cfo_variants:
+            return True
+        # President variants
+        president_variants = {"president", "chairman", "chairman and ceo"}
+        if extracted_lower in president_variants and known_lower in president_variants:
+            return True
+        # Founder variants
+        founder_variants = {"founder", "co-founder", "cofounder", "founding member"}
+        if extracted_lower in founder_variants and known_lower in founder_variants:
+            return True
+        # Contains check for partial matches
+        if extracted_lower in known_lower or known_lower in extracted_lower:
+            return True
+        return False
+    def _org_matches(self, extracted: str, known: str) -> bool:
+        """Fuzzy org matching using simple normalization."""
+        # Normalize both
+        extracted_norm = self._normalize_org_name(extracted)
+        known_norm = self._normalize_org_name(known)
+        # Exact normalized match
+        if extracted_norm == known_norm:
+            return True
+        # Check if one contains the other (e.g., "Apple" in "Apple Inc")
+        if extracted_norm in known_norm or known_norm in extracted_norm:
+            return True
+        return False
+    def _normalize_org_name(self, name: str) -> str:
+        """Simple org name normalization."""
+        # Lowercase
+        normalized = name.lower().strip()
+        # Remove common suffixes
+        suffixes = [
+            " inc.", " inc", " corp.", " corp", " corporation",
+            " ltd.", " ltd", " limited", " llc", " plc",
+            " co.", " co", " company",
+        ]
+        for suffix in suffixes:
+            if normalized.endswith(suffix):
+                normalized = normalized[:-len(suffix)]
+        return normalized.strip()
+    def _select_best_match(
+        self,
+        query_name: str,
+        candidates: list[tuple],
+        extracted_role: Optional[str],
+        extracted_org: Optional[str],
+        context: PipelineContext,
+    ) -> Optional[tuple]:
+        """
+        Select the best match from candidates.
+        Uses LLM if available, otherwise returns top match if confidence is high enough.
+        """
+        if not candidates:
+            return None
+        # If only one strong match, use it directly
+        if len(candidates) == 1 and candidates[0][2] >= 0.9:
+            logger.info(f"    Single strong match: '{candidates[0][0].name}' (boosted={candidates[0][2]:.3f})")
+            return candidates[0]
+        # Try LLM confirmation
+        if self._llm is not None:
+            try:
+                return self._llm_select_match(query_name, candidates, extracted_role, extracted_org, context)
+            except Exception as e:
+                logger.warning(f"    LLM confirmation failed: {e}")
+        # Fallback: use top match if boosted score is high enough
+        top_record, top_similarity, top_boosted = candidates[0]
+        if top_boosted >= 0.85:
+            logger.info(f"    No LLM, using top match: '{top_record.name}' (boosted={top_boosted:.3f})")
+            return candidates[0]
+        logger.info(f"    No confident match (top boosted={top_boosted:.3f} < 0.85)")
+        return None
+    def _llm_select_match(
+        self,
+        query_name: str,
+        candidates: list[tuple],
+        extracted_role: Optional[str],
+        extracted_org: Optional[str],
+        context: PipelineContext,
+    ) -> Optional[tuple]:
+        """Use LLM to select the best match."""
+        # Format candidates for prompt
+        candidate_lines = []
+        for i, (record, similarity, boosted) in enumerate(candidates[:10], 1):
+            role_str = f", {record.known_for_role}" if record.known_for_role else ""
+            org_str = f" at {record.known_for_org}" if record.known_for_org else ""
+            country_str = f", {record.country}" if record.country else ""
+            # Include life dates for context (helps identify historic figures)
+            dates_parts = []
+            if record.birth_date:
+                dates_parts.append(f"b. {record.birth_date[:4]}")  # Just year
+            if record.death_date:
+                dates_parts.append(f"d. {record.death_date[:4]}")  # Just year
+            dates_str = f" [{' - '.join(dates_parts)}]" if dates_parts else ""
+            candidate_lines.append(
+                f"{i}. {record.name}{role_str}{org_str}{country_str}{dates_str} (score: {boosted:.2f})"
+            )
+        # Build context info from extracted role/org
+        context_parts = []
+        if extracted_role:
+            context_parts.append(f"role={extracted_role}")
+        if extracted_org:
+            context_parts.append(f"org={extracted_org}")
+        context_info = ", ".join(context_parts) if context_parts else "no role/org extracted"
+        # Source text preview
+        source_preview = ""
+        if context.source_text:
+            source_preview = context.source_text[:300] + "..." if len(context.source_text) > 300 else context.source_text
+        prompt = PERSON_MATCH_PROMPT.format(
+            query_name=query_name,
+            context_info=context_info,
+            source_preview=source_preview,
+            candidates="\n".join(candidate_lines),
+        )
+        # Get LLM response
+        assert self._llm is not None
+        response = self._llm.generate(prompt, max_tokens=10, stop=["\n"])
+        response = response.strip()
+        logger.info(f"    LLM response for '{query_name}': {response}")
+        # Parse response
+        if response.upper() == "NONE":
+            logger.info(f"    LLM chose: NONE (no match)")
+            return None
+        try:
+            idx = int(response) - 1
+            if 0 <= idx < len(candidates):
+                chosen = candidates[idx]
+                logger.info(f"    LLM chose: #{idx + 1} '{chosen[0].name}' (boosted={chosen[2]:.3f})")
+                return chosen
+        except ValueError:
+            logger.warning(f"    LLM response '{response}' could not be parsed as number")
+        # Fallback to top match if LLM response is unclear and score is decent
+        if candidates[0][2] >= 0.8:
+            logger.info(f"    Fallback to top match: '{candidates[0][0].name}' (boosted={candidates[0][2]:.3f})")
+            return candidates[0]
+        logger.info(f"    No confident match (top boosted={candidates[0][2]:.3f} < 0.8)")
+        return None
     def _extract_with_llm(
         self,
@@ -216,6 +681,115 @@ Should return:
         return None
+    def _build_canonical_entity(
+        self,
+        entity: ExtractedEntity,
+        qualifiers: Optional[EntityQualifiers],
+        canonical_match: Optional[CanonicalMatch],
+    ) -> CanonicalEntity:
+        """Build CanonicalEntity from qualifiers and optional canonical match."""
+        # Ensure qualifiers is not None
+        if qualifiers is None:
+            qualifiers = EntityQualifiers()
+        # If we have a canonical match, add wikidata ID to identifiers
+        identifiers: dict[str, str] = dict(qualifiers.identifiers) if qualifiers.identifiers else {}
+        resolved_role: Optional[ResolvedRole] = None
+        resolved_org: Optional[ResolvedOrganization] = None
+        if canonical_match:
+            match_details = canonical_match.match_details or {}
+            source_id = str(match_details.get("source_id", ""))
+            if source_id:
+                identifiers["wikidata_id"] = source_id
+            if canonical_match.canonical_id:
+                identifiers["canonical_id"] = canonical_match.canonical_id
+            # Extract role and org from database match
+            known_role = str(match_details.get("known_for_role", "") or "")
+            known_org = str(match_details.get("known_for_org", "") or "")
+            # Create ResolvedRole from database match
+            if known_role:
+                resolved_role = ResolvedRole(
+                    canonical_name=known_role,
+                    canonical_id=None,  # Role ID would need separate lookup
+                    source="wikidata",
+                    source_id=source_id if source_id else None,
+                )
+            # Update qualifiers with info from database if not already set
+            if not qualifiers.role and known_role:
+                final_role = known_role
+                final_org = qualifiers.org or known_org or None
+            else:
+                final_role = qualifiers.role
+                final_org = qualifiers.org
+        else:
+            final_role = qualifiers.role
+            final_org = qualifiers.org
+        # Resolve organization against the organization database
+        org_to_resolve = final_org
+        if org_to_resolve:
+            logger.debug(f"    Resolving organization: '{org_to_resolve}'")
+            resolved_org = self._resolve_organization(org_to_resolve)
+            if resolved_org:
+                logger.info(f"    Resolved org: '{org_to_resolve}' -> '{resolved_org.canonical_name}' ({resolved_org.canonical_id})")
+        # Build the final qualifiers with resolved info
+        qualifiers = EntityQualifiers(
+            role=final_role,
+            org=final_org,
+            identifiers=identifiers,
+            resolved_role=resolved_role,
+            resolved_org=resolved_org,
+        )
+        # Create QualifiedEntity
+        qualified = QualifiedEntity(
+            entity_ref=entity.entity_ref,
+            original_text=entity.text,
+            entity_type=entity.type,
+            qualifiers=qualifiers,
+            qualification_sources=[self.name],
+        )
+        # Build FQN - prefer resolved names when available
+        if canonical_match and canonical_match.canonical_name:
+            # Use canonical person name from database
+            fqn_parts: list[str] = [canonical_match.canonical_name]
+            if qualifiers.role:
+                fqn_parts.append(f"({qualifiers.role})")
+            # Use resolved org name if available
+            if resolved_org:
+                fqn_parts.append(f"at {resolved_org.canonical_name}")
+            elif qualifiers.org:
+                fqn_parts.append(f"at {qualifiers.org}")
+            fqn = " ".join(fqn_parts)
+        else:
+            # Build FQN: "Person Name (Role, Org)" or "Person Name (Role)" or "Person Name (Org)"
+            fqn_parts_for_display: list[str] = []
+            if qualifiers.role:
+                fqn_parts_for_display.append(qualifiers.role)
+            # Use resolved org name if available
+            if resolved_org:
+                fqn_parts_for_display.append(resolved_org.canonical_name)
+            elif qualifiers.org:
+                fqn_parts_for_display.append(qualifiers.org)
+            if fqn_parts_for_display:
+                fqn = f"{entity.text} ({', '.join(fqn_parts_for_display)})"
+            else:
+                fqn = entity.text
+        return CanonicalEntity(
+            entity_ref=entity.entity_ref,
+            qualified_entity=qualified,
+            canonical_match=canonical_match,
+            fqn=fqn,
+        )
 # Allow importing without decorator for testing
 PersonQualifierPluginClass = PersonQualifierPlugin

corp-extractor 0.5.0__py3-none-any.whl → 0.9.3__py3-none-any.whl

corp-extractor 0.5.0py3-none-any.whl → 0.9.3py3-none-any.whl