corp-extractor 0.5.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +191 -24
  2. corp_extractor-0.9.0.dist-info/RECORD +76 -0
  3. statement_extractor/__init__.py +1 -1
  4. statement_extractor/cli.py +1227 -10
  5. statement_extractor/data/statement_taxonomy.json +6949 -1159
  6. statement_extractor/database/__init__.py +52 -0
  7. statement_extractor/database/embeddings.py +186 -0
  8. statement_extractor/database/hub.py +520 -0
  9. statement_extractor/database/importers/__init__.py +24 -0
  10. statement_extractor/database/importers/companies_house.py +545 -0
  11. statement_extractor/database/importers/gleif.py +538 -0
  12. statement_extractor/database/importers/sec_edgar.py +375 -0
  13. statement_extractor/database/importers/wikidata.py +1012 -0
  14. statement_extractor/database/importers/wikidata_people.py +632 -0
  15. statement_extractor/database/models.py +230 -0
  16. statement_extractor/database/resolver.py +245 -0
  17. statement_extractor/database/store.py +1609 -0
  18. statement_extractor/document/__init__.py +62 -0
  19. statement_extractor/document/chunker.py +410 -0
  20. statement_extractor/document/context.py +171 -0
  21. statement_extractor/document/deduplicator.py +173 -0
  22. statement_extractor/document/html_extractor.py +246 -0
  23. statement_extractor/document/loader.py +303 -0
  24. statement_extractor/document/pipeline.py +388 -0
  25. statement_extractor/document/summarizer.py +195 -0
  26. statement_extractor/models/__init__.py +16 -1
  27. statement_extractor/models/canonical.py +44 -1
  28. statement_extractor/models/document.py +308 -0
  29. statement_extractor/models/labels.py +47 -18
  30. statement_extractor/models/qualifiers.py +51 -3
  31. statement_extractor/models/statement.py +26 -0
  32. statement_extractor/pipeline/config.py +6 -11
  33. statement_extractor/pipeline/orchestrator.py +80 -111
  34. statement_extractor/pipeline/registry.py +52 -46
  35. statement_extractor/plugins/__init__.py +20 -8
  36. statement_extractor/plugins/base.py +334 -64
  37. statement_extractor/plugins/extractors/gliner2.py +10 -0
  38. statement_extractor/plugins/labelers/taxonomy.py +18 -5
  39. statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
  40. statement_extractor/plugins/pdf/__init__.py +10 -0
  41. statement_extractor/plugins/pdf/pypdf.py +291 -0
  42. statement_extractor/plugins/qualifiers/__init__.py +11 -0
  43. statement_extractor/plugins/qualifiers/companies_house.py +14 -3
  44. statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
  45. statement_extractor/plugins/qualifiers/gleif.py +14 -3
  46. statement_extractor/plugins/qualifiers/person.py +578 -14
  47. statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
  48. statement_extractor/plugins/scrapers/__init__.py +10 -0
  49. statement_extractor/plugins/scrapers/http.py +236 -0
  50. statement_extractor/plugins/splitters/t5_gemma.py +158 -53
  51. statement_extractor/plugins/taxonomy/embedding.py +193 -46
  52. statement_extractor/plugins/taxonomy/mnli.py +16 -4
  53. statement_extractor/scoring.py +8 -8
  54. corp_extractor-0.5.0.dist-info/RECORD +0 -55
  55. statement_extractor/plugins/canonicalizers/__init__.py +0 -17
  56. statement_extractor/plugins/canonicalizers/base.py +0 -9
  57. statement_extractor/plugins/canonicalizers/location.py +0 -219
  58. statement_extractor/plugins/canonicalizers/organization.py +0 -230
  59. statement_extractor/plugins/canonicalizers/person.py +0 -242
  60. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
  61. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
@@ -1,9 +1,12 @@
1
1
  """
2
- PersonQualifierPlugin - Qualifies PERSON entities with role and organization.
2
+ PersonQualifierPlugin - Qualifies PERSON entities with role, organization, and canonical ID.
3
3
 
4
4
  Uses Gemma3 12B (instruction-tuned) to extract:
5
5
  - role: Job title/position (e.g., "CEO", "President")
6
6
  - org: Organization/employer (e.g., "Apple Inc", "Microsoft")
7
+
8
+ Then searches the person database to find canonical matches for notable people
9
+ (those in Wikipedia/Wikidata), using extracted role/org to help disambiguate.
7
10
  """
8
11
 
9
12
  import json
@@ -14,19 +17,51 @@ from typing import Optional
14
17
  from ..base import BaseQualifierPlugin, PluginCapability
15
18
  from ...pipeline.context import PipelineContext
16
19
  from ...pipeline.registry import PluginRegistry
17
- from ...models import ExtractedEntity, EntityQualifiers, EntityType
20
+ from ...models import (
21
+ ExtractedEntity,
22
+ EntityQualifiers,
23
+ EntityType,
24
+ QualifiedEntity,
25
+ CanonicalEntity,
26
+ CanonicalMatch,
27
+ ResolvedRole,
28
+ ResolvedOrganization,
29
+ )
18
30
  from ...llm import LLM
19
31
 
20
32
  logger = logging.getLogger(__name__)
21
33
 
22
34
 
35
+ # LLM prompt template for person matching confirmation
36
+ PERSON_MATCH_PROMPT = """You are matching a person name extracted from text to a database of notable people.
37
+
38
+ Extracted name: "{query_name}"
39
+ Context from text: {context_info}
40
+ Source text: "{source_preview}"
41
+
42
+ Candidates from database (with Wikipedia info):
43
+ {candidates}
44
+
45
+ Task: Select the BEST match, or respond "NONE" if no candidate is a good match.
46
+
47
+ Rules:
48
+ - The match should refer to the same person
49
+ - Consider whether the role and organization from the text match the Wikipedia info
50
+ - Different people with similar names should NOT match
51
+ - If the extracted name is too generic or ambiguous, respond "NONE"
52
+
53
+ Respond with ONLY the number of the best match (1, 2, 3, etc.) or "NONE".
54
+ """
55
+
56
+
23
57
  @PluginRegistry.qualifier
24
58
  class PersonQualifierPlugin(BaseQualifierPlugin):
25
59
  """
26
60
  Qualifier plugin for PERSON entities.
27
61
 
28
62
  Uses Gemma3 12B to extract role and organization from context.
29
- Falls back to pattern matching if model is not available.
63
+ Then searches the person database to find canonical matches for notable people.
64
+ Falls back to pattern matching if LLM is not available.
30
65
  """
31
66
 
32
67
  # Common role patterns for fallback
@@ -45,6 +80,11 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
45
80
  gguf_file: Optional[str] = None,
46
81
  use_llm: bool = True,
47
82
  use_4bit: bool = True,
83
+ use_database: bool = True,
84
+ db_path: Optional[str] = None,
85
+ top_k: int = 10,
86
+ min_similarity: float = 0.5,
87
+ auto_download_db: bool = True,
48
88
  ):
49
89
  """
50
90
  Initialize the person qualifier.
@@ -54,8 +94,19 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
54
94
  gguf_file: GGUF filename for quantized models (auto-detected if model_id ends with -gguf)
55
95
  use_llm: Whether to use LLM
56
96
  use_4bit: Use 4-bit quantization (requires bitsandbytes, ignored for GGUF)
97
+ use_database: Whether to use person database for canonical matching
98
+ db_path: Path to database (auto-detects if None)
99
+ top_k: Number of candidates to retrieve from database
100
+ min_similarity: Minimum similarity threshold for database matches
101
+ auto_download_db: Whether to auto-download database from HuggingFace
57
102
  """
58
103
  self._use_llm = use_llm
104
+ self._use_database = use_database
105
+ self._db_path = db_path
106
+ self._top_k = top_k
107
+ self._min_similarity = min_similarity
108
+ self._auto_download_db = auto_download_db
109
+
59
110
  self._llm: Optional[LLM] = None
60
111
  if use_llm:
61
112
  self._llm = LLM(
@@ -64,6 +115,11 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
64
115
  use_4bit=use_4bit,
65
116
  )
66
117
 
118
+ # Lazy-loaded components
119
+ self._database = None
120
+ self._embedder = None
121
+ self._cache: dict[str, Optional[CanonicalEntity]] = {}
122
+
67
123
  @property
68
124
  def name(self) -> str:
69
125
  return "person_qualifier"
@@ -74,14 +130,14 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
74
130
 
75
131
  @property
76
132
  def capabilities(self) -> PluginCapability:
77
- caps = PluginCapability.NONE
133
+ caps = PluginCapability.CACHING
78
134
  if self._use_llm:
79
135
  caps |= PluginCapability.LLM_REQUIRED
80
136
  return caps
81
137
 
82
138
  @property
83
139
  def description(self) -> str:
84
- return "Extracts role and organization for PERSON entities using Gemma3"
140
+ return "Extracts role and organization for PERSON entities, with optional database lookup for notable people"
85
141
 
86
142
  @property
87
143
  def supported_entity_types(self) -> set[EntityType]:
@@ -89,38 +145,437 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
89
145
 
90
146
  @property
91
147
  def provided_identifier_types(self) -> list[str]:
92
- return [] # Provides qualifiers, not identifiers
148
+ return ["wikidata_id"]
149
+
150
+ def _get_database(self):
151
+ """Get or initialize the person database."""
152
+ if self._database is not None:
153
+ return self._database
154
+
155
+ if not self._use_database:
156
+ return None
157
+
158
+ try:
159
+ from ...database.store import get_person_database
160
+ from ...database.hub import get_database_path
161
+
162
+ # Find database path
163
+ db_path = self._db_path
164
+ if db_path is None:
165
+ db_path = get_database_path(auto_download=self._auto_download_db)
166
+
167
+ if db_path is None:
168
+ logger.warning("Person database not available. Skipping database qualification.")
169
+ return None
170
+
171
+ # Use singleton to ensure database is only loaded once
172
+ self._database = get_person_database(db_path=db_path)
173
+ logger.info(f"Loaded person database from {db_path}")
174
+ return self._database
175
+
176
+ except Exception as e:
177
+ logger.warning(f"Failed to load person database: {e}")
178
+ return None
179
+
180
+ def _get_embedder(self):
181
+ """Get or initialize the embedder."""
182
+ if self._embedder is not None:
183
+ return self._embedder
184
+
185
+ try:
186
+ from ...database import CompanyEmbedder
187
+ self._embedder = CompanyEmbedder()
188
+ return self._embedder
189
+ except Exception as e:
190
+ logger.warning(f"Failed to load embedder: {e}")
191
+ return None
192
+
193
+ def _get_org_resolver(self):
194
+ """Get or initialize the organization resolver."""
195
+ if not hasattr(self, '_org_resolver'):
196
+ self._org_resolver = None
197
+
198
+ if self._org_resolver is not None:
199
+ return self._org_resolver
200
+
201
+ try:
202
+ from ...database.resolver import get_organization_resolver
203
+ self._org_resolver = get_organization_resolver(
204
+ db_path=self._db_path,
205
+ auto_download_db=self._auto_download_db,
206
+ )
207
+ return self._org_resolver
208
+ except Exception as e:
209
+ logger.warning(f"Failed to initialize organization resolver: {e}")
210
+ return None
211
+
212
+ def _resolve_organization(self, org_name: str) -> Optional[ResolvedOrganization]:
213
+ """
214
+ Resolve an organization name against the organization database.
215
+
216
+ Uses the shared OrganizationResolver utility.
217
+
218
+ Args:
219
+ org_name: Organization name to resolve
220
+
221
+ Returns:
222
+ ResolvedOrganization if found, None otherwise
223
+ """
224
+ resolver = self._get_org_resolver()
225
+ if resolver is None:
226
+ return None
227
+
228
+ return resolver.resolve(org_name)
93
229
 
94
230
  def qualify(
95
231
  self,
96
232
  entity: ExtractedEntity,
97
233
  context: PipelineContext,
98
- ) -> Optional[EntityQualifiers]:
234
+ ) -> Optional[CanonicalEntity]:
99
235
  """
100
- Qualify a PERSON entity with role and organization.
236
+ Qualify a PERSON entity with role, organization, and optionally canonical ID.
101
237
 
102
238
  Args:
103
239
  entity: The PERSON entity to qualify
104
240
  context: Pipeline context for accessing source text
105
241
 
106
242
  Returns:
107
- EntityQualifiers with role and org, or None if nothing found
243
+ CanonicalEntity with role/org qualifiers and FQN, or None if nothing found
108
244
  """
109
245
  if entity.type != EntityType.PERSON:
110
246
  return None
111
247
 
248
+ # Check cache
249
+ cache_key = entity.text.lower().strip()
250
+ if cache_key in self._cache:
251
+ return self._cache[cache_key]
252
+
112
253
  # Use the full source text for LLM qualification
113
- # This provides maximum context for understanding the person's role/org
114
254
  full_text = context.source_text
115
255
 
116
- # Try LLM extraction first with full text
256
+ # Step 1: Extract role and org using LLM or patterns
257
+ qualifiers: Optional[EntityQualifiers] = None
117
258
  if self._llm is not None:
118
259
  result = self._extract_with_llm(entity.text, full_text)
119
260
  if result and (result.role or result.org):
120
- return result
261
+ qualifiers = result
262
+
263
+ # Fallback to pattern matching
264
+ if qualifiers is None:
265
+ qualifiers = self._extract_with_patterns(entity.text, full_text)
266
+
267
+ # Step 2: Search database for canonical match (if database is available)
268
+ canonical_match = None
269
+ if self._use_database:
270
+ canonical_match = self._search_database(
271
+ entity.text,
272
+ qualifiers.role if qualifiers else None,
273
+ qualifiers.org if qualifiers else None,
274
+ context,
275
+ )
121
276
 
122
- # Fallback to pattern matching with full text
123
- return self._extract_with_patterns(entity.text, full_text)
277
+ # If no qualifiers found and no database match, return None
278
+ if qualifiers is None and canonical_match is None:
279
+ self._cache[cache_key] = None
280
+ return None
281
+
282
+ # Step 3: Build CanonicalEntity
283
+ result = self._build_canonical_entity(entity, qualifiers, canonical_match)
284
+ self._cache[cache_key] = result
285
+ return result
286
+
287
+ def _search_database(
288
+ self,
289
+ person_name: str,
290
+ extracted_role: Optional[str],
291
+ extracted_org: Optional[str],
292
+ context: PipelineContext,
293
+ ) -> Optional[CanonicalMatch]:
294
+ """
295
+ Search the person database for a canonical match.
296
+
297
+ Uses embedding similarity + role/org matching for disambiguation.
298
+
299
+ Args:
300
+ person_name: Name of the person
301
+ extracted_role: Role extracted from text (e.g., "CEO")
302
+ extracted_org: Organization extracted from text (e.g., "Apple Inc")
303
+ context: Pipeline context
304
+
305
+ Returns:
306
+ CanonicalMatch if a confident match is found, None otherwise
307
+ """
308
+ database = self._get_database()
309
+ if database is None:
310
+ return None
311
+
312
+ embedder = self._get_embedder()
313
+ if embedder is None:
314
+ return None
315
+
316
+ # Embed the person name
317
+ logger.debug(f" Embedding person name: '{person_name}'")
318
+ query_embedding = embedder.embed(person_name)
319
+
320
+ # Search database with text pre-filtering
321
+ logger.debug(f" Searching person database...")
322
+ results = database.search(
323
+ query_embedding,
324
+ top_k=self._top_k,
325
+ query_text=person_name,
326
+ )
327
+
328
+ # Filter by minimum similarity
329
+ results = [(r, s) for r, s in results if s >= self._min_similarity]
330
+
331
+ if not results:
332
+ logger.debug(f" No person matches found above threshold {self._min_similarity}")
333
+ return None
334
+
335
+ # Boost scores based on role/org matching
336
+ scored_results = []
337
+ for record, similarity in results:
338
+ boosted_score = self._compute_match_score(
339
+ record, similarity, extracted_role, extracted_org
340
+ )
341
+ scored_results.append((record, similarity, boosted_score))
342
+
343
+ # Sort by boosted score
344
+ scored_results.sort(key=lambda x: x[2], reverse=True)
345
+
346
+ # Log top candidates
347
+ logger.info(f" Found {len(scored_results)} candidates for '{person_name}':")
348
+ for i, (record, sim, boosted) in enumerate(scored_results[:5], 1):
349
+ role_str = f" ({record.known_for_role})" if record.known_for_role else ""
350
+ org_str = f" at {record.known_for_org}" if record.known_for_org else ""
351
+ logger.info(f" {i}. {record.name}{role_str}{org_str} (sim={sim:.3f}, boosted={boosted:.3f})")
352
+
353
+ # Select best match using LLM if available
354
+ logger.info(f" Selecting best match (LLM={self._llm is not None})...")
355
+ best_match = self._select_best_match(person_name, scored_results, extracted_role, extracted_org, context)
356
+
357
+ if best_match is None:
358
+ logger.info(f" No confident match for '{person_name}'")
359
+ return None
360
+
361
+ record, similarity, boosted = best_match
362
+ logger.info(f" Matched: '{record.name}' (wikidata:{record.source_id}, similarity={similarity:.3f})")
363
+
364
+ # Build canonical match
365
+ return CanonicalMatch(
366
+ canonical_id=f"wikidata:{record.source_id}",
367
+ canonical_name=record.name,
368
+ match_method="embedding",
369
+ match_confidence=min(max(boosted, 0.0), 1.0),
370
+ match_details={
371
+ "source": "wikidata",
372
+ "source_id": record.source_id,
373
+ "similarity": similarity,
374
+ "known_for_role": record.known_for_role,
375
+ "known_for_org": record.known_for_org,
376
+ },
377
+ )
378
+
379
+ def _compute_match_score(
380
+ self,
381
+ record,
382
+ embedding_similarity: float,
383
+ extracted_role: Optional[str],
384
+ extracted_org: Optional[str],
385
+ ) -> float:
386
+ """
387
+ Compute boosted match score using role/org context.
388
+
389
+ Boosts similarity score if extracted role/org matches database record.
390
+ """
391
+ score = embedding_similarity
392
+
393
+ # Boost if role matches (fuzzy)
394
+ if extracted_role and record.known_for_role:
395
+ if self._role_matches(extracted_role, record.known_for_role):
396
+ score += 0.1 # +10% boost
397
+ logger.debug(f" Role match boost: {extracted_role} ~ {record.known_for_role}")
398
+
399
+ # Boost if org matches (fuzzy)
400
+ if extracted_org and record.known_for_org:
401
+ if self._org_matches(extracted_org, record.known_for_org):
402
+ score += 0.15 # +15% boost (org is stronger signal)
403
+ logger.debug(f" Org match boost: {extracted_org} ~ {record.known_for_org}")
404
+
405
+ return min(score, 1.0) # Cap at 1.0
406
+
407
+ def _role_matches(self, extracted: str, known: str) -> bool:
408
+ """Fuzzy role matching."""
409
+ extracted_lower = extracted.lower().strip()
410
+ known_lower = known.lower().strip()
411
+
412
+ # Exact match
413
+ if extracted_lower == known_lower:
414
+ return True
415
+
416
+ # CEO variants
417
+ ceo_variants = {"ceo", "chief executive", "chief executive officer"}
418
+ if extracted_lower in ceo_variants and known_lower in ceo_variants:
419
+ return True
420
+
421
+ # CFO variants
422
+ cfo_variants = {"cfo", "chief financial officer"}
423
+ if extracted_lower in cfo_variants and known_lower in cfo_variants:
424
+ return True
425
+
426
+ # President variants
427
+ president_variants = {"president", "chairman", "chairman and ceo"}
428
+ if extracted_lower in president_variants and known_lower in president_variants:
429
+ return True
430
+
431
+ # Founder variants
432
+ founder_variants = {"founder", "co-founder", "cofounder", "founding member"}
433
+ if extracted_lower in founder_variants and known_lower in founder_variants:
434
+ return True
435
+
436
+ # Contains check for partial matches
437
+ if extracted_lower in known_lower or known_lower in extracted_lower:
438
+ return True
439
+
440
+ return False
441
+
442
+ def _org_matches(self, extracted: str, known: str) -> bool:
443
+ """Fuzzy org matching using simple normalization."""
444
+ # Normalize both
445
+ extracted_norm = self._normalize_org_name(extracted)
446
+ known_norm = self._normalize_org_name(known)
447
+
448
+ # Exact normalized match
449
+ if extracted_norm == known_norm:
450
+ return True
451
+
452
+ # Check if one contains the other (e.g., "Apple" in "Apple Inc")
453
+ if extracted_norm in known_norm or known_norm in extracted_norm:
454
+ return True
455
+
456
+ return False
457
+
458
+ def _normalize_org_name(self, name: str) -> str:
459
+ """Simple org name normalization."""
460
+ # Lowercase
461
+ normalized = name.lower().strip()
462
+
463
+ # Remove common suffixes
464
+ suffixes = [
465
+ " inc.", " inc", " corp.", " corp", " corporation",
466
+ " ltd.", " ltd", " limited", " llc", " plc",
467
+ " co.", " co", " company",
468
+ ]
469
+ for suffix in suffixes:
470
+ if normalized.endswith(suffix):
471
+ normalized = normalized[:-len(suffix)]
472
+
473
+ return normalized.strip()
474
+
475
+ def _select_best_match(
476
+ self,
477
+ query_name: str,
478
+ candidates: list[tuple],
479
+ extracted_role: Optional[str],
480
+ extracted_org: Optional[str],
481
+ context: PipelineContext,
482
+ ) -> Optional[tuple]:
483
+ """
484
+ Select the best match from candidates.
485
+
486
+ Uses LLM if available, otherwise returns top match if confidence is high enough.
487
+ """
488
+ if not candidates:
489
+ return None
490
+
491
+ # If only one strong match, use it directly
492
+ if len(candidates) == 1 and candidates[0][2] >= 0.9:
493
+ logger.info(f" Single strong match: '{candidates[0][0].name}' (boosted={candidates[0][2]:.3f})")
494
+ return candidates[0]
495
+
496
+ # Try LLM confirmation
497
+ if self._llm is not None:
498
+ try:
499
+ return self._llm_select_match(query_name, candidates, extracted_role, extracted_org, context)
500
+ except Exception as e:
501
+ logger.warning(f" LLM confirmation failed: {e}")
502
+
503
+ # Fallback: use top match if boosted score is high enough
504
+ top_record, top_similarity, top_boosted = candidates[0]
505
+ if top_boosted >= 0.85:
506
+ logger.info(f" No LLM, using top match: '{top_record.name}' (boosted={top_boosted:.3f})")
507
+ return candidates[0]
508
+
509
+ logger.info(f" No confident match (top boosted={top_boosted:.3f} < 0.85)")
510
+ return None
511
+
512
+ def _llm_select_match(
513
+ self,
514
+ query_name: str,
515
+ candidates: list[tuple],
516
+ extracted_role: Optional[str],
517
+ extracted_org: Optional[str],
518
+ context: PipelineContext,
519
+ ) -> Optional[tuple]:
520
+ """Use LLM to select the best match."""
521
+ # Format candidates for prompt
522
+ candidate_lines = []
523
+ for i, (record, similarity, boosted) in enumerate(candidates[:10], 1):
524
+ role_str = f", {record.known_for_role}" if record.known_for_role else ""
525
+ org_str = f" at {record.known_for_org}" if record.known_for_org else ""
526
+ country_str = f", {record.country}" if record.country else ""
527
+ candidate_lines.append(
528
+ f"{i}. {record.name}{role_str}{org_str}{country_str} (score: {boosted:.2f})"
529
+ )
530
+
531
+ # Build context info from extracted role/org
532
+ context_parts = []
533
+ if extracted_role:
534
+ context_parts.append(f"role={extracted_role}")
535
+ if extracted_org:
536
+ context_parts.append(f"org={extracted_org}")
537
+ context_info = ", ".join(context_parts) if context_parts else "no role/org extracted"
538
+
539
+ # Source text preview
540
+ source_preview = ""
541
+ if context.source_text:
542
+ source_preview = context.source_text[:300] + "..." if len(context.source_text) > 300 else context.source_text
543
+
544
+ prompt = PERSON_MATCH_PROMPT.format(
545
+ query_name=query_name,
546
+ context_info=context_info,
547
+ source_preview=source_preview,
548
+ candidates="\n".join(candidate_lines),
549
+ )
550
+
551
+ # Get LLM response
552
+ assert self._llm is not None
553
+ response = self._llm.generate(prompt, max_tokens=10, stop=["\n"])
554
+ response = response.strip()
555
+
556
+ logger.info(f" LLM response for '{query_name}': {response}")
557
+
558
+ # Parse response
559
+ if response.upper() == "NONE":
560
+ logger.info(f" LLM chose: NONE (no match)")
561
+ return None
562
+
563
+ try:
564
+ idx = int(response) - 1
565
+ if 0 <= idx < len(candidates):
566
+ chosen = candidates[idx]
567
+ logger.info(f" LLM chose: #{idx + 1} '{chosen[0].name}' (boosted={chosen[2]:.3f})")
568
+ return chosen
569
+ except ValueError:
570
+ logger.warning(f" LLM response '{response}' could not be parsed as number")
571
+
572
+ # Fallback to top match if LLM response is unclear and score is decent
573
+ if candidates[0][2] >= 0.8:
574
+ logger.info(f" Fallback to top match: '{candidates[0][0].name}' (boosted={candidates[0][2]:.3f})")
575
+ return candidates[0]
576
+
577
+ logger.info(f" No confident match (top boosted={candidates[0][2]:.3f} < 0.8)")
578
+ return None
124
579
 
125
580
  def _extract_with_llm(
126
581
  self,
@@ -216,6 +671,115 @@ Should return:
216
671
 
217
672
  return None
218
673
 
674
+ def _build_canonical_entity(
675
+ self,
676
+ entity: ExtractedEntity,
677
+ qualifiers: Optional[EntityQualifiers],
678
+ canonical_match: Optional[CanonicalMatch],
679
+ ) -> CanonicalEntity:
680
+ """Build CanonicalEntity from qualifiers and optional canonical match."""
681
+ # Ensure qualifiers is not None
682
+ if qualifiers is None:
683
+ qualifiers = EntityQualifiers()
684
+
685
+ # If we have a canonical match, add wikidata ID to identifiers
686
+ identifiers: dict[str, str] = dict(qualifiers.identifiers) if qualifiers.identifiers else {}
687
+ resolved_role: Optional[ResolvedRole] = None
688
+ resolved_org: Optional[ResolvedOrganization] = None
689
+
690
+ if canonical_match:
691
+ match_details = canonical_match.match_details or {}
692
+ source_id = str(match_details.get("source_id", ""))
693
+ if source_id:
694
+ identifiers["wikidata_id"] = source_id
695
+ if canonical_match.canonical_id:
696
+ identifiers["canonical_id"] = canonical_match.canonical_id
697
+
698
+ # Extract role and org from database match
699
+ known_role = str(match_details.get("known_for_role", "") or "")
700
+ known_org = str(match_details.get("known_for_org", "") or "")
701
+
702
+ # Create ResolvedRole from database match
703
+ if known_role:
704
+ resolved_role = ResolvedRole(
705
+ canonical_name=known_role,
706
+ canonical_id=None, # Role ID would need separate lookup
707
+ source="wikidata",
708
+ source_id=source_id if source_id else None,
709
+ )
710
+
711
+ # Update qualifiers with info from database if not already set
712
+ if not qualifiers.role and known_role:
713
+ final_role = known_role
714
+ final_org = qualifiers.org or known_org or None
715
+ else:
716
+ final_role = qualifiers.role
717
+ final_org = qualifiers.org
718
+ else:
719
+ final_role = qualifiers.role
720
+ final_org = qualifiers.org
721
+
722
+ # Resolve organization against the organization database
723
+ org_to_resolve = final_org
724
+ if org_to_resolve:
725
+ logger.debug(f" Resolving organization: '{org_to_resolve}'")
726
+ resolved_org = self._resolve_organization(org_to_resolve)
727
+ if resolved_org:
728
+ logger.info(f" Resolved org: '{org_to_resolve}' -> '{resolved_org.canonical_name}' ({resolved_org.canonical_id})")
729
+
730
+ # Build the final qualifiers with resolved info
731
+ qualifiers = EntityQualifiers(
732
+ role=final_role,
733
+ org=final_org,
734
+ identifiers=identifiers,
735
+ resolved_role=resolved_role,
736
+ resolved_org=resolved_org,
737
+ )
738
+
739
+ # Create QualifiedEntity
740
+ qualified = QualifiedEntity(
741
+ entity_ref=entity.entity_ref,
742
+ original_text=entity.text,
743
+ entity_type=entity.type,
744
+ qualifiers=qualifiers,
745
+ qualification_sources=[self.name],
746
+ )
747
+
748
+ # Build FQN - prefer resolved names when available
749
+ if canonical_match and canonical_match.canonical_name:
750
+ # Use canonical person name from database
751
+ fqn_parts: list[str] = [canonical_match.canonical_name]
752
+ if qualifiers.role:
753
+ fqn_parts.append(f"({qualifiers.role})")
754
+ # Use resolved org name if available
755
+ if resolved_org:
756
+ fqn_parts.append(f"at {resolved_org.canonical_name}")
757
+ elif qualifiers.org:
758
+ fqn_parts.append(f"at {qualifiers.org}")
759
+ fqn = " ".join(fqn_parts)
760
+ else:
761
+ # Build FQN: "Person Name (Role, Org)" or "Person Name (Role)" or "Person Name (Org)"
762
+ fqn_parts_for_display: list[str] = []
763
+ if qualifiers.role:
764
+ fqn_parts_for_display.append(qualifiers.role)
765
+ # Use resolved org name if available
766
+ if resolved_org:
767
+ fqn_parts_for_display.append(resolved_org.canonical_name)
768
+ elif qualifiers.org:
769
+ fqn_parts_for_display.append(qualifiers.org)
770
+
771
+ if fqn_parts_for_display:
772
+ fqn = f"{entity.text} ({', '.join(fqn_parts_for_display)})"
773
+ else:
774
+ fqn = entity.text
775
+
776
+ return CanonicalEntity(
777
+ entity_ref=entity.entity_ref,
778
+ qualified_entity=qualified,
779
+ canonical_match=canonical_match,
780
+ fqn=fqn,
781
+ )
782
+
219
783
 
220
784
  # Allow importing without decorator for testing
221
785
  PersonQualifierPluginClass = PersonQualifierPlugin