corp-extractor: corp_extractor-0.5.0-py3-none-any.whl → corp_extractor-0.9.3-py3-none-any.whl

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/METADATA +228 -30
  2. corp_extractor-0.9.3.dist-info/RECORD +79 -0
  3. statement_extractor/__init__.py +1 -1
  4. statement_extractor/cli.py +2030 -24
  5. statement_extractor/data/statement_taxonomy.json +6949 -1159
  6. statement_extractor/database/__init__.py +52 -0
  7. statement_extractor/database/embeddings.py +186 -0
  8. statement_extractor/database/hub.py +428 -0
  9. statement_extractor/database/importers/__init__.py +32 -0
  10. statement_extractor/database/importers/companies_house.py +559 -0
  11. statement_extractor/database/importers/companies_house_officers.py +431 -0
  12. statement_extractor/database/importers/gleif.py +561 -0
  13. statement_extractor/database/importers/sec_edgar.py +392 -0
  14. statement_extractor/database/importers/sec_form4.py +512 -0
  15. statement_extractor/database/importers/wikidata.py +1120 -0
  16. statement_extractor/database/importers/wikidata_dump.py +1951 -0
  17. statement_extractor/database/importers/wikidata_people.py +1130 -0
  18. statement_extractor/database/models.py +254 -0
  19. statement_extractor/database/resolver.py +245 -0
  20. statement_extractor/database/store.py +3034 -0
  21. statement_extractor/document/__init__.py +62 -0
  22. statement_extractor/document/chunker.py +410 -0
  23. statement_extractor/document/context.py +171 -0
  24. statement_extractor/document/deduplicator.py +171 -0
  25. statement_extractor/document/html_extractor.py +246 -0
  26. statement_extractor/document/loader.py +303 -0
  27. statement_extractor/document/pipeline.py +388 -0
  28. statement_extractor/document/summarizer.py +195 -0
  29. statement_extractor/extractor.py +1 -1
  30. statement_extractor/models/__init__.py +19 -3
  31. statement_extractor/models/canonical.py +44 -1
  32. statement_extractor/models/document.py +308 -0
  33. statement_extractor/models/labels.py +47 -18
  34. statement_extractor/models/qualifiers.py +51 -3
  35. statement_extractor/models/statement.py +39 -15
  36. statement_extractor/models.py +1 -1
  37. statement_extractor/pipeline/config.py +6 -11
  38. statement_extractor/pipeline/context.py +5 -5
  39. statement_extractor/pipeline/orchestrator.py +90 -121
  40. statement_extractor/pipeline/registry.py +52 -46
  41. statement_extractor/plugins/__init__.py +20 -8
  42. statement_extractor/plugins/base.py +348 -78
  43. statement_extractor/plugins/extractors/gliner2.py +38 -28
  44. statement_extractor/plugins/labelers/taxonomy.py +18 -5
  45. statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
  46. statement_extractor/plugins/pdf/__init__.py +10 -0
  47. statement_extractor/plugins/pdf/pypdf.py +291 -0
  48. statement_extractor/plugins/qualifiers/__init__.py +11 -0
  49. statement_extractor/plugins/qualifiers/companies_house.py +14 -3
  50. statement_extractor/plugins/qualifiers/embedding_company.py +422 -0
  51. statement_extractor/plugins/qualifiers/gleif.py +14 -3
  52. statement_extractor/plugins/qualifiers/person.py +588 -14
  53. statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
  54. statement_extractor/plugins/scrapers/__init__.py +10 -0
  55. statement_extractor/plugins/scrapers/http.py +236 -0
  56. statement_extractor/plugins/splitters/t5_gemma.py +176 -75
  57. statement_extractor/plugins/taxonomy/embedding.py +193 -46
  58. statement_extractor/plugins/taxonomy/mnli.py +16 -4
  59. statement_extractor/scoring.py +8 -8
  60. corp_extractor-0.5.0.dist-info/RECORD +0 -55
  61. statement_extractor/plugins/canonicalizers/__init__.py +0 -17
  62. statement_extractor/plugins/canonicalizers/base.py +0 -9
  63. statement_extractor/plugins/canonicalizers/location.py +0 -219
  64. statement_extractor/plugins/canonicalizers/organization.py +0 -230
  65. statement_extractor/plugins/canonicalizers/person.py +0 -242
  66. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/WHEEL +0 -0
  67. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/entry_points.txt +0 -0
@@ -1,9 +1,12 @@
1
1
  """
2
- PersonQualifierPlugin - Qualifies PERSON entities with role and organization.
2
+ PersonQualifierPlugin - Qualifies PERSON entities with role, organization, and canonical ID.
3
3
 
4
4
  Uses Gemma3 12B (instruction-tuned) to extract:
5
5
  - role: Job title/position (e.g., "CEO", "President")
6
6
  - org: Organization/employer (e.g., "Apple Inc", "Microsoft")
7
+
8
+ Then searches the person database to find canonical matches for notable people
9
+ (those in Wikipedia/Wikidata), using extracted role/org to help disambiguate.
7
10
  """
8
11
 
9
12
  import json
@@ -14,19 +17,51 @@ from typing import Optional
14
17
  from ..base import BaseQualifierPlugin, PluginCapability
15
18
  from ...pipeline.context import PipelineContext
16
19
  from ...pipeline.registry import PluginRegistry
17
- from ...models import ExtractedEntity, EntityQualifiers, EntityType
20
+ from ...models import (
21
+ ExtractedEntity,
22
+ EntityQualifiers,
23
+ EntityType,
24
+ QualifiedEntity,
25
+ CanonicalEntity,
26
+ CanonicalMatch,
27
+ ResolvedRole,
28
+ ResolvedOrganization,
29
+ )
18
30
  from ...llm import LLM
19
31
 
20
32
  logger = logging.getLogger(__name__)
21
33
 
22
34
 
35
+ # LLM prompt template for person matching confirmation
36
+ PERSON_MATCH_PROMPT = """You are matching a person name extracted from text to a database of notable people.
37
+
38
+ Extracted name: "{query_name}"
39
+ Context from text: {context_info}
40
+ Source text: "{source_preview}"
41
+
42
+ Candidates from database (with Wikipedia info):
43
+ {candidates}
44
+
45
+ Task: Select the BEST match, or respond "NONE" if no candidate is a good match.
46
+
47
+ Rules:
48
+ - The match should refer to the same person
49
+ - Consider whether the role and organization from the text match the Wikipedia info
50
+ - Different people with similar names should NOT match
51
+ - If the extracted name is too generic or ambiguous, respond "NONE"
52
+
53
+ Respond with ONLY the number of the best match (1, 2, 3, etc.) or "NONE".
54
+ """
55
+
56
+
23
57
  @PluginRegistry.qualifier
24
58
  class PersonQualifierPlugin(BaseQualifierPlugin):
25
59
  """
26
60
  Qualifier plugin for PERSON entities.
27
61
 
28
62
  Uses Gemma3 12B to extract role and organization from context.
29
- Falls back to pattern matching if model is not available.
63
+ Then searches the person database to find canonical matches for notable people.
64
+ Falls back to pattern matching if LLM is not available.
30
65
  """
31
66
 
32
67
  # Common role patterns for fallback
@@ -45,6 +80,11 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
45
80
  gguf_file: Optional[str] = None,
46
81
  use_llm: bool = True,
47
82
  use_4bit: bool = True,
83
+ use_database: bool = True,
84
+ db_path: Optional[str] = None,
85
+ top_k: int = 10,
86
+ min_similarity: float = 0.5,
87
+ auto_download_db: bool = True,
48
88
  ):
49
89
  """
50
90
  Initialize the person qualifier.
@@ -54,8 +94,19 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
54
94
  gguf_file: GGUF filename for quantized models (auto-detected if model_id ends with -gguf)
55
95
  use_llm: Whether to use LLM
56
96
  use_4bit: Use 4-bit quantization (requires bitsandbytes, ignored for GGUF)
97
+ use_database: Whether to use person database for canonical matching
98
+ db_path: Path to database (auto-detects if None)
99
+ top_k: Number of candidates to retrieve from database
100
+ min_similarity: Minimum similarity threshold for database matches
101
+ auto_download_db: Whether to auto-download database from HuggingFace
57
102
  """
58
103
  self._use_llm = use_llm
104
+ self._use_database = use_database
105
+ self._db_path = db_path
106
+ self._top_k = top_k
107
+ self._min_similarity = min_similarity
108
+ self._auto_download_db = auto_download_db
109
+
59
110
  self._llm: Optional[LLM] = None
60
111
  if use_llm:
61
112
  self._llm = LLM(
@@ -64,6 +115,11 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
64
115
  use_4bit=use_4bit,
65
116
  )
66
117
 
118
+ # Lazy-loaded components
119
+ self._database = None
120
+ self._embedder = None
121
+ self._cache: dict[str, Optional[CanonicalEntity]] = {}
122
+
67
123
  @property
68
124
  def name(self) -> str:
69
125
  return "person_qualifier"
@@ -74,14 +130,14 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
74
130
 
75
131
  @property
76
132
  def capabilities(self) -> PluginCapability:
77
- caps = PluginCapability.NONE
133
+ caps = PluginCapability.CACHING
78
134
  if self._use_llm:
79
135
  caps |= PluginCapability.LLM_REQUIRED
80
136
  return caps
81
137
 
82
138
  @property
83
139
  def description(self) -> str:
84
- return "Extracts role and organization for PERSON entities using Gemma3"
140
+ return "Extracts role and organization for PERSON entities, with optional database lookup for notable people"
85
141
 
86
142
  @property
87
143
  def supported_entity_types(self) -> set[EntityType]:
@@ -89,38 +145,447 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
89
145
 
90
146
  @property
91
147
  def provided_identifier_types(self) -> list[str]:
92
- return [] # Provides qualifiers, not identifiers
148
+ return ["wikidata_id"]
149
+
150
+ def _get_database(self):
151
+ """Get or initialize the person database."""
152
+ if self._database is not None:
153
+ return self._database
154
+
155
+ if not self._use_database:
156
+ return None
157
+
158
+ try:
159
+ from ...database.store import get_person_database
160
+ from ...database.hub import get_database_path
161
+
162
+ # Find database path
163
+ db_path = self._db_path
164
+ if db_path is None:
165
+ db_path = get_database_path(auto_download=self._auto_download_db)
166
+
167
+ if db_path is None:
168
+ logger.warning("Person database not available. Skipping database qualification.")
169
+ return None
170
+
171
+ # Use singleton to ensure database is only loaded once
172
+ self._database = get_person_database(db_path=db_path)
173
+ logger.info(f"Loaded person database from {db_path}")
174
+ return self._database
175
+
176
+ except Exception as e:
177
+ logger.warning(f"Failed to load person database: {e}")
178
+ return None
179
+
180
+ def _get_embedder(self):
181
+ """Get or initialize the embedder."""
182
+ if self._embedder is not None:
183
+ return self._embedder
184
+
185
+ try:
186
+ from ...database import CompanyEmbedder
187
+ self._embedder = CompanyEmbedder()
188
+ return self._embedder
189
+ except Exception as e:
190
+ logger.warning(f"Failed to load embedder: {e}")
191
+ return None
192
+
193
+ def _get_org_resolver(self):
194
+ """Get or initialize the organization resolver."""
195
+ if not hasattr(self, '_org_resolver'):
196
+ self._org_resolver = None
197
+
198
+ if self._org_resolver is not None:
199
+ return self._org_resolver
200
+
201
+ try:
202
+ from ...database.resolver import get_organization_resolver
203
+ self._org_resolver = get_organization_resolver(
204
+ db_path=self._db_path,
205
+ auto_download_db=self._auto_download_db,
206
+ )
207
+ return self._org_resolver
208
+ except Exception as e:
209
+ logger.warning(f"Failed to initialize organization resolver: {e}")
210
+ return None
211
+
212
+ def _resolve_organization(self, org_name: str) -> Optional[ResolvedOrganization]:
213
+ """
214
+ Resolve an organization name against the organization database.
215
+
216
+ Uses the shared OrganizationResolver utility.
217
+
218
+ Args:
219
+ org_name: Organization name to resolve
220
+
221
+ Returns:
222
+ ResolvedOrganization if found, None otherwise
223
+ """
224
+ resolver = self._get_org_resolver()
225
+ if resolver is None:
226
+ return None
227
+
228
+ return resolver.resolve(org_name)
93
229
 
94
230
  def qualify(
95
231
  self,
96
232
  entity: ExtractedEntity,
97
233
  context: PipelineContext,
98
- ) -> Optional[EntityQualifiers]:
234
+ ) -> Optional[CanonicalEntity]:
99
235
  """
100
- Qualify a PERSON entity with role and organization.
236
+ Qualify a PERSON entity with role, organization, and optionally canonical ID.
101
237
 
102
238
  Args:
103
239
  entity: The PERSON entity to qualify
104
240
  context: Pipeline context for accessing source text
105
241
 
106
242
  Returns:
107
- EntityQualifiers with role and org, or None if nothing found
243
+ CanonicalEntity with role/org qualifiers and FQN, or None if nothing found
108
244
  """
109
245
  if entity.type != EntityType.PERSON:
110
246
  return None
111
247
 
248
+ # Check cache
249
+ cache_key = entity.text.lower().strip()
250
+ if cache_key in self._cache:
251
+ return self._cache[cache_key]
252
+
112
253
  # Use the full source text for LLM qualification
113
- # This provides maximum context for understanding the person's role/org
114
254
  full_text = context.source_text
115
255
 
116
- # Try LLM extraction first with full text
256
+ # Step 1: Extract role and org using LLM or patterns
257
+ qualifiers: Optional[EntityQualifiers] = None
117
258
  if self._llm is not None:
118
259
  result = self._extract_with_llm(entity.text, full_text)
119
260
  if result and (result.role or result.org):
120
- return result
261
+ qualifiers = result
262
+
263
+ # Fallback to pattern matching
264
+ if qualifiers is None:
265
+ qualifiers = self._extract_with_patterns(entity.text, full_text)
266
+
267
+ # Step 2: Search database for canonical match (if database is available)
268
+ canonical_match = None
269
+ if self._use_database:
270
+ canonical_match = self._search_database(
271
+ entity.text,
272
+ qualifiers.role if qualifiers else None,
273
+ qualifiers.org if qualifiers else None,
274
+ context,
275
+ )
121
276
 
122
- # Fallback to pattern matching with full text
123
- return self._extract_with_patterns(entity.text, full_text)
277
+ # If no qualifiers found and no database match, return None
278
+ if qualifiers is None and canonical_match is None:
279
+ self._cache[cache_key] = None
280
+ return None
281
+
282
+ # Step 3: Build CanonicalEntity
283
+ result = self._build_canonical_entity(entity, qualifiers, canonical_match)
284
+ self._cache[cache_key] = result
285
+ return result
286
+
287
+ def _search_database(
288
+ self,
289
+ person_name: str,
290
+ extracted_role: Optional[str],
291
+ extracted_org: Optional[str],
292
+ context: PipelineContext,
293
+ ) -> Optional[CanonicalMatch]:
294
+ """
295
+ Search the person database for a canonical match.
296
+
297
+ Uses embedding similarity + role/org matching for disambiguation.
298
+
299
+ Args:
300
+ person_name: Name of the person
301
+ extracted_role: Role extracted from text (e.g., "CEO")
302
+ extracted_org: Organization extracted from text (e.g., "Apple Inc")
303
+ context: Pipeline context
304
+
305
+ Returns:
306
+ CanonicalMatch if a confident match is found, None otherwise
307
+ """
308
+ database = self._get_database()
309
+ if database is None:
310
+ return None
311
+
312
+ embedder = self._get_embedder()
313
+ if embedder is None:
314
+ return None
315
+
316
+ # Embed the person name
317
+ logger.debug(f" Embedding person name: '{person_name}'")
318
+ query_embedding = embedder.embed(person_name)
319
+
320
+ # Search database with text pre-filtering
321
+ logger.debug(f" Searching person database...")
322
+ results = database.search(
323
+ query_embedding,
324
+ top_k=self._top_k,
325
+ query_text=person_name,
326
+ )
327
+
328
+ # Filter by minimum similarity
329
+ results = [(r, s) for r, s in results if s >= self._min_similarity]
330
+
331
+ if not results:
332
+ logger.debug(f" No person matches found above threshold {self._min_similarity}")
333
+ return None
334
+
335
+ # Boost scores based on role/org matching
336
+ scored_results = []
337
+ for record, similarity in results:
338
+ boosted_score = self._compute_match_score(
339
+ record, similarity, extracted_role, extracted_org
340
+ )
341
+ scored_results.append((record, similarity, boosted_score))
342
+
343
+ # Sort by boosted score
344
+ scored_results.sort(key=lambda x: x[2], reverse=True)
345
+
346
+ # Log top candidates
347
+ logger.info(f" Found {len(scored_results)} candidates for '{person_name}':")
348
+ for i, (record, sim, boosted) in enumerate(scored_results[:5], 1):
349
+ role_str = f" ({record.known_for_role})" if record.known_for_role else ""
350
+ org_str = f" at {record.known_for_org}" if record.known_for_org else ""
351
+ logger.info(f" {i}. {record.name}{role_str}{org_str} (sim={sim:.3f}, boosted={boosted:.3f})")
352
+
353
+ # Select best match using LLM if available
354
+ logger.info(f" Selecting best match (LLM={self._llm is not None})...")
355
+ best_match = self._select_best_match(person_name, scored_results, extracted_role, extracted_org, context)
356
+
357
+ if best_match is None:
358
+ logger.info(f" No confident match for '{person_name}'")
359
+ return None
360
+
361
+ record, similarity, boosted = best_match
362
+ logger.info(f" Matched: '{record.name}' (wikidata:{record.source_id}, similarity={similarity:.3f})")
363
+
364
+ # Build canonical match
365
+ return CanonicalMatch(
366
+ canonical_id=f"wikidata:{record.source_id}",
367
+ canonical_name=record.name,
368
+ match_method="embedding",
369
+ match_confidence=min(max(boosted, 0.0), 1.0),
370
+ match_details={
371
+ "source": "wikidata",
372
+ "source_id": record.source_id,
373
+ "similarity": similarity,
374
+ "known_for_role": record.known_for_role,
375
+ "known_for_org": record.known_for_org,
376
+ "birth_date": record.birth_date,
377
+ "death_date": record.death_date,
378
+ "is_historic": record.is_historic,
379
+ },
380
+ )
381
+
382
+ def _compute_match_score(
383
+ self,
384
+ record,
385
+ embedding_similarity: float,
386
+ extracted_role: Optional[str],
387
+ extracted_org: Optional[str],
388
+ ) -> float:
389
+ """
390
+ Compute boosted match score using role/org context.
391
+
392
+ Boosts similarity score if extracted role/org matches database record.
393
+ """
394
+ score = embedding_similarity
395
+
396
+ # Boost if role matches (fuzzy)
397
+ if extracted_role and record.known_for_role:
398
+ if self._role_matches(extracted_role, record.known_for_role):
399
+ score += 0.1 # +10% boost
400
+ logger.debug(f" Role match boost: {extracted_role} ~ {record.known_for_role}")
401
+
402
+ # Boost if org matches (fuzzy)
403
+ if extracted_org and record.known_for_org:
404
+ if self._org_matches(extracted_org, record.known_for_org):
405
+ score += 0.15 # +15% boost (org is stronger signal)
406
+ logger.debug(f" Org match boost: {extracted_org} ~ {record.known_for_org}")
407
+
408
+ return min(score, 1.0) # Cap at 1.0
409
+
410
+ def _role_matches(self, extracted: str, known: str) -> bool:
411
+ """Fuzzy role matching."""
412
+ extracted_lower = extracted.lower().strip()
413
+ known_lower = known.lower().strip()
414
+
415
+ # Exact match
416
+ if extracted_lower == known_lower:
417
+ return True
418
+
419
+ # CEO variants
420
+ ceo_variants = {"ceo", "chief executive", "chief executive officer"}
421
+ if extracted_lower in ceo_variants and known_lower in ceo_variants:
422
+ return True
423
+
424
+ # CFO variants
425
+ cfo_variants = {"cfo", "chief financial officer"}
426
+ if extracted_lower in cfo_variants and known_lower in cfo_variants:
427
+ return True
428
+
429
+ # President variants
430
+ president_variants = {"president", "chairman", "chairman and ceo"}
431
+ if extracted_lower in president_variants and known_lower in president_variants:
432
+ return True
433
+
434
+ # Founder variants
435
+ founder_variants = {"founder", "co-founder", "cofounder", "founding member"}
436
+ if extracted_lower in founder_variants and known_lower in founder_variants:
437
+ return True
438
+
439
+ # Contains check for partial matches
440
+ if extracted_lower in known_lower or known_lower in extracted_lower:
441
+ return True
442
+
443
+ return False
444
+
445
+ def _org_matches(self, extracted: str, known: str) -> bool:
446
+ """Fuzzy org matching using simple normalization."""
447
+ # Normalize both
448
+ extracted_norm = self._normalize_org_name(extracted)
449
+ known_norm = self._normalize_org_name(known)
450
+
451
+ # Exact normalized match
452
+ if extracted_norm == known_norm:
453
+ return True
454
+
455
+ # Check if one contains the other (e.g., "Apple" in "Apple Inc")
456
+ if extracted_norm in known_norm or known_norm in extracted_norm:
457
+ return True
458
+
459
+ return False
460
+
461
+ def _normalize_org_name(self, name: str) -> str:
462
+ """Simple org name normalization."""
463
+ # Lowercase
464
+ normalized = name.lower().strip()
465
+
466
+ # Remove common suffixes
467
+ suffixes = [
468
+ " inc.", " inc", " corp.", " corp", " corporation",
469
+ " ltd.", " ltd", " limited", " llc", " plc",
470
+ " co.", " co", " company",
471
+ ]
472
+ for suffix in suffixes:
473
+ if normalized.endswith(suffix):
474
+ normalized = normalized[:-len(suffix)]
475
+
476
+ return normalized.strip()
477
+
478
+ def _select_best_match(
479
+ self,
480
+ query_name: str,
481
+ candidates: list[tuple],
482
+ extracted_role: Optional[str],
483
+ extracted_org: Optional[str],
484
+ context: PipelineContext,
485
+ ) -> Optional[tuple]:
486
+ """
487
+ Select the best match from candidates.
488
+
489
+ Uses LLM if available, otherwise returns top match if confidence is high enough.
490
+ """
491
+ if not candidates:
492
+ return None
493
+
494
+ # If only one strong match, use it directly
495
+ if len(candidates) == 1 and candidates[0][2] >= 0.9:
496
+ logger.info(f" Single strong match: '{candidates[0][0].name}' (boosted={candidates[0][2]:.3f})")
497
+ return candidates[0]
498
+
499
+ # Try LLM confirmation
500
+ if self._llm is not None:
501
+ try:
502
+ return self._llm_select_match(query_name, candidates, extracted_role, extracted_org, context)
503
+ except Exception as e:
504
+ logger.warning(f" LLM confirmation failed: {e}")
505
+
506
+ # Fallback: use top match if boosted score is high enough
507
+ top_record, top_similarity, top_boosted = candidates[0]
508
+ if top_boosted >= 0.85:
509
+ logger.info(f" No LLM, using top match: '{top_record.name}' (boosted={top_boosted:.3f})")
510
+ return candidates[0]
511
+
512
+ logger.info(f" No confident match (top boosted={top_boosted:.3f} < 0.85)")
513
+ return None
514
+
515
+ def _llm_select_match(
516
+ self,
517
+ query_name: str,
518
+ candidates: list[tuple],
519
+ extracted_role: Optional[str],
520
+ extracted_org: Optional[str],
521
+ context: PipelineContext,
522
+ ) -> Optional[tuple]:
523
+ """Use LLM to select the best match."""
524
+ # Format candidates for prompt
525
+ candidate_lines = []
526
+ for i, (record, similarity, boosted) in enumerate(candidates[:10], 1):
527
+ role_str = f", {record.known_for_role}" if record.known_for_role else ""
528
+ org_str = f" at {record.known_for_org}" if record.known_for_org else ""
529
+ country_str = f", {record.country}" if record.country else ""
530
+ # Include life dates for context (helps identify historic figures)
531
+ dates_parts = []
532
+ if record.birth_date:
533
+ dates_parts.append(f"b. {record.birth_date[:4]}") # Just year
534
+ if record.death_date:
535
+ dates_parts.append(f"d. {record.death_date[:4]}") # Just year
536
+ dates_str = f" [{' - '.join(dates_parts)}]" if dates_parts else ""
537
+ candidate_lines.append(
538
+ f"{i}. {record.name}{role_str}{org_str}{country_str}{dates_str} (score: {boosted:.2f})"
539
+ )
540
+
541
+ # Build context info from extracted role/org
542
+ context_parts = []
543
+ if extracted_role:
544
+ context_parts.append(f"role={extracted_role}")
545
+ if extracted_org:
546
+ context_parts.append(f"org={extracted_org}")
547
+ context_info = ", ".join(context_parts) if context_parts else "no role/org extracted"
548
+
549
+ # Source text preview
550
+ source_preview = ""
551
+ if context.source_text:
552
+ source_preview = context.source_text[:300] + "..." if len(context.source_text) > 300 else context.source_text
553
+
554
+ prompt = PERSON_MATCH_PROMPT.format(
555
+ query_name=query_name,
556
+ context_info=context_info,
557
+ source_preview=source_preview,
558
+ candidates="\n".join(candidate_lines),
559
+ )
560
+
561
+ # Get LLM response
562
+ assert self._llm is not None
563
+ response = self._llm.generate(prompt, max_tokens=10, stop=["\n"])
564
+ response = response.strip()
565
+
566
+ logger.info(f" LLM response for '{query_name}': {response}")
567
+
568
+ # Parse response
569
+ if response.upper() == "NONE":
570
+ logger.info(f" LLM chose: NONE (no match)")
571
+ return None
572
+
573
+ try:
574
+ idx = int(response) - 1
575
+ if 0 <= idx < len(candidates):
576
+ chosen = candidates[idx]
577
+ logger.info(f" LLM chose: #{idx + 1} '{chosen[0].name}' (boosted={chosen[2]:.3f})")
578
+ return chosen
579
+ except ValueError:
580
+ logger.warning(f" LLM response '{response}' could not be parsed as number")
581
+
582
+ # Fallback to top match if LLM response is unclear and score is decent
583
+ if candidates[0][2] >= 0.8:
584
+ logger.info(f" Fallback to top match: '{candidates[0][0].name}' (boosted={candidates[0][2]:.3f})")
585
+ return candidates[0]
586
+
587
+ logger.info(f" No confident match (top boosted={candidates[0][2]:.3f} < 0.8)")
588
+ return None
124
589
 
125
590
  def _extract_with_llm(
126
591
  self,
@@ -216,6 +681,115 @@ Should return:
216
681
 
217
682
  return None
218
683
 
684
+ def _build_canonical_entity(
685
+ self,
686
+ entity: ExtractedEntity,
687
+ qualifiers: Optional[EntityQualifiers],
688
+ canonical_match: Optional[CanonicalMatch],
689
+ ) -> CanonicalEntity:
690
+ """Build CanonicalEntity from qualifiers and optional canonical match."""
691
+ # Ensure qualifiers is not None
692
+ if qualifiers is None:
693
+ qualifiers = EntityQualifiers()
694
+
695
+ # If we have a canonical match, add wikidata ID to identifiers
696
+ identifiers: dict[str, str] = dict(qualifiers.identifiers) if qualifiers.identifiers else {}
697
+ resolved_role: Optional[ResolvedRole] = None
698
+ resolved_org: Optional[ResolvedOrganization] = None
699
+
700
+ if canonical_match:
701
+ match_details = canonical_match.match_details or {}
702
+ source_id = str(match_details.get("source_id", ""))
703
+ if source_id:
704
+ identifiers["wikidata_id"] = source_id
705
+ if canonical_match.canonical_id:
706
+ identifiers["canonical_id"] = canonical_match.canonical_id
707
+
708
+ # Extract role and org from database match
709
+ known_role = str(match_details.get("known_for_role", "") or "")
710
+ known_org = str(match_details.get("known_for_org", "") or "")
711
+
712
+ # Create ResolvedRole from database match
713
+ if known_role:
714
+ resolved_role = ResolvedRole(
715
+ canonical_name=known_role,
716
+ canonical_id=None, # Role ID would need separate lookup
717
+ source="wikidata",
718
+ source_id=source_id if source_id else None,
719
+ )
720
+
721
+ # Update qualifiers with info from database if not already set
722
+ if not qualifiers.role and known_role:
723
+ final_role = known_role
724
+ final_org = qualifiers.org or known_org or None
725
+ else:
726
+ final_role = qualifiers.role
727
+ final_org = qualifiers.org
728
+ else:
729
+ final_role = qualifiers.role
730
+ final_org = qualifiers.org
731
+
732
+ # Resolve organization against the organization database
733
+ org_to_resolve = final_org
734
+ if org_to_resolve:
735
+ logger.debug(f" Resolving organization: '{org_to_resolve}'")
736
+ resolved_org = self._resolve_organization(org_to_resolve)
737
+ if resolved_org:
738
+ logger.info(f" Resolved org: '{org_to_resolve}' -> '{resolved_org.canonical_name}' ({resolved_org.canonical_id})")
739
+
740
+ # Build the final qualifiers with resolved info
741
+ qualifiers = EntityQualifiers(
742
+ role=final_role,
743
+ org=final_org,
744
+ identifiers=identifiers,
745
+ resolved_role=resolved_role,
746
+ resolved_org=resolved_org,
747
+ )
748
+
749
+ # Create QualifiedEntity
750
+ qualified = QualifiedEntity(
751
+ entity_ref=entity.entity_ref,
752
+ original_text=entity.text,
753
+ entity_type=entity.type,
754
+ qualifiers=qualifiers,
755
+ qualification_sources=[self.name],
756
+ )
757
+
758
+ # Build FQN - prefer resolved names when available
759
+ if canonical_match and canonical_match.canonical_name:
760
+ # Use canonical person name from database
761
+ fqn_parts: list[str] = [canonical_match.canonical_name]
762
+ if qualifiers.role:
763
+ fqn_parts.append(f"({qualifiers.role})")
764
+ # Use resolved org name if available
765
+ if resolved_org:
766
+ fqn_parts.append(f"at {resolved_org.canonical_name}")
767
+ elif qualifiers.org:
768
+ fqn_parts.append(f"at {qualifiers.org}")
769
+ fqn = " ".join(fqn_parts)
770
+ else:
771
+ # Build FQN: "Person Name (Role, Org)" or "Person Name (Role)" or "Person Name (Org)"
772
+ fqn_parts_for_display: list[str] = []
773
+ if qualifiers.role:
774
+ fqn_parts_for_display.append(qualifiers.role)
775
+ # Use resolved org name if available
776
+ if resolved_org:
777
+ fqn_parts_for_display.append(resolved_org.canonical_name)
778
+ elif qualifiers.org:
779
+ fqn_parts_for_display.append(qualifiers.org)
780
+
781
+ if fqn_parts_for_display:
782
+ fqn = f"{entity.text} ({', '.join(fqn_parts_for_display)})"
783
+ else:
784
+ fqn = entity.text
785
+
786
+ return CanonicalEntity(
787
+ entity_ref=entity.entity_ref,
788
+ qualified_entity=qualified,
789
+ canonical_match=canonical_match,
790
+ fqn=fqn,
791
+ )
792
+
219
793
 
220
794
  # Allow importing without decorator for testing
221
795
  PersonQualifierPluginClass = PersonQualifierPlugin