corp-extractor 0.4.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +348 -64
  2. corp_extractor-0.9.0.dist-info/RECORD +76 -0
  3. statement_extractor/__init__.py +10 -1
  4. statement_extractor/cli.py +1663 -17
  5. statement_extractor/data/default_predicates.json +368 -0
  6. statement_extractor/data/statement_taxonomy.json +6972 -0
  7. statement_extractor/database/__init__.py +52 -0
  8. statement_extractor/database/embeddings.py +186 -0
  9. statement_extractor/database/hub.py +520 -0
  10. statement_extractor/database/importers/__init__.py +24 -0
  11. statement_extractor/database/importers/companies_house.py +545 -0
  12. statement_extractor/database/importers/gleif.py +538 -0
  13. statement_extractor/database/importers/sec_edgar.py +375 -0
  14. statement_extractor/database/importers/wikidata.py +1012 -0
  15. statement_extractor/database/importers/wikidata_people.py +632 -0
  16. statement_extractor/database/models.py +230 -0
  17. statement_extractor/database/resolver.py +245 -0
  18. statement_extractor/database/store.py +1609 -0
  19. statement_extractor/document/__init__.py +62 -0
  20. statement_extractor/document/chunker.py +410 -0
  21. statement_extractor/document/context.py +171 -0
  22. statement_extractor/document/deduplicator.py +173 -0
  23. statement_extractor/document/html_extractor.py +246 -0
  24. statement_extractor/document/loader.py +303 -0
  25. statement_extractor/document/pipeline.py +388 -0
  26. statement_extractor/document/summarizer.py +195 -0
  27. statement_extractor/extractor.py +1 -23
  28. statement_extractor/gliner_extraction.py +4 -74
  29. statement_extractor/llm.py +255 -0
  30. statement_extractor/models/__init__.py +89 -0
  31. statement_extractor/models/canonical.py +182 -0
  32. statement_extractor/models/document.py +308 -0
  33. statement_extractor/models/entity.py +102 -0
  34. statement_extractor/models/labels.py +220 -0
  35. statement_extractor/models/qualifiers.py +139 -0
  36. statement_extractor/models/statement.py +101 -0
  37. statement_extractor/models.py +4 -1
  38. statement_extractor/pipeline/__init__.py +39 -0
  39. statement_extractor/pipeline/config.py +129 -0
  40. statement_extractor/pipeline/context.py +177 -0
  41. statement_extractor/pipeline/orchestrator.py +416 -0
  42. statement_extractor/pipeline/registry.py +303 -0
  43. statement_extractor/plugins/__init__.py +55 -0
  44. statement_extractor/plugins/base.py +716 -0
  45. statement_extractor/plugins/extractors/__init__.py +13 -0
  46. statement_extractor/plugins/extractors/base.py +9 -0
  47. statement_extractor/plugins/extractors/gliner2.py +546 -0
  48. statement_extractor/plugins/labelers/__init__.py +29 -0
  49. statement_extractor/plugins/labelers/base.py +9 -0
  50. statement_extractor/plugins/labelers/confidence.py +138 -0
  51. statement_extractor/plugins/labelers/relation_type.py +87 -0
  52. statement_extractor/plugins/labelers/sentiment.py +159 -0
  53. statement_extractor/plugins/labelers/taxonomy.py +386 -0
  54. statement_extractor/plugins/labelers/taxonomy_embedding.py +477 -0
  55. statement_extractor/plugins/pdf/__init__.py +10 -0
  56. statement_extractor/plugins/pdf/pypdf.py +291 -0
  57. statement_extractor/plugins/qualifiers/__init__.py +30 -0
  58. statement_extractor/plugins/qualifiers/base.py +9 -0
  59. statement_extractor/plugins/qualifiers/companies_house.py +185 -0
  60. statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
  61. statement_extractor/plugins/qualifiers/gleif.py +197 -0
  62. statement_extractor/plugins/qualifiers/person.py +785 -0
  63. statement_extractor/plugins/qualifiers/sec_edgar.py +209 -0
  64. statement_extractor/plugins/scrapers/__init__.py +10 -0
  65. statement_extractor/plugins/scrapers/http.py +236 -0
  66. statement_extractor/plugins/splitters/__init__.py +13 -0
  67. statement_extractor/plugins/splitters/base.py +9 -0
  68. statement_extractor/plugins/splitters/t5_gemma.py +293 -0
  69. statement_extractor/plugins/taxonomy/__init__.py +13 -0
  70. statement_extractor/plugins/taxonomy/embedding.py +484 -0
  71. statement_extractor/plugins/taxonomy/mnli.py +291 -0
  72. statement_extractor/scoring.py +8 -8
  73. corp_extractor-0.4.0.dist-info/RECORD +0 -12
  74. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
  75. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,785 @@
1
+ """
2
+ PersonQualifierPlugin - Qualifies PERSON entities with role, organization, and canonical ID.
3
+
4
+ Uses Gemma3 12B (instruction-tuned) to extract:
5
+ - role: Job title/position (e.g., "CEO", "President")
6
+ - org: Organization/employer (e.g., "Apple Inc", "Microsoft")
7
+
8
+ Then searches the person database to find canonical matches for notable people
9
+ (those in Wikipedia/Wikidata), using extracted role/org to help disambiguate.
10
+ """
11
+
12
+ import json
13
+ import logging
14
+ import re
15
+ from typing import Optional
16
+
17
+ from ..base import BaseQualifierPlugin, PluginCapability
18
+ from ...pipeline.context import PipelineContext
19
+ from ...pipeline.registry import PluginRegistry
20
+ from ...models import (
21
+ ExtractedEntity,
22
+ EntityQualifiers,
23
+ EntityType,
24
+ QualifiedEntity,
25
+ CanonicalEntity,
26
+ CanonicalMatch,
27
+ ResolvedRole,
28
+ ResolvedOrganization,
29
+ )
30
+ from ...llm import LLM
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+
35
+ # Prompt template used to ask the LLM which database candidate (if any)
+ # corresponds to a person name extracted from text.
+ # Filled in with str.format(); placeholders: {query_name}, {context_info},
+ # {source_preview} and {candidates} (a numbered, one-per-line candidate list).
+ # The model is instructed to answer with a candidate number or "NONE".
36
+ PERSON_MATCH_PROMPT = """You are matching a person name extracted from text to a database of notable people.
37
+
38
+ Extracted name: "{query_name}"
39
+ Context from text: {context_info}
40
+ Source text: "{source_preview}"
41
+
42
+ Candidates from database (with Wikipedia info):
43
+ {candidates}
44
+
45
+ Task: Select the BEST match, or respond "NONE" if no candidate is a good match.
46
+
47
+ Rules:
48
+ - The match should refer to the same person
49
+ - Consider whether the role and organization from the text match the Wikipedia info
50
+ - Different people with similar names should NOT match
51
+ - If the extracted name is too generic or ambiguous, respond "NONE"
52
+
53
+ Respond with ONLY the number of the best match (1, 2, 3, etc.) or "NONE".
54
+ """
55
+
56
+
57
+ @PluginRegistry.qualifier
58
+ class PersonQualifierPlugin(BaseQualifierPlugin):
59
+ """
60
+ Qualifier plugin for PERSON entities.
61
+
62
+ Uses Gemma3 12B to extract role and organization from context.
63
+ Then searches the person database to find canonical matches for notable people.
64
+ Falls back to pattern matching if LLM is not available.
65
+ """
66
+
67
# Role patterns for the regex fallback. They are tried in list order and the
# first pattern that matches anywhere in the text wins, so more specific
# titles have to be listed before the generic ones they contain.
ROLE_PATTERNS = [
    r"\b(CEO|CFO|CTO|COO|CMO|CIO|CISO|CSO)\b",
    r"\b(Chief\s+\w+\s+Officer)\b",
    # Must precede the bare "President" pattern below; otherwise
    # "Vice President" is reported as just "President".
    r"\b(Vice\s+President|VP)\b",
    r"\b(President|Chairman|Director|Manager|Executive|Founder|Co-Founder)\b",
    r"\b(Head\s+of\s+\w+)\b",
    r"\b(Senior\s+\w+|Lead\s+\w+|Principal\s+\w+)\b",
]
76
+
77
+ def __init__(
78
+ self,
79
+ model_id: str = "google/gemma-3-12b-it-qat-q4_0-gguf",
80
+ gguf_file: Optional[str] = None,
81
+ use_llm: bool = True,
82
+ use_4bit: bool = True,
83
+ use_database: bool = True,
84
+ db_path: Optional[str] = None,
85
+ top_k: int = 10,
86
+ min_similarity: float = 0.5,
87
+ auto_download_db: bool = True,
88
+ ):
89
+ """
90
+ Initialize the person qualifier.
91
+
92
+ Args:
93
+ model_id: HuggingFace model ID for LLM qualification
94
+ gguf_file: GGUF filename for quantized models (auto-detected if model_id ends with -gguf)
95
+ use_llm: Whether to use LLM
96
+ use_4bit: Use 4-bit quantization (requires bitsandbytes, ignored for GGUF)
97
+ use_database: Whether to use person database for canonical matching
98
+ db_path: Path to database (auto-detects if None)
99
+ top_k: Number of candidates to retrieve from database
100
+ min_similarity: Minimum similarity threshold for database matches
101
+ auto_download_db: Whether to auto-download database from HuggingFace
102
+ """
103
+ self._use_llm = use_llm
104
+ self._use_database = use_database
105
+ self._db_path = db_path
106
+ self._top_k = top_k
107
+ self._min_similarity = min_similarity
108
+ self._auto_download_db = auto_download_db
109
+
110
+ self._llm: Optional[LLM] = None
111
+ if use_llm:
112
+ self._llm = LLM(
113
+ model_id=model_id,
114
+ gguf_file=gguf_file,
115
+ use_4bit=use_4bit,
116
+ )
117
+
118
+ # Lazy-loaded components
119
+ self._database = None
120
+ self._embedder = None
121
+ self._cache: dict[str, Optional[CanonicalEntity]] = {}
122
+
123
@property
def name(self) -> str:
    """Short identifier of this plugin (recorded in ``qualification_sources``)."""
    plugin_name = "person_qualifier"
    return plugin_name
126
+
127
@property
def priority(self) -> int:
    """Scheduling priority of this plugin; 10 is treated as high for PERSON entities."""
    high_priority = 10
    return high_priority
130
+
131
@property
def capabilities(self) -> PluginCapability:
    """Capability flags: always CACHING, plus LLM_REQUIRED when the LLM is enabled."""
    if self._use_llm:
        return PluginCapability.CACHING | PluginCapability.LLM_REQUIRED
    return PluginCapability.CACHING
137
+
138
@property
def description(self) -> str:
    """One-line human-readable summary of what this plugin does."""
    summary = "Extracts role and organization for PERSON entities, with optional database lookup for notable people"
    return summary
141
+
142
@property
def supported_entity_types(self) -> set[EntityType]:
    """Entity types this plugin handles: PERSON only."""
    supported = {EntityType.PERSON}
    return supported
145
+
146
@property
def provided_identifier_types(self) -> list[str]:
    """Identifier keys this plugin can add to an entity's identifiers."""
    identifier_keys = ["wikidata_id"]
    return identifier_keys
149
+
150
+ def _get_database(self):
151
+ """Get or initialize the person database."""
152
+ if self._database is not None:
153
+ return self._database
154
+
155
+ if not self._use_database:
156
+ return None
157
+
158
+ try:
159
+ from ...database.store import get_person_database
160
+ from ...database.hub import get_database_path
161
+
162
+ # Find database path
163
+ db_path = self._db_path
164
+ if db_path is None:
165
+ db_path = get_database_path(auto_download=self._auto_download_db)
166
+
167
+ if db_path is None:
168
+ logger.warning("Person database not available. Skipping database qualification.")
169
+ return None
170
+
171
+ # Use singleton to ensure database is only loaded once
172
+ self._database = get_person_database(db_path=db_path)
173
+ logger.info(f"Loaded person database from {db_path}")
174
+ return self._database
175
+
176
+ except Exception as e:
177
+ logger.warning(f"Failed to load person database: {e}")
178
+ return None
179
+
180
+ def _get_embedder(self):
181
+ """Get or initialize the embedder."""
182
+ if self._embedder is not None:
183
+ return self._embedder
184
+
185
+ try:
186
+ from ...database import CompanyEmbedder
187
+ self._embedder = CompanyEmbedder()
188
+ return self._embedder
189
+ except Exception as e:
190
+ logger.warning(f"Failed to load embedder: {e}")
191
+ return None
192
+
193
+ def _get_org_resolver(self):
194
+ """Get or initialize the organization resolver."""
195
+ if not hasattr(self, '_org_resolver'):
196
+ self._org_resolver = None
197
+
198
+ if self._org_resolver is not None:
199
+ return self._org_resolver
200
+
201
+ try:
202
+ from ...database.resolver import get_organization_resolver
203
+ self._org_resolver = get_organization_resolver(
204
+ db_path=self._db_path,
205
+ auto_download_db=self._auto_download_db,
206
+ )
207
+ return self._org_resolver
208
+ except Exception as e:
209
+ logger.warning(f"Failed to initialize organization resolver: {e}")
210
+ return None
211
+
212
def _resolve_organization(self, org_name: str) -> Optional[ResolvedOrganization]:
    """
    Look up an organization name via the shared organization resolver.

    Args:
        org_name: Organization name to resolve

    Returns:
        Whatever the resolver's ``resolve`` returns for the name, or None
        when no resolver is available.
    """
    resolver = self._get_org_resolver()
    return resolver.resolve(org_name) if resolver is not None else None
229
+
230
def qualify(
    self,
    entity: ExtractedEntity,
    context: PipelineContext,
) -> Optional[CanonicalEntity]:
    """
    Qualify a PERSON entity with role, organization and, if found, a canonical ID.

    Role/org come from the LLM when it is enabled and returns something,
    otherwise from regex patterns. When database use is enabled, the person
    database is then searched for a canonical match.

    Args:
        entity: The PERSON entity to qualify
        context: Pipeline context; its source_text is the text searched

    Returns:
        CanonicalEntity with role/org qualifiers and FQN, or None for
        non-PERSON entities or when neither qualifiers nor a match were found
    """
    if entity.type != EntityType.PERSON:
        return None

    # NOTE(review): the cache key is the entity text only, so a result
    # (including its entity_ref and the role/org taken from one source text)
    # is reused for any later entity with the same text — confirm the plugin
    # instance is not shared across unrelated documents.
    cache_key = entity.text.lower().strip()
    try:
        return self._cache[cache_key]
    except KeyError:
        pass

    # The whole source text is handed to the extractors.
    source_text = context.source_text

    # Step 1: role/org from the LLM, falling back to pattern matching.
    role_org: Optional[EntityQualifiers] = None
    if self._llm is not None:
        llm_result = self._extract_with_llm(entity.text, source_text)
        if llm_result and (llm_result.role or llm_result.org):
            role_org = llm_result
    if role_org is None:
        role_org = self._extract_with_patterns(entity.text, source_text)

    # Step 2: canonical match from the person database, if enabled.
    canonical_match = None
    if self._use_database:
        canonical_match = self._search_database(
            entity.text,
            role_org.role if role_org else None,
            role_org.org if role_org else None,
            context,
        )

    # Step 3: build the result; "nothing found" is cached as None as well.
    if role_org is None and canonical_match is None:
        canonical = None
    else:
        canonical = self._build_canonical_entity(entity, role_org, canonical_match)
    self._cache[cache_key] = canonical
    return canonical
286
+
287
def _search_database(
    self,
    person_name: str,
    extracted_role: Optional[str],
    extracted_org: Optional[str],
    context: PipelineContext,
) -> Optional[CanonicalMatch]:
    """
    Search the person database for a canonical match.

    Candidates are retrieved by embedding similarity, filtered by the
    minimum similarity, re-scored with role/org boosts and then passed to
    _select_best_match for the final decision.

    Args:
        person_name: Name of the person
        extracted_role: Role extracted from text (e.g., "CEO")
        extracted_org: Organization extracted from text (e.g., "Apple Inc")
        context: Pipeline context (forwarded to the match selection)

    Returns:
        CanonicalMatch for the selected record, or None when the database or
        embedder is unavailable, nothing passes the threshold, or no
        candidate is selected
    """
    database = self._get_database()
    if database is None:
        return None

    embedder = self._get_embedder()
    if embedder is None:
        return None

    logger.debug(f" Embedding person name: '{person_name}'")
    name_vector = embedder.embed(person_name)

    # The name is also passed as text so the database can pre-filter.
    logger.debug(" Searching person database...")
    hits = database.search(
        name_vector,
        top_k=self._top_k,
        query_text=person_name,
    )

    threshold = self._min_similarity
    hits = [(rec, sim) for rec, sim in hits if sim >= threshold]
    if not hits:
        logger.debug(f" No person matches found above threshold {self._min_similarity}")
        return None

    # Attach the role/org-boosted score and rank by it, best first.
    ranked = sorted(
        (
            (rec, sim, self._compute_match_score(rec, sim, extracted_role, extracted_org))
            for rec, sim in hits
        ),
        key=lambda item: item[2],
        reverse=True,
    )

    logger.info(f" Found {len(ranked)} candidates for '{person_name}':")
    for i, (record, sim, boosted) in enumerate(ranked[:5], 1):
        role_str = f" ({record.known_for_role})" if record.known_for_role else ""
        org_str = f" at {record.known_for_org}" if record.known_for_org else ""
        logger.info(f" {i}. {record.name}{role_str}{org_str} (sim={sim:.3f}, boosted={boosted:.3f})")

    logger.info(f" Selecting best match (LLM={self._llm is not None})...")
    winner = self._select_best_match(person_name, ranked, extracted_role, extracted_org, context)
    if winner is None:
        logger.info(f" No confident match for '{person_name}'")
        return None

    record, similarity, boosted = winner
    logger.info(f" Matched: '{record.name}' (wikidata:{record.source_id}, similarity={similarity:.3f})")

    details = {
        "source": "wikidata",
        "source_id": record.source_id,
        "similarity": similarity,
        "known_for_role": record.known_for_role,
        "known_for_org": record.known_for_org,
    }
    return CanonicalMatch(
        canonical_id=f"wikidata:{record.source_id}",
        canonical_name=record.name,
        match_method="embedding",
        # Clamp the boosted score into [0, 1] for use as a confidence.
        match_confidence=min(max(boosted, 0.0), 1.0),
        match_details=details,
    )
378
+
379
+ def _compute_match_score(
380
+ self,
381
+ record,
382
+ embedding_similarity: float,
383
+ extracted_role: Optional[str],
384
+ extracted_org: Optional[str],
385
+ ) -> float:
386
+ """
387
+ Compute boosted match score using role/org context.
388
+
389
+ Boosts similarity score if extracted role/org matches database record.
390
+ """
391
+ score = embedding_similarity
392
+
393
+ # Boost if role matches (fuzzy)
394
+ if extracted_role and record.known_for_role:
395
+ if self._role_matches(extracted_role, record.known_for_role):
396
+ score += 0.1 # +10% boost
397
+ logger.debug(f" Role match boost: {extracted_role} ~ {record.known_for_role}")
398
+
399
+ # Boost if org matches (fuzzy)
400
+ if extracted_org and record.known_for_org:
401
+ if self._org_matches(extracted_org, record.known_for_org):
402
+ score += 0.15 # +15% boost (org is stronger signal)
403
+ logger.debug(f" Org match boost: {extracted_org} ~ {record.known_for_org}")
404
+
405
+ return min(score, 1.0) # Cap at 1.0
406
+
407
+ def _role_matches(self, extracted: str, known: str) -> bool:
408
+ """Fuzzy role matching."""
409
+ extracted_lower = extracted.lower().strip()
410
+ known_lower = known.lower().strip()
411
+
412
+ # Exact match
413
+ if extracted_lower == known_lower:
414
+ return True
415
+
416
+ # CEO variants
417
+ ceo_variants = {"ceo", "chief executive", "chief executive officer"}
418
+ if extracted_lower in ceo_variants and known_lower in ceo_variants:
419
+ return True
420
+
421
+ # CFO variants
422
+ cfo_variants = {"cfo", "chief financial officer"}
423
+ if extracted_lower in cfo_variants and known_lower in cfo_variants:
424
+ return True
425
+
426
+ # President variants
427
+ president_variants = {"president", "chairman", "chairman and ceo"}
428
+ if extracted_lower in president_variants and known_lower in president_variants:
429
+ return True
430
+
431
+ # Founder variants
432
+ founder_variants = {"founder", "co-founder", "cofounder", "founding member"}
433
+ if extracted_lower in founder_variants and known_lower in founder_variants:
434
+ return True
435
+
436
+ # Contains check for partial matches
437
+ if extracted_lower in known_lower or known_lower in extracted_lower:
438
+ return True
439
+
440
+ return False
441
+
442
+ def _org_matches(self, extracted: str, known: str) -> bool:
443
+ """Fuzzy org matching using simple normalization."""
444
+ # Normalize both
445
+ extracted_norm = self._normalize_org_name(extracted)
446
+ known_norm = self._normalize_org_name(known)
447
+
448
+ # Exact normalized match
449
+ if extracted_norm == known_norm:
450
+ return True
451
+
452
+ # Check if one contains the other (e.g., "Apple" in "Apple Inc")
453
+ if extracted_norm in known_norm or known_norm in extracted_norm:
454
+ return True
455
+
456
+ return False
457
+
458
+ def _normalize_org_name(self, name: str) -> str:
459
+ """Simple org name normalization."""
460
+ # Lowercase
461
+ normalized = name.lower().strip()
462
+
463
+ # Remove common suffixes
464
+ suffixes = [
465
+ " inc.", " inc", " corp.", " corp", " corporation",
466
+ " ltd.", " ltd", " limited", " llc", " plc",
467
+ " co.", " co", " company",
468
+ ]
469
+ for suffix in suffixes:
470
+ if normalized.endswith(suffix):
471
+ normalized = normalized[:-len(suffix)]
472
+
473
+ return normalized.strip()
474
+
475
def _select_best_match(
    self,
    query_name: str,
    candidates: list[tuple],
    extracted_role: Optional[str],
    extracted_org: Optional[str],
    context: PipelineContext,
) -> Optional[tuple]:
    """
    Select the best match from scored candidates.

    Decision order: a single candidate scoring at least 0.9 is accepted
    directly; otherwise the LLM decides when one is configured; if there is
    no LLM or the LLM call raises, the top candidate is accepted only with a
    boosted score of at least 0.85.

    Args:
        query_name: Person name being matched
        candidates: (record, similarity, boosted_score) tuples, best first
        extracted_role: Role extracted from text, if any
        extracted_org: Organization extracted from text, if any
        context: Pipeline context (forwarded to the LLM selection)

    Returns:
        The chosen candidate tuple, or None if no candidate is accepted
    """
    if not candidates:
        return None

    top = candidates[0]

    # A lone, very strong candidate needs no further confirmation.
    if len(candidates) == 1 and top[2] >= 0.9:
        logger.info(f" Single strong match: '{top[0].name}' (boosted={top[2]:.3f})")
        return top

    if self._llm is not None:
        try:
            # The LLM's verdict is final, including an explicit "no match".
            return self._llm_select_match(query_name, candidates, extracted_role, extracted_org, context)
        except Exception as e:
            logger.warning(f" LLM confirmation failed: {e}")

    # Reached without an LLM, or after the LLM call raised.
    top_record, _top_similarity, top_boosted = top
    if top_boosted < 0.85:
        logger.info(f" No confident match (top boosted={top_boosted:.3f} < 0.85)")
        return None

    logger.info(f" No LLM, using top match: '{top_record.name}' (boosted={top_boosted:.3f})")
    return top
511
+
512
def _llm_select_match(
    self,
    query_name: str,
    candidates: list[tuple],
    extracted_role: Optional[str],
    extracted_org: Optional[str],
    context: PipelineContext,
) -> Optional[tuple]:
    """
    Use the LLM to select the best match among the top candidates.

    The first ten candidates are listed in the prompt. The reply is parsed
    leniently: a reply starting with "NONE" (any case, with or without
    punctuation) rejects all candidates, and a leading number such as "2",
    "2." or "#2" selects that candidate. An unparseable or out-of-range
    reply falls back to the top candidate if its boosted score is at
    least 0.8.

    Args:
        query_name: Person name being matched
        candidates: Non-empty list of (record, similarity, boosted_score)
            tuples, best first
        extracted_role: Role extracted from text, if any
        extracted_org: Organization extracted from text, if any
        context: Pipeline context; the start of its source_text is quoted
            in the prompt

    Returns:
        The chosen candidate tuple, or None if the LLM rejects all
        candidates or no confident fallback exists
    """
    # Only these candidates are shown to the model, so only these may be
    # selected by number.
    shown = candidates[:10]

    candidate_lines = []
    for i, (record, similarity, boosted) in enumerate(shown, 1):
        role_str = f", {record.known_for_role}" if record.known_for_role else ""
        org_str = f" at {record.known_for_org}" if record.known_for_org else ""
        country_str = f", {record.country}" if record.country else ""
        candidate_lines.append(
            f"{i}. {record.name}{role_str}{org_str}{country_str} (score: {boosted:.2f})"
        )

    # Summarize what was extracted from the text for the prompt.
    context_parts = []
    if extracted_role:
        context_parts.append(f"role={extracted_role}")
    if extracted_org:
        context_parts.append(f"org={extracted_org}")
    context_info = ", ".join(context_parts) if context_parts else "no role/org extracted"

    # Quote at most the first 300 characters of the source text.
    source_preview = ""
    if context.source_text:
        source_preview = context.source_text[:300] + "..." if len(context.source_text) > 300 else context.source_text

    prompt = PERSON_MATCH_PROMPT.format(
        query_name=query_name,
        context_info=context_info,
        source_preview=source_preview,
        candidates="\n".join(candidate_lines),
    )

    assert self._llm is not None
    response = self._llm.generate(prompt, max_tokens=10, stop=["\n"])
    response = response.strip()

    logger.info(f" LLM response for '{query_name}': {response}")

    # An explicit rejection must never fall through to the score-based
    # fallback, even when the model adds punctuation ("NONE.", '"NONE"').
    if re.match(r"\W*NONE\b", response, re.IGNORECASE):
        logger.info(" LLM chose: NONE (no match)")
        return None

    # Accept a leading number with optional decoration ("2", "2.", "#2").
    number = re.match(r"\W*(\d+)\b", response)
    if number is None:
        logger.warning(f" LLM response '{response}' could not be parsed as number")
    else:
        idx = int(number.group(1)) - 1
        if 0 <= idx < len(shown):
            chosen = shown[idx]
            logger.info(f" LLM chose: #{idx + 1} '{chosen[0].name}' (boosted={chosen[2]:.3f})")
            return chosen

    # Unclear or out-of-range reply: use the top candidate only if it is decent.
    if candidates[0][2] >= 0.8:
        logger.info(f" Fallback to top match: '{candidates[0][0].name}' (boosted={candidates[0][2]:.3f})")
        return candidates[0]

    logger.info(f" No confident match (top boosted={candidates[0][2]:.3f} < 0.8)")
    return None
579
+
580
def _extract_with_llm(
    self,
    person_name: str,
    context_text: str,
) -> Optional[EntityQualifiers]:
    """
    Extract role and organization for a person using the LLM.

    The model is asked for a JSON object with "role" and "org". Output that
    cannot be used (no JSON object, malformed JSON, non-string or
    empty/"null" values) is treated as "nothing found" so the caller can
    fall back to pattern matching. Other failures, such as the LLM call
    itself raising, are logged and re-raised.

    Args:
        person_name: Name of the person to qualify
        context_text: Text in which the person is mentioned

    Returns:
        EntityQualifiers with role and/or org, or None if the LLM is
        disabled or produced nothing usable
    """
    if self._llm is None:
        return None

    def _clean(value) -> Optional[str]:
        # Only non-empty strings count; models sometimes emit lists,
        # numbers or the literal strings "null"/"none".
        if not isinstance(value, str):
            return None
        value = value.strip()
        if not value or value.lower() in ("null", "none"):
            return None
        return value

    try:
        prompt = f"""Extract qualifiers for a person from the given context.
Instructions:
- "role" = job title or position (e.g., "CEO", "President", "Director")
- "org" = company or organization name (e.g., "Amazon", "Apple Inc", "Microsoft")
- These are DIFFERENT things: role is a job title, org is a company name
- Return null for fields not mentioned in the context

Return ONLY valid JSON:

E.g.
<context>We interviewed Big Ducks Quacking Inc team. James is new in the role of the CEO</context>
<person>James</person>

Should return:

{{"role": "CEO", "org": "Big Ducks Quacking Inc"}}

---

<context>{context_text}</context>
<person>{person_name}</person>
"""

        logger.debug(f"LLM request: {prompt}")
        response = self._llm.generate(prompt, max_tokens=100, stop=["\n\n", "</s>"])
        logger.debug(f"LLM response: {response}")

        # Take the first flat {...} object in the response.
        json_match = re.search(r'\{[^}]+\}', response)
        if json_match:
            try:
                data = json.loads(json_match.group())
            except json.JSONDecodeError as e:
                # Malformed model output is expected now and then; it
                # should not abort the pipeline.
                logger.warning(f"LLM returned malformed JSON: {e}")
                return None

            role = _clean(data.get("role"))
            org = _clean(data.get("org"))

            # Validate: role and org should be different (reject if same)
            if role and org and role.lower() == org.lower():
                logger.debug(f"Rejected duplicate role/org: {role}")
                org = None  # Clear org if it's same as role

            if role or org:
                return EntityQualifiers(role=role, org=org)

    except Exception as e:
        logger.exception(f"LLM extraction failed: {e}")
        raise  # bare raise keeps the original traceback

    return None
637
+
638
def _extract_with_patterns(
    self,
    person_name: str,
    context_text: str,
) -> Optional[EntityQualifiers]:
    """
    Extract role and organization with regular expressions (LLM-free fallback).

    The role is the first ROLE_PATTERNS entry (in list order) that matches
    anywhere in the text, case-insensitively; it is not tied to the person's
    position in the text. The organization is taken from "<name> ... of/at
    <Org>" or "<Org>('s) <name>" constructions.

    Args:
        person_name: Name of the person to qualify
        context_text: Text to search

    Returns:
        EntityQualifiers with role and/or org, or None if neither was found
    """
    # Role: first pattern, in list order, that matches anywhere in the text.
    role_hits = (re.search(p, context_text, re.IGNORECASE) for p in self.ROLE_PATTERNS)
    role = next((hit.group(1) for hit in role_hits if hit), None)

    # Organization: phrases anchored on the (escaped) person name.
    name_rx = re.escape(person_name)
    org_patterns = [
        rf'{name_rx}[^.]*?\bof\s+([A-Z][A-Za-z\s&]+(?:Inc|Corp|Ltd|LLC|Company|Co)?\.?)',
        rf'{name_rx}[^.]*?\bat\s+([A-Z][A-Za-z\s&]+(?:Inc|Corp|Ltd|LLC|Company|Co)?\.?)',
        rf'([A-Z][A-Za-z\s&]+(?:Inc|Corp|Ltd|LLC|Company|Co)?\.?)\s*(?:\'s|s)?\s*{name_rx}',
    ]

    org = None
    for org_rx in org_patterns:
        found = re.search(org_rx, context_text)
        if found is None:
            continue
        # Trim surrounding whitespace, then trailing punctuation.
        org = found.group(1).strip().rstrip('.,;')
        break

    if not (role or org):
        return None
    return EntityQualifiers(role=role, org=org)
673
+
674
def _build_canonical_entity(
    self,
    entity: ExtractedEntity,
    qualifiers: Optional[EntityQualifiers],
    canonical_match: Optional[CanonicalMatch],
) -> CanonicalEntity:
    """
    Build a CanonicalEntity from qualifiers and an optional canonical match.

    Values extracted from the text take precedence. When the text gave no
    role, the role and any missing organization are filled from the
    database match. The organization is then resolved against the
    organization database and a fully qualified name (FQN) is composed.

    Args:
        entity: The entity being qualified
        qualifiers: Role/org extracted from the text, or None
        canonical_match: Person database match, or None

    Returns:
        CanonicalEntity wrapping the qualified entity, the match and the FQN
    """
    if qualifiers is None:
        qualifiers = EntityQualifiers()

    # Start from identifiers already present on the qualifiers.
    identifiers: dict[str, str] = dict(qualifiers.identifiers) if qualifiers.identifiers else {}
    resolved_role: Optional[ResolvedRole] = None
    resolved_org: Optional[ResolvedOrganization] = None

    final_role = qualifiers.role
    final_org = qualifiers.org

    if canonical_match:
        match_details = canonical_match.match_details or {}

        # A missing or None source_id must not become the string "None".
        raw_source_id = match_details.get("source_id")
        source_id = "" if raw_source_id is None else str(raw_source_id)
        if source_id:
            identifiers["wikidata_id"] = source_id
        if canonical_match.canonical_id:
            identifiers["canonical_id"] = canonical_match.canonical_id

        # Role and org the database associates with the matched person.
        known_role = str(match_details.get("known_for_role", "") or "")
        known_org = str(match_details.get("known_for_org", "") or "")

        if known_role:
            resolved_role = ResolvedRole(
                canonical_name=known_role,
                canonical_id=None,  # Role ID would need separate lookup
                source="wikidata",
                source_id=source_id if source_id else None,
            )

        # A role found in the text is trusted as-is, together with whatever
        # org the text gave. Without one, fall back to the database; the org
        # fallback applies even when the database has no role for the person.
        if not qualifiers.role:
            final_role = known_role or None
            final_org = qualifiers.org or known_org or None

    # Resolve organization against the organization database.
    if final_org:
        logger.debug(f" Resolving organization: '{final_org}'")
        resolved_org = self._resolve_organization(final_org)
        if resolved_org:
            logger.info(f" Resolved org: '{final_org}' -> '{resolved_org.canonical_name}' ({resolved_org.canonical_id})")

    # Final qualifiers carry both raw and resolved information.
    qualifiers = EntityQualifiers(
        role=final_role,
        org=final_org,
        identifiers=identifiers,
        resolved_role=resolved_role,
        resolved_org=resolved_org,
    )

    qualified = QualifiedEntity(
        entity_ref=entity.entity_ref,
        original_text=entity.text,
        entity_type=entity.type,
        qualifiers=qualifiers,
        qualification_sources=[self.name],
    )

    # Build FQN - prefer resolved names when available
    if canonical_match and canonical_match.canonical_name:
        # Matched person: "Canonical Name (Role) at Org"
        fqn_parts: list[str] = [canonical_match.canonical_name]
        if qualifiers.role:
            fqn_parts.append(f"({qualifiers.role})")
        if resolved_org:
            fqn_parts.append(f"at {resolved_org.canonical_name}")
        elif qualifiers.org:
            fqn_parts.append(f"at {qualifiers.org}")
        fqn = " ".join(fqn_parts)
    else:
        # Unmatched person: "Person Name (Role, Org)", "(Role)", "(Org)" or bare name
        fqn_parts_for_display: list[str] = []
        if qualifiers.role:
            fqn_parts_for_display.append(qualifiers.role)
        if resolved_org:
            fqn_parts_for_display.append(resolved_org.canonical_name)
        elif qualifiers.org:
            fqn_parts_for_display.append(qualifiers.org)

        if fqn_parts_for_display:
            fqn = f"{entity.text} ({', '.join(fqn_parts_for_display)})"
        else:
            fqn = entity.text

    return CanonicalEntity(
        entity_ref=entity.entity_ref,
        qualified_entity=qualified,
        canonical_match=canonical_match,
        fqn=fqn,
    )
782
+
783
+
784
+ # Alias for PersonQualifierPlugin, kept so tests can import the class under this name.
+ # NOTE(review): this binds the same object as the decorated name above, so it only
+ # gives access "without the decorator" if PluginRegistry.qualifier returns the class
+ # unchanged — confirm.
785
+ PersonQualifierPluginClass = PersonQualifierPlugin