corp-extractor 0.5.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +191 -24
- corp_extractor-0.9.0.dist-info/RECORD +76 -0
- statement_extractor/__init__.py +1 -1
- statement_extractor/cli.py +1227 -10
- statement_extractor/data/statement_taxonomy.json +6949 -1159
- statement_extractor/database/__init__.py +52 -0
- statement_extractor/database/embeddings.py +186 -0
- statement_extractor/database/hub.py +520 -0
- statement_extractor/database/importers/__init__.py +24 -0
- statement_extractor/database/importers/companies_house.py +545 -0
- statement_extractor/database/importers/gleif.py +538 -0
- statement_extractor/database/importers/sec_edgar.py +375 -0
- statement_extractor/database/importers/wikidata.py +1012 -0
- statement_extractor/database/importers/wikidata_people.py +632 -0
- statement_extractor/database/models.py +230 -0
- statement_extractor/database/resolver.py +245 -0
- statement_extractor/database/store.py +1609 -0
- statement_extractor/document/__init__.py +62 -0
- statement_extractor/document/chunker.py +410 -0
- statement_extractor/document/context.py +171 -0
- statement_extractor/document/deduplicator.py +173 -0
- statement_extractor/document/html_extractor.py +246 -0
- statement_extractor/document/loader.py +303 -0
- statement_extractor/document/pipeline.py +388 -0
- statement_extractor/document/summarizer.py +195 -0
- statement_extractor/models/__init__.py +16 -1
- statement_extractor/models/canonical.py +44 -1
- statement_extractor/models/document.py +308 -0
- statement_extractor/models/labels.py +47 -18
- statement_extractor/models/qualifiers.py +51 -3
- statement_extractor/models/statement.py +26 -0
- statement_extractor/pipeline/config.py +6 -11
- statement_extractor/pipeline/orchestrator.py +80 -111
- statement_extractor/pipeline/registry.py +52 -46
- statement_extractor/plugins/__init__.py +20 -8
- statement_extractor/plugins/base.py +334 -64
- statement_extractor/plugins/extractors/gliner2.py +10 -0
- statement_extractor/plugins/labelers/taxonomy.py +18 -5
- statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
- statement_extractor/plugins/pdf/__init__.py +10 -0
- statement_extractor/plugins/pdf/pypdf.py +291 -0
- statement_extractor/plugins/qualifiers/__init__.py +11 -0
- statement_extractor/plugins/qualifiers/companies_house.py +14 -3
- statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
- statement_extractor/plugins/qualifiers/gleif.py +14 -3
- statement_extractor/plugins/qualifiers/person.py +578 -14
- statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
- statement_extractor/plugins/scrapers/__init__.py +10 -0
- statement_extractor/plugins/scrapers/http.py +236 -0
- statement_extractor/plugins/splitters/t5_gemma.py +158 -53
- statement_extractor/plugins/taxonomy/embedding.py +193 -46
- statement_extractor/plugins/taxonomy/mnli.py +16 -4
- statement_extractor/scoring.py +8 -8
- corp_extractor-0.5.0.dist-info/RECORD +0 -55
- statement_extractor/plugins/canonicalizers/__init__.py +0 -17
- statement_extractor/plugins/canonicalizers/base.py +0 -9
- statement_extractor/plugins/canonicalizers/location.py +0 -219
- statement_extractor/plugins/canonicalizers/organization.py +0 -230
- statement_extractor/plugins/canonicalizers/person.py +0 -242
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
|
@@ -1,9 +1,12 @@
|
|
|
1
1
|
"""
|
|
2
|
-
PersonQualifierPlugin - Qualifies PERSON entities with role and organization.
|
|
2
|
+
PersonQualifierPlugin - Qualifies PERSON entities with role, organization, and canonical ID.
|
|
3
3
|
|
|
4
4
|
Uses Gemma3 12B (instruction-tuned) to extract:
|
|
5
5
|
- role: Job title/position (e.g., "CEO", "President")
|
|
6
6
|
- org: Organization/employer (e.g., "Apple Inc", "Microsoft")
|
|
7
|
+
|
|
8
|
+
Then searches the person database to find canonical matches for notable people
|
|
9
|
+
(those in Wikipedia/Wikidata), using extracted role/org to help disambiguate.
|
|
7
10
|
"""
|
|
8
11
|
|
|
9
12
|
import json
|
|
@@ -14,19 +17,51 @@ from typing import Optional
|
|
|
14
17
|
from ..base import BaseQualifierPlugin, PluginCapability
|
|
15
18
|
from ...pipeline.context import PipelineContext
|
|
16
19
|
from ...pipeline.registry import PluginRegistry
|
|
17
|
-
from ...models import
|
|
20
|
+
from ...models import (
|
|
21
|
+
ExtractedEntity,
|
|
22
|
+
EntityQualifiers,
|
|
23
|
+
EntityType,
|
|
24
|
+
QualifiedEntity,
|
|
25
|
+
CanonicalEntity,
|
|
26
|
+
CanonicalMatch,
|
|
27
|
+
ResolvedRole,
|
|
28
|
+
ResolvedOrganization,
|
|
29
|
+
)
|
|
18
30
|
from ...llm import LLM
|
|
19
31
|
|
|
20
32
|
logger = logging.getLogger(__name__)
|
|
21
33
|
|
|
22
34
|
|
|
35
|
+
# LLM prompt template for person matching confirmation
|
|
36
|
+
PERSON_MATCH_PROMPT = """You are matching a person name extracted from text to a database of notable people.
|
|
37
|
+
|
|
38
|
+
Extracted name: "{query_name}"
|
|
39
|
+
Context from text: {context_info}
|
|
40
|
+
Source text: "{source_preview}"
|
|
41
|
+
|
|
42
|
+
Candidates from database (with Wikipedia info):
|
|
43
|
+
{candidates}
|
|
44
|
+
|
|
45
|
+
Task: Select the BEST match, or respond "NONE" if no candidate is a good match.
|
|
46
|
+
|
|
47
|
+
Rules:
|
|
48
|
+
- The match should refer to the same person
|
|
49
|
+
- Consider whether the role and organization from the text match the Wikipedia info
|
|
50
|
+
- Different people with similar names should NOT match
|
|
51
|
+
- If the extracted name is too generic or ambiguous, respond "NONE"
|
|
52
|
+
|
|
53
|
+
Respond with ONLY the number of the best match (1, 2, 3, etc.) or "NONE".
|
|
54
|
+
"""
|
|
55
|
+
|
|
56
|
+
|
|
23
57
|
@PluginRegistry.qualifier
|
|
24
58
|
class PersonQualifierPlugin(BaseQualifierPlugin):
|
|
25
59
|
"""
|
|
26
60
|
Qualifier plugin for PERSON entities.
|
|
27
61
|
|
|
28
62
|
Uses Gemma3 12B to extract role and organization from context.
|
|
29
|
-
|
|
63
|
+
Then searches the person database to find canonical matches for notable people.
|
|
64
|
+
Falls back to pattern matching if LLM is not available.
|
|
30
65
|
"""
|
|
31
66
|
|
|
32
67
|
# Common role patterns for fallback
|
|
@@ -45,6 +80,11 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
|
|
|
45
80
|
gguf_file: Optional[str] = None,
|
|
46
81
|
use_llm: bool = True,
|
|
47
82
|
use_4bit: bool = True,
|
|
83
|
+
use_database: bool = True,
|
|
84
|
+
db_path: Optional[str] = None,
|
|
85
|
+
top_k: int = 10,
|
|
86
|
+
min_similarity: float = 0.5,
|
|
87
|
+
auto_download_db: bool = True,
|
|
48
88
|
):
|
|
49
89
|
"""
|
|
50
90
|
Initialize the person qualifier.
|
|
@@ -54,8 +94,19 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
|
|
|
54
94
|
gguf_file: GGUF filename for quantized models (auto-detected if model_id ends with -gguf)
|
|
55
95
|
use_llm: Whether to use LLM
|
|
56
96
|
use_4bit: Use 4-bit quantization (requires bitsandbytes, ignored for GGUF)
|
|
97
|
+
use_database: Whether to use person database for canonical matching
|
|
98
|
+
db_path: Path to database (auto-detects if None)
|
|
99
|
+
top_k: Number of candidates to retrieve from database
|
|
100
|
+
min_similarity: Minimum similarity threshold for database matches
|
|
101
|
+
auto_download_db: Whether to auto-download database from HuggingFace
|
|
57
102
|
"""
|
|
58
103
|
self._use_llm = use_llm
|
|
104
|
+
self._use_database = use_database
|
|
105
|
+
self._db_path = db_path
|
|
106
|
+
self._top_k = top_k
|
|
107
|
+
self._min_similarity = min_similarity
|
|
108
|
+
self._auto_download_db = auto_download_db
|
|
109
|
+
|
|
59
110
|
self._llm: Optional[LLM] = None
|
|
60
111
|
if use_llm:
|
|
61
112
|
self._llm = LLM(
|
|
@@ -64,6 +115,11 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
|
|
|
64
115
|
use_4bit=use_4bit,
|
|
65
116
|
)
|
|
66
117
|
|
|
118
|
+
# Lazy-loaded components
|
|
119
|
+
self._database = None
|
|
120
|
+
self._embedder = None
|
|
121
|
+
self._cache: dict[str, Optional[CanonicalEntity]] = {}
|
|
122
|
+
|
|
67
123
|
@property
|
|
68
124
|
def name(self) -> str:
|
|
69
125
|
return "person_qualifier"
|
|
@@ -74,14 +130,14 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
|
|
|
74
130
|
|
|
75
131
|
@property
|
|
76
132
|
def capabilities(self) -> PluginCapability:
|
|
77
|
-
caps = PluginCapability.
|
|
133
|
+
caps = PluginCapability.CACHING
|
|
78
134
|
if self._use_llm:
|
|
79
135
|
caps |= PluginCapability.LLM_REQUIRED
|
|
80
136
|
return caps
|
|
81
137
|
|
|
82
138
|
@property
|
|
83
139
|
def description(self) -> str:
|
|
84
|
-
return "Extracts role and organization for PERSON entities"
|
|
140
|
+
return "Extracts role and organization for PERSON entities, with optional database lookup for notable people"
|
|
85
141
|
|
|
86
142
|
@property
|
|
87
143
|
def supported_entity_types(self) -> set[EntityType]:
|
|
@@ -89,38 +145,437 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
|
|
|
89
145
|
|
|
90
146
|
@property
|
|
91
147
|
def provided_identifier_types(self) -> list[str]:
|
|
92
|
-
return []
|
|
148
|
+
return ["wikidata_id"]
|
|
149
|
+
|
|
150
|
+
def _get_database(self):
|
|
151
|
+
"""Get or initialize the person database."""
|
|
152
|
+
if self._database is not None:
|
|
153
|
+
return self._database
|
|
154
|
+
|
|
155
|
+
if not self._use_database:
|
|
156
|
+
return None
|
|
157
|
+
|
|
158
|
+
try:
|
|
159
|
+
from ...database.store import get_person_database
|
|
160
|
+
from ...database.hub import get_database_path
|
|
161
|
+
|
|
162
|
+
# Find database path
|
|
163
|
+
db_path = self._db_path
|
|
164
|
+
if db_path is None:
|
|
165
|
+
db_path = get_database_path(auto_download=self._auto_download_db)
|
|
166
|
+
|
|
167
|
+
if db_path is None:
|
|
168
|
+
logger.warning("Person database not available. Skipping database qualification.")
|
|
169
|
+
return None
|
|
170
|
+
|
|
171
|
+
# Use singleton to ensure database is only loaded once
|
|
172
|
+
self._database = get_person_database(db_path=db_path)
|
|
173
|
+
logger.info(f"Loaded person database from {db_path}")
|
|
174
|
+
return self._database
|
|
175
|
+
|
|
176
|
+
except Exception as e:
|
|
177
|
+
logger.warning(f"Failed to load person database: {e}")
|
|
178
|
+
return None
|
|
179
|
+
|
|
180
|
+
def _get_embedder(self):
|
|
181
|
+
"""Get or initialize the embedder."""
|
|
182
|
+
if self._embedder is not None:
|
|
183
|
+
return self._embedder
|
|
184
|
+
|
|
185
|
+
try:
|
|
186
|
+
from ...database import CompanyEmbedder
|
|
187
|
+
self._embedder = CompanyEmbedder()
|
|
188
|
+
return self._embedder
|
|
189
|
+
except Exception as e:
|
|
190
|
+
logger.warning(f"Failed to load embedder: {e}")
|
|
191
|
+
return None
|
|
192
|
+
|
|
193
|
+
def _get_org_resolver(self):
|
|
194
|
+
"""Get or initialize the organization resolver."""
|
|
195
|
+
if not hasattr(self, '_org_resolver'):
|
|
196
|
+
self._org_resolver = None
|
|
197
|
+
|
|
198
|
+
if self._org_resolver is not None:
|
|
199
|
+
return self._org_resolver
|
|
200
|
+
|
|
201
|
+
try:
|
|
202
|
+
from ...database.resolver import get_organization_resolver
|
|
203
|
+
self._org_resolver = get_organization_resolver(
|
|
204
|
+
db_path=self._db_path,
|
|
205
|
+
auto_download_db=self._auto_download_db,
|
|
206
|
+
)
|
|
207
|
+
return self._org_resolver
|
|
208
|
+
except Exception as e:
|
|
209
|
+
logger.warning(f"Failed to initialize organization resolver: {e}")
|
|
210
|
+
return None
|
|
211
|
+
|
|
212
|
+
def _resolve_organization(self, org_name: str) -> Optional[ResolvedOrganization]:
|
|
213
|
+
"""
|
|
214
|
+
Resolve an organization name against the organization database.
|
|
215
|
+
|
|
216
|
+
Uses the shared OrganizationResolver utility.
|
|
217
|
+
|
|
218
|
+
Args:
|
|
219
|
+
org_name: Organization name to resolve
|
|
220
|
+
|
|
221
|
+
Returns:
|
|
222
|
+
ResolvedOrganization if found, None otherwise
|
|
223
|
+
"""
|
|
224
|
+
resolver = self._get_org_resolver()
|
|
225
|
+
if resolver is None:
|
|
226
|
+
return None
|
|
227
|
+
|
|
228
|
+
return resolver.resolve(org_name)
|
|
93
229
|
|
|
94
230
|
def qualify(
|
|
95
231
|
self,
|
|
96
232
|
entity: ExtractedEntity,
|
|
97
233
|
context: PipelineContext,
|
|
98
|
-
) -> Optional[
|
|
234
|
+
) -> Optional[CanonicalEntity]:
|
|
99
235
|
"""
|
|
100
|
-
Qualify a PERSON entity with role and organization.
|
|
236
|
+
Qualify a PERSON entity with role, organization, and optionally canonical ID.
|
|
101
237
|
|
|
102
238
|
Args:
|
|
103
239
|
entity: The PERSON entity to qualify
|
|
104
240
|
context: Pipeline context for accessing source text
|
|
105
241
|
|
|
106
242
|
Returns:
|
|
107
|
-
|
|
243
|
+
CanonicalEntity with role/org qualifiers and FQN, or None if nothing found
|
|
108
244
|
"""
|
|
109
245
|
if entity.type != EntityType.PERSON:
|
|
110
246
|
return None
|
|
111
247
|
|
|
248
|
+
# Check cache
|
|
249
|
+
cache_key = entity.text.lower().strip()
|
|
250
|
+
if cache_key in self._cache:
|
|
251
|
+
return self._cache[cache_key]
|
|
252
|
+
|
|
112
253
|
# Use the full source text for LLM qualification
|
|
113
|
-
# This provides maximum context for understanding the person's role/org
|
|
114
254
|
full_text = context.source_text
|
|
115
255
|
|
|
116
|
-
#
|
|
256
|
+
# Step 1: Extract role and org using LLM or patterns
|
|
257
|
+
qualifiers: Optional[EntityQualifiers] = None
|
|
117
258
|
if self._llm is not None:
|
|
118
259
|
result = self._extract_with_llm(entity.text, full_text)
|
|
119
260
|
if result and (result.role or result.org):
|
|
120
|
-
|
|
261
|
+
qualifiers = result
|
|
262
|
+
|
|
263
|
+
# Fallback to pattern matching
|
|
264
|
+
if qualifiers is None:
|
|
265
|
+
qualifiers = self._extract_with_patterns(entity.text, full_text)
|
|
266
|
+
|
|
267
|
+
# Step 2: Search database for canonical match (if database is available)
|
|
268
|
+
canonical_match = None
|
|
269
|
+
if self._use_database:
|
|
270
|
+
canonical_match = self._search_database(
|
|
271
|
+
entity.text,
|
|
272
|
+
qualifiers.role if qualifiers else None,
|
|
273
|
+
qualifiers.org if qualifiers else None,
|
|
274
|
+
context,
|
|
275
|
+
)
|
|
121
276
|
|
|
122
|
-
#
|
|
123
|
-
|
|
277
|
+
# If no qualifiers found and no database match, return None
|
|
278
|
+
if qualifiers is None and canonical_match is None:
|
|
279
|
+
self._cache[cache_key] = None
|
|
280
|
+
return None
|
|
281
|
+
|
|
282
|
+
# Step 3: Build CanonicalEntity
|
|
283
|
+
result = self._build_canonical_entity(entity, qualifiers, canonical_match)
|
|
284
|
+
self._cache[cache_key] = result
|
|
285
|
+
return result
|
|
286
|
+
|
|
287
|
+
def _search_database(
|
|
288
|
+
self,
|
|
289
|
+
person_name: str,
|
|
290
|
+
extracted_role: Optional[str],
|
|
291
|
+
extracted_org: Optional[str],
|
|
292
|
+
context: PipelineContext,
|
|
293
|
+
) -> Optional[CanonicalMatch]:
|
|
294
|
+
"""
|
|
295
|
+
Search the person database for a canonical match.
|
|
296
|
+
|
|
297
|
+
Uses embedding similarity + role/org matching for disambiguation.
|
|
298
|
+
|
|
299
|
+
Args:
|
|
300
|
+
person_name: Name of the person
|
|
301
|
+
extracted_role: Role extracted from text (e.g., "CEO")
|
|
302
|
+
extracted_org: Organization extracted from text (e.g., "Apple Inc")
|
|
303
|
+
context: Pipeline context
|
|
304
|
+
|
|
305
|
+
Returns:
|
|
306
|
+
CanonicalMatch if a confident match is found, None otherwise
|
|
307
|
+
"""
|
|
308
|
+
database = self._get_database()
|
|
309
|
+
if database is None:
|
|
310
|
+
return None
|
|
311
|
+
|
|
312
|
+
embedder = self._get_embedder()
|
|
313
|
+
if embedder is None:
|
|
314
|
+
return None
|
|
315
|
+
|
|
316
|
+
# Embed the person name
|
|
317
|
+
logger.debug(f" Embedding person name: '{person_name}'")
|
|
318
|
+
query_embedding = embedder.embed(person_name)
|
|
319
|
+
|
|
320
|
+
# Search database with text pre-filtering
|
|
321
|
+
logger.debug(f" Searching person database...")
|
|
322
|
+
results = database.search(
|
|
323
|
+
query_embedding,
|
|
324
|
+
top_k=self._top_k,
|
|
325
|
+
query_text=person_name,
|
|
326
|
+
)
|
|
327
|
+
|
|
328
|
+
# Filter by minimum similarity
|
|
329
|
+
results = [(r, s) for r, s in results if s >= self._min_similarity]
|
|
330
|
+
|
|
331
|
+
if not results:
|
|
332
|
+
logger.debug(f" No person matches found above threshold {self._min_similarity}")
|
|
333
|
+
return None
|
|
334
|
+
|
|
335
|
+
# Boost scores based on role/org matching
|
|
336
|
+
scored_results = []
|
|
337
|
+
for record, similarity in results:
|
|
338
|
+
boosted_score = self._compute_match_score(
|
|
339
|
+
record, similarity, extracted_role, extracted_org
|
|
340
|
+
)
|
|
341
|
+
scored_results.append((record, similarity, boosted_score))
|
|
342
|
+
|
|
343
|
+
# Sort by boosted score
|
|
344
|
+
scored_results.sort(key=lambda x: x[2], reverse=True)
|
|
345
|
+
|
|
346
|
+
# Log top candidates
|
|
347
|
+
logger.info(f" Found {len(scored_results)} candidates for '{person_name}':")
|
|
348
|
+
for i, (record, sim, boosted) in enumerate(scored_results[:5], 1):
|
|
349
|
+
role_str = f" ({record.known_for_role})" if record.known_for_role else ""
|
|
350
|
+
org_str = f" at {record.known_for_org}" if record.known_for_org else ""
|
|
351
|
+
logger.info(f" {i}. {record.name}{role_str}{org_str} (sim={sim:.3f}, boosted={boosted:.3f})")
|
|
352
|
+
|
|
353
|
+
# Select best match using LLM if available
|
|
354
|
+
logger.info(f" Selecting best match (LLM={self._llm is not None})...")
|
|
355
|
+
best_match = self._select_best_match(person_name, scored_results, extracted_role, extracted_org, context)
|
|
356
|
+
|
|
357
|
+
if best_match is None:
|
|
358
|
+
logger.info(f" No confident match for '{person_name}'")
|
|
359
|
+
return None
|
|
360
|
+
|
|
361
|
+
record, similarity, boosted = best_match
|
|
362
|
+
logger.info(f" Matched: '{record.name}' (wikidata:{record.source_id}, similarity={similarity:.3f})")
|
|
363
|
+
|
|
364
|
+
# Build canonical match
|
|
365
|
+
return CanonicalMatch(
|
|
366
|
+
canonical_id=f"wikidata:{record.source_id}",
|
|
367
|
+
canonical_name=record.name,
|
|
368
|
+
match_method="embedding",
|
|
369
|
+
match_confidence=min(max(boosted, 0.0), 1.0),
|
|
370
|
+
match_details={
|
|
371
|
+
"source": "wikidata",
|
|
372
|
+
"source_id": record.source_id,
|
|
373
|
+
"similarity": similarity,
|
|
374
|
+
"known_for_role": record.known_for_role,
|
|
375
|
+
"known_for_org": record.known_for_org,
|
|
376
|
+
},
|
|
377
|
+
)
|
|
378
|
+
|
|
379
|
+
def _compute_match_score(
|
|
380
|
+
self,
|
|
381
|
+
record,
|
|
382
|
+
embedding_similarity: float,
|
|
383
|
+
extracted_role: Optional[str],
|
|
384
|
+
extracted_org: Optional[str],
|
|
385
|
+
) -> float:
|
|
386
|
+
"""
|
|
387
|
+
Compute boosted match score using role/org context.
|
|
388
|
+
|
|
389
|
+
Boosts similarity score if extracted role/org matches database record.
|
|
390
|
+
"""
|
|
391
|
+
score = embedding_similarity
|
|
392
|
+
|
|
393
|
+
# Boost if role matches (fuzzy)
|
|
394
|
+
if extracted_role and record.known_for_role:
|
|
395
|
+
if self._role_matches(extracted_role, record.known_for_role):
|
|
396
|
+
score += 0.1 # +10% boost
|
|
397
|
+
logger.debug(f" Role match boost: {extracted_role} ~ {record.known_for_role}")
|
|
398
|
+
|
|
399
|
+
# Boost if org matches (fuzzy)
|
|
400
|
+
if extracted_org and record.known_for_org:
|
|
401
|
+
if self._org_matches(extracted_org, record.known_for_org):
|
|
402
|
+
score += 0.15 # +15% boost (org is stronger signal)
|
|
403
|
+
logger.debug(f" Org match boost: {extracted_org} ~ {record.known_for_org}")
|
|
404
|
+
|
|
405
|
+
return min(score, 1.0) # Cap at 1.0
|
|
406
|
+
|
|
407
|
+
def _role_matches(self, extracted: str, known: str) -> bool:
|
|
408
|
+
"""Fuzzy role matching."""
|
|
409
|
+
extracted_lower = extracted.lower().strip()
|
|
410
|
+
known_lower = known.lower().strip()
|
|
411
|
+
|
|
412
|
+
# Exact match
|
|
413
|
+
if extracted_lower == known_lower:
|
|
414
|
+
return True
|
|
415
|
+
|
|
416
|
+
# CEO variants
|
|
417
|
+
ceo_variants = {"ceo", "chief executive", "chief executive officer"}
|
|
418
|
+
if extracted_lower in ceo_variants and known_lower in ceo_variants:
|
|
419
|
+
return True
|
|
420
|
+
|
|
421
|
+
# CFO variants
|
|
422
|
+
cfo_variants = {"cfo", "chief financial officer"}
|
|
423
|
+
if extracted_lower in cfo_variants and known_lower in cfo_variants:
|
|
424
|
+
return True
|
|
425
|
+
|
|
426
|
+
# President variants
|
|
427
|
+
president_variants = {"president", "chairman", "chairman and ceo"}
|
|
428
|
+
if extracted_lower in president_variants and known_lower in president_variants:
|
|
429
|
+
return True
|
|
430
|
+
|
|
431
|
+
# Founder variants
|
|
432
|
+
founder_variants = {"founder", "co-founder", "cofounder", "founding member"}
|
|
433
|
+
if extracted_lower in founder_variants and known_lower in founder_variants:
|
|
434
|
+
return True
|
|
435
|
+
|
|
436
|
+
# Contains check for partial matches
|
|
437
|
+
if extracted_lower in known_lower or known_lower in extracted_lower:
|
|
438
|
+
return True
|
|
439
|
+
|
|
440
|
+
return False
|
|
441
|
+
|
|
442
|
+
def _org_matches(self, extracted: str, known: str) -> bool:
|
|
443
|
+
"""Fuzzy org matching using simple normalization."""
|
|
444
|
+
# Normalize both
|
|
445
|
+
extracted_norm = self._normalize_org_name(extracted)
|
|
446
|
+
known_norm = self._normalize_org_name(known)
|
|
447
|
+
|
|
448
|
+
# Exact normalized match
|
|
449
|
+
if extracted_norm == known_norm:
|
|
450
|
+
return True
|
|
451
|
+
|
|
452
|
+
# Check if one contains the other (e.g., "Apple" in "Apple Inc")
|
|
453
|
+
if extracted_norm in known_norm or known_norm in extracted_norm:
|
|
454
|
+
return True
|
|
455
|
+
|
|
456
|
+
return False
|
|
457
|
+
|
|
458
|
+
def _normalize_org_name(self, name: str) -> str:
|
|
459
|
+
"""Simple org name normalization."""
|
|
460
|
+
# Lowercase
|
|
461
|
+
normalized = name.lower().strip()
|
|
462
|
+
|
|
463
|
+
# Remove common suffixes
|
|
464
|
+
suffixes = [
|
|
465
|
+
" inc.", " inc", " corp.", " corp", " corporation",
|
|
466
|
+
" ltd.", " ltd", " limited", " llc", " plc",
|
|
467
|
+
" co.", " co", " company",
|
|
468
|
+
]
|
|
469
|
+
for suffix in suffixes:
|
|
470
|
+
if normalized.endswith(suffix):
|
|
471
|
+
normalized = normalized[:-len(suffix)]
|
|
472
|
+
|
|
473
|
+
return normalized.strip()
|
|
474
|
+
|
|
475
|
+
def _select_best_match(
|
|
476
|
+
self,
|
|
477
|
+
query_name: str,
|
|
478
|
+
candidates: list[tuple],
|
|
479
|
+
extracted_role: Optional[str],
|
|
480
|
+
extracted_org: Optional[str],
|
|
481
|
+
context: PipelineContext,
|
|
482
|
+
) -> Optional[tuple]:
|
|
483
|
+
"""
|
|
484
|
+
Select the best match from candidates.
|
|
485
|
+
|
|
486
|
+
Uses LLM if available, otherwise returns top match if confidence is high enough.
|
|
487
|
+
"""
|
|
488
|
+
if not candidates:
|
|
489
|
+
return None
|
|
490
|
+
|
|
491
|
+
# If only one strong match, use it directly
|
|
492
|
+
if len(candidates) == 1 and candidates[0][2] >= 0.9:
|
|
493
|
+
logger.info(f" Single strong match: '{candidates[0][0].name}' (boosted={candidates[0][2]:.3f})")
|
|
494
|
+
return candidates[0]
|
|
495
|
+
|
|
496
|
+
# Try LLM confirmation
|
|
497
|
+
if self._llm is not None:
|
|
498
|
+
try:
|
|
499
|
+
return self._llm_select_match(query_name, candidates, extracted_role, extracted_org, context)
|
|
500
|
+
except Exception as e:
|
|
501
|
+
logger.warning(f" LLM confirmation failed: {e}")
|
|
502
|
+
|
|
503
|
+
# Fallback: use top match if boosted score is high enough
|
|
504
|
+
top_record, top_similarity, top_boosted = candidates[0]
|
|
505
|
+
if top_boosted >= 0.85:
|
|
506
|
+
logger.info(f" No LLM, using top match: '{top_record.name}' (boosted={top_boosted:.3f})")
|
|
507
|
+
return candidates[0]
|
|
508
|
+
|
|
509
|
+
logger.info(f" No confident match (top boosted={top_boosted:.3f} < 0.85)")
|
|
510
|
+
return None
|
|
511
|
+
|
|
512
|
+
def _llm_select_match(
|
|
513
|
+
self,
|
|
514
|
+
query_name: str,
|
|
515
|
+
candidates: list[tuple],
|
|
516
|
+
extracted_role: Optional[str],
|
|
517
|
+
extracted_org: Optional[str],
|
|
518
|
+
context: PipelineContext,
|
|
519
|
+
) -> Optional[tuple]:
|
|
520
|
+
"""Use LLM to select the best match."""
|
|
521
|
+
# Format candidates for prompt
|
|
522
|
+
candidate_lines = []
|
|
523
|
+
for i, (record, similarity, boosted) in enumerate(candidates[:10], 1):
|
|
524
|
+
role_str = f", {record.known_for_role}" if record.known_for_role else ""
|
|
525
|
+
org_str = f" at {record.known_for_org}" if record.known_for_org else ""
|
|
526
|
+
country_str = f", {record.country}" if record.country else ""
|
|
527
|
+
candidate_lines.append(
|
|
528
|
+
f"{i}. {record.name}{role_str}{org_str}{country_str} (score: {boosted:.2f})"
|
|
529
|
+
)
|
|
530
|
+
|
|
531
|
+
# Build context info from extracted role/org
|
|
532
|
+
context_parts = []
|
|
533
|
+
if extracted_role:
|
|
534
|
+
context_parts.append(f"role={extracted_role}")
|
|
535
|
+
if extracted_org:
|
|
536
|
+
context_parts.append(f"org={extracted_org}")
|
|
537
|
+
context_info = ", ".join(context_parts) if context_parts else "no role/org extracted"
|
|
538
|
+
|
|
539
|
+
# Source text preview
|
|
540
|
+
source_preview = ""
|
|
541
|
+
if context.source_text:
|
|
542
|
+
source_preview = context.source_text[:300] + "..." if len(context.source_text) > 300 else context.source_text
|
|
543
|
+
|
|
544
|
+
prompt = PERSON_MATCH_PROMPT.format(
|
|
545
|
+
query_name=query_name,
|
|
546
|
+
context_info=context_info,
|
|
547
|
+
source_preview=source_preview,
|
|
548
|
+
candidates="\n".join(candidate_lines),
|
|
549
|
+
)
|
|
550
|
+
|
|
551
|
+
# Get LLM response
|
|
552
|
+
assert self._llm is not None
|
|
553
|
+
response = self._llm.generate(prompt, max_tokens=10, stop=["\n"])
|
|
554
|
+
response = response.strip()
|
|
555
|
+
|
|
556
|
+
logger.info(f" LLM response for '{query_name}': {response}")
|
|
557
|
+
|
|
558
|
+
# Parse response
|
|
559
|
+
if response.upper() == "NONE":
|
|
560
|
+
logger.info(f" LLM chose: NONE (no match)")
|
|
561
|
+
return None
|
|
562
|
+
|
|
563
|
+
try:
|
|
564
|
+
idx = int(response) - 1
|
|
565
|
+
if 0 <= idx < len(candidates):
|
|
566
|
+
chosen = candidates[idx]
|
|
567
|
+
logger.info(f" LLM chose: #{idx + 1} '{chosen[0].name}' (boosted={chosen[2]:.3f})")
|
|
568
|
+
return chosen
|
|
569
|
+
except ValueError:
|
|
570
|
+
logger.warning(f" LLM response '{response}' could not be parsed as number")
|
|
571
|
+
|
|
572
|
+
# Fallback to top match if LLM response is unclear and score is decent
|
|
573
|
+
if candidates[0][2] >= 0.8:
|
|
574
|
+
logger.info(f" Fallback to top match: '{candidates[0][0].name}' (boosted={candidates[0][2]:.3f})")
|
|
575
|
+
return candidates[0]
|
|
576
|
+
|
|
577
|
+
logger.info(f" No confident match (top boosted={candidates[0][2]:.3f} < 0.8)")
|
|
578
|
+
return None
|
|
124
579
|
|
|
125
580
|
def _extract_with_llm(
|
|
126
581
|
self,
|
|
@@ -216,6 +671,115 @@ Should return:
|
|
|
216
671
|
|
|
217
672
|
return None
|
|
218
673
|
|
|
674
|
+
def _build_canonical_entity(
|
|
675
|
+
self,
|
|
676
|
+
entity: ExtractedEntity,
|
|
677
|
+
qualifiers: Optional[EntityQualifiers],
|
|
678
|
+
canonical_match: Optional[CanonicalMatch],
|
|
679
|
+
) -> CanonicalEntity:
|
|
680
|
+
"""Build CanonicalEntity from qualifiers and optional canonical match."""
|
|
681
|
+
# Ensure qualifiers is not None
|
|
682
|
+
if qualifiers is None:
|
|
683
|
+
qualifiers = EntityQualifiers()
|
|
684
|
+
|
|
685
|
+
# If we have a canonical match, add wikidata ID to identifiers
|
|
686
|
+
identifiers: dict[str, str] = dict(qualifiers.identifiers) if qualifiers.identifiers else {}
|
|
687
|
+
resolved_role: Optional[ResolvedRole] = None
|
|
688
|
+
resolved_org: Optional[ResolvedOrganization] = None
|
|
689
|
+
|
|
690
|
+
if canonical_match:
|
|
691
|
+
match_details = canonical_match.match_details or {}
|
|
692
|
+
source_id = str(match_details.get("source_id", ""))
|
|
693
|
+
if source_id:
|
|
694
|
+
identifiers["wikidata_id"] = source_id
|
|
695
|
+
if canonical_match.canonical_id:
|
|
696
|
+
identifiers["canonical_id"] = canonical_match.canonical_id
|
|
697
|
+
|
|
698
|
+
# Extract role and org from database match
|
|
699
|
+
known_role = str(match_details.get("known_for_role", "") or "")
|
|
700
|
+
known_org = str(match_details.get("known_for_org", "") or "")
|
|
701
|
+
|
|
702
|
+
# Create ResolvedRole from database match
|
|
703
|
+
if known_role:
|
|
704
|
+
resolved_role = ResolvedRole(
|
|
705
|
+
canonical_name=known_role,
|
|
706
|
+
canonical_id=None, # Role ID would need separate lookup
|
|
707
|
+
source="wikidata",
|
|
708
|
+
source_id=source_id if source_id else None,
|
|
709
|
+
)
|
|
710
|
+
|
|
711
|
+
# Update qualifiers with info from database if not already set
|
|
712
|
+
if not qualifiers.role and known_role:
|
|
713
|
+
final_role = known_role
|
|
714
|
+
final_org = qualifiers.org or known_org or None
|
|
715
|
+
else:
|
|
716
|
+
final_role = qualifiers.role
|
|
717
|
+
final_org = qualifiers.org
|
|
718
|
+
else:
|
|
719
|
+
final_role = qualifiers.role
|
|
720
|
+
final_org = qualifiers.org
|
|
721
|
+
|
|
722
|
+
# Resolve organization against the organization database
|
|
723
|
+
org_to_resolve = final_org
|
|
724
|
+
if org_to_resolve:
|
|
725
|
+
logger.debug(f" Resolving organization: '{org_to_resolve}'")
|
|
726
|
+
resolved_org = self._resolve_organization(org_to_resolve)
|
|
727
|
+
if resolved_org:
|
|
728
|
+
logger.info(f" Resolved org: '{org_to_resolve}' -> '{resolved_org.canonical_name}' ({resolved_org.canonical_id})")
|
|
729
|
+
|
|
730
|
+
# Build the final qualifiers with resolved info
|
|
731
|
+
qualifiers = EntityQualifiers(
|
|
732
|
+
role=final_role,
|
|
733
|
+
org=final_org,
|
|
734
|
+
identifiers=identifiers,
|
|
735
|
+
resolved_role=resolved_role,
|
|
736
|
+
resolved_org=resolved_org,
|
|
737
|
+
)
|
|
738
|
+
|
|
739
|
+
# Create QualifiedEntity
|
|
740
|
+
qualified = QualifiedEntity(
|
|
741
|
+
entity_ref=entity.entity_ref,
|
|
742
|
+
original_text=entity.text,
|
|
743
|
+
entity_type=entity.type,
|
|
744
|
+
qualifiers=qualifiers,
|
|
745
|
+
qualification_sources=[self.name],
|
|
746
|
+
)
|
|
747
|
+
|
|
748
|
+
# Build FQN - prefer resolved names when available
|
|
749
|
+
if canonical_match and canonical_match.canonical_name:
|
|
750
|
+
# Use canonical person name from database
|
|
751
|
+
fqn_parts: list[str] = [canonical_match.canonical_name]
|
|
752
|
+
if qualifiers.role:
|
|
753
|
+
fqn_parts.append(f"({qualifiers.role})")
|
|
754
|
+
# Use resolved org name if available
|
|
755
|
+
if resolved_org:
|
|
756
|
+
fqn_parts.append(f"at {resolved_org.canonical_name}")
|
|
757
|
+
elif qualifiers.org:
|
|
758
|
+
fqn_parts.append(f"at {qualifiers.org}")
|
|
759
|
+
fqn = " ".join(fqn_parts)
|
|
760
|
+
else:
|
|
761
|
+
# Build FQN: "Person Name (Role, Org)" or "Person Name (Role)" or "Person Name (Org)"
|
|
762
|
+
fqn_parts_for_display: list[str] = []
|
|
763
|
+
if qualifiers.role:
|
|
764
|
+
fqn_parts_for_display.append(qualifiers.role)
|
|
765
|
+
# Use resolved org name if available
|
|
766
|
+
if resolved_org:
|
|
767
|
+
fqn_parts_for_display.append(resolved_org.canonical_name)
|
|
768
|
+
elif qualifiers.org:
|
|
769
|
+
fqn_parts_for_display.append(qualifiers.org)
|
|
770
|
+
|
|
771
|
+
if fqn_parts_for_display:
|
|
772
|
+
fqn = f"{entity.text} ({', '.join(fqn_parts_for_display)})"
|
|
773
|
+
else:
|
|
774
|
+
fqn = entity.text
|
|
775
|
+
|
|
776
|
+
return CanonicalEntity(
|
|
777
|
+
entity_ref=entity.entity_ref,
|
|
778
|
+
qualified_entity=qualified,
|
|
779
|
+
canonical_match=canonical_match,
|
|
780
|
+
fqn=fqn,
|
|
781
|
+
)
|
|
782
|
+
|
|
219
783
|
|
|
220
784
|
# Allow importing without decorator for testing
|
|
221
785
|
PersonQualifierPluginClass = PersonQualifierPlugin
|