corp-extractor 0.9.3__py3-none-any.whl → 0.9.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.9.3.dist-info → corp_extractor-0.9.4.dist-info}/METADATA +33 -3
- {corp_extractor-0.9.3.dist-info → corp_extractor-0.9.4.dist-info}/RECORD +16 -12
- statement_extractor/cli.py +472 -45
- statement_extractor/database/embeddings.py +45 -0
- statement_extractor/database/hub.py +51 -9
- statement_extractor/database/importers/import_utils.py +264 -0
- statement_extractor/database/importers/wikidata_dump.py +334 -3
- statement_extractor/database/importers/wikidata_people.py +44 -0
- statement_extractor/database/migrate_v2.py +852 -0
- statement_extractor/database/models.py +125 -1
- statement_extractor/database/schema_v2.py +409 -0
- statement_extractor/database/seed_data.py +359 -0
- statement_extractor/database/store.py +2113 -322
- statement_extractor/plugins/qualifiers/person.py +109 -52
- {corp_extractor-0.9.3.dist-info → corp_extractor-0.9.4.dist-info}/WHEEL +0 -0
- {corp_extractor-0.9.3.dist-info → corp_extractor-0.9.4.dist-info}/entry_points.txt +0 -0
|
@@ -9,7 +9,6 @@ Then searches the person database to find canonical matches for notable people
|
|
|
9
9
|
(those in Wikipedia/Wikidata), using extracted role/org to help disambiguate.
|
|
10
10
|
"""
|
|
11
11
|
|
|
12
|
-
import json
|
|
13
12
|
import logging
|
|
14
13
|
import re
|
|
15
14
|
from typing import Optional
|
|
@@ -44,11 +43,12 @@ Candidates from database (with Wikipedia info):
|
|
|
44
43
|
|
|
45
44
|
Task: Select the BEST match, or respond "NONE" if no candidate is a good match.
|
|
46
45
|
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
-
|
|
50
|
-
|
|
51
|
-
|
|
46
|
+
IMPORTANT RULES:
|
|
47
|
+
1. The candidate name must closely match the extracted name "{query_name}"
|
|
48
|
+
2. Similar-sounding names are NOT matches (e.g., "Andy Vassies" does NOT match "Andy Jassy")
|
|
49
|
+
3. If no candidate has a name that matches "{query_name}", respond "NONE"
|
|
50
|
+
4. Consider role and organization context only AFTER confirming name match
|
|
51
|
+
5. When in doubt, prefer "NONE" over a wrong match
|
|
52
52
|
|
|
53
53
|
Respond with ONLY the number of the best match (1, 2, 3, etc.) or "NONE".
|
|
54
54
|
"""
|
|
@@ -260,7 +260,7 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
|
|
|
260
260
|
if result and (result.role or result.org):
|
|
261
261
|
qualifiers = result
|
|
262
262
|
|
|
263
|
-
# Fallback to pattern matching
|
|
263
|
+
# Fallback to pattern matching (only if LLM extraction returned nothing)
|
|
264
264
|
if qualifiers is None:
|
|
265
265
|
qualifiers = self._extract_with_patterns(entity.text, full_text)
|
|
266
266
|
|
|
@@ -313,42 +313,79 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
|
|
|
313
313
|
if embedder is None:
|
|
314
314
|
return None
|
|
315
315
|
|
|
316
|
-
#
|
|
317
|
-
logger.debug(f"
|
|
318
|
-
|
|
316
|
+
# Log extracted context
|
|
317
|
+
logger.debug(f" Person search context:")
|
|
318
|
+
logger.debug(f" Name: '{person_name}'")
|
|
319
|
+
logger.debug(f" Extracted role: {extracted_role or '(none)'}")
|
|
320
|
+
logger.debug(f" Extracted org: {extracted_org or '(none)'}")
|
|
319
321
|
|
|
320
|
-
#
|
|
322
|
+
# Build query text with context for better embedding match
|
|
323
|
+
# This matches how PersonRecord.get_embedding_text() builds embedding text
|
|
324
|
+
query_parts = [person_name]
|
|
325
|
+
if extracted_role:
|
|
326
|
+
query_parts.append(extracted_role)
|
|
327
|
+
if extracted_org:
|
|
328
|
+
query_parts.append(extracted_org)
|
|
329
|
+
query_text = " | ".join(query_parts)
|
|
330
|
+
|
|
331
|
+
logger.debug(f" Embedding query: '{query_text}'")
|
|
332
|
+
query_embedding = embedder.embed(query_text)
|
|
333
|
+
|
|
334
|
+
# Search database with text pre-filtering on name only
|
|
321
335
|
logger.debug(f" Searching person database...")
|
|
322
336
|
results = database.search(
|
|
323
337
|
query_embedding,
|
|
324
|
-
top_k=self._top_k,
|
|
338
|
+
top_k=self._top_k * 3, # Fetch more to allow for org filtering
|
|
325
339
|
query_text=person_name,
|
|
326
340
|
)
|
|
327
341
|
|
|
342
|
+
logger.debug(f" Database returned {len(results)} raw results")
|
|
343
|
+
|
|
344
|
+
# If org was extracted, boost candidates that match the org
|
|
345
|
+
if extracted_org:
|
|
346
|
+
# Re-score with org preference
|
|
347
|
+
org_matched = []
|
|
348
|
+
org_unmatched = []
|
|
349
|
+
for record, sim in results:
|
|
350
|
+
if record.known_for_org and self._org_matches(extracted_org, record.known_for_org):
|
|
351
|
+
logger.debug(f" Org match: {record.name} at {record.known_for_org}")
|
|
352
|
+
org_matched.append((record, sim))
|
|
353
|
+
else:
|
|
354
|
+
org_unmatched.append((record, sim))
|
|
355
|
+
# Prioritize org matches
|
|
356
|
+
if org_matched:
|
|
357
|
+
logger.info(f" Found {len(org_matched)} candidates matching org '{extracted_org}'")
|
|
358
|
+
results = org_matched + org_unmatched
|
|
359
|
+
else:
|
|
360
|
+
logger.debug(f" No candidates match org '{extracted_org}'")
|
|
361
|
+
|
|
328
362
|
# Filter by minimum similarity
|
|
329
363
|
results = [(r, s) for r, s in results if s >= self._min_similarity]
|
|
364
|
+
logger.debug(f" After min_similarity filter ({self._min_similarity}): {len(results)} results")
|
|
330
365
|
|
|
331
366
|
if not results:
|
|
332
367
|
logger.debug(f" No person matches found above threshold {self._min_similarity}")
|
|
333
368
|
return None
|
|
334
369
|
|
|
335
|
-
# Boost scores based on role/org matching
|
|
370
|
+
# Boost scores based on name/role/org matching
|
|
336
371
|
scored_results = []
|
|
337
372
|
for record, similarity in results:
|
|
338
373
|
boosted_score = self._compute_match_score(
|
|
339
|
-
record, similarity, extracted_role, extracted_org
|
|
374
|
+
record, similarity, extracted_role, extracted_org, query_name=person_name
|
|
340
375
|
)
|
|
341
376
|
scored_results.append((record, similarity, boosted_score))
|
|
342
377
|
|
|
343
378
|
# Sort by boosted score
|
|
344
379
|
scored_results.sort(key=lambda x: x[2], reverse=True)
|
|
345
380
|
|
|
346
|
-
# Log top candidates
|
|
381
|
+
# Log top candidates with detailed context
|
|
347
382
|
logger.info(f" Found {len(scored_results)} candidates for '{person_name}':")
|
|
348
383
|
for i, (record, sim, boosted) in enumerate(scored_results[:5], 1):
|
|
349
384
|
role_str = f" ({record.known_for_role})" if record.known_for_role else ""
|
|
350
385
|
org_str = f" at {record.known_for_org}" if record.known_for_org else ""
|
|
351
|
-
|
|
386
|
+
boost_delta = boosted - sim
|
|
387
|
+
boost_info = f" [+{boost_delta:.3f} boost]" if boost_delta > 0 else ""
|
|
388
|
+
logger.info(f" {i}. {record.name}{role_str}{org_str} (sim={sim:.3f}, boosted={boosted:.3f}{boost_info})")
|
|
352
389
|
|
|
353
390
|
# Select best match using LLM if available
|
|
354
391
|
logger.info(f" Selecting best match (LLM={self._llm is not None})...")
|
|
@@ -385,6 +422,7 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
|
|
|
385
422
|
embedding_similarity: float,
|
|
386
423
|
extracted_role: Optional[str],
|
|
387
424
|
extracted_org: Optional[str],
|
|
425
|
+
query_name: Optional[str] = None,
|
|
388
426
|
) -> float:
|
|
389
427
|
"""
|
|
390
428
|
Compute boosted match score using role/org context.
|
|
@@ -393,6 +431,14 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
|
|
|
393
431
|
"""
|
|
394
432
|
score = embedding_similarity
|
|
395
433
|
|
|
434
|
+
# Major boost for exact name match (normalized)
|
|
435
|
+
if query_name:
|
|
436
|
+
query_norm = self._normalize_person_name(query_name)
|
|
437
|
+
record_norm = self._normalize_person_name(record.name)
|
|
438
|
+
if query_norm == record_norm:
|
|
439
|
+
score += 0.25 # +25% boost for exact name match
|
|
440
|
+
logger.debug(f" Exact name match boost: '{query_name}' == '{record.name}'")
|
|
441
|
+
|
|
396
442
|
# Boost if role matches (fuzzy)
|
|
397
443
|
if extracted_role and record.known_for_role:
|
|
398
444
|
if self._role_matches(extracted_role, record.known_for_role):
|
|
@@ -458,6 +504,18 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
|
|
|
458
504
|
|
|
459
505
|
return False
|
|
460
506
|
|
|
507
|
+
def _normalize_person_name(self, name: str) -> str:
|
|
508
|
+
"""Normalize person name for comparison."""
|
|
509
|
+
# Lowercase and strip
|
|
510
|
+
normalized = name.lower().strip()
|
|
511
|
+
# Remove common titles
|
|
512
|
+
for title in ["dr.", "dr ", "mr.", "mr ", "mrs.", "mrs ", "ms.", "ms ", "prof.", "prof "]:
|
|
513
|
+
if normalized.startswith(title):
|
|
514
|
+
normalized = normalized[len(title):]
|
|
515
|
+
# Remove extra whitespace
|
|
516
|
+
normalized = " ".join(normalized.split())
|
|
517
|
+
return normalized
|
|
518
|
+
|
|
461
519
|
def _normalize_org_name(self, name: str) -> str:
|
|
462
520
|
"""Simple org name normalization."""
|
|
463
521
|
# Lowercase
|
|
@@ -592,52 +650,51 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
|
|
|
592
650
|
person_name: str,
|
|
593
651
|
context_text: str,
|
|
594
652
|
) -> Optional[EntityQualifiers]:
|
|
595
|
-
"""Extract role and org using Gemma3."""
|
|
653
|
+
"""Extract role and org using Gemma3 with simple line-based output."""
|
|
596
654
|
if self._llm is None:
|
|
597
655
|
return None
|
|
598
656
|
|
|
599
657
|
try:
|
|
600
|
-
prompt = f"""Extract
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
- Return null for fields not mentioned in the context
|
|
606
|
-
|
|
607
|
-
Return ONLY valid JSON:
|
|
608
|
-
|
|
609
|
-
E.g.
|
|
610
|
-
<context>We interviewed Big Ducks Quacking Inc team. James is new in the role of the CEO</context>
|
|
611
|
-
<person>James</person>
|
|
612
|
-
|
|
613
|
-
Should return:
|
|
658
|
+
prompt = f"""Extract info about "{person_name}" from the text below.
|
|
659
|
+
Reply with exactly 3 lines:
|
|
660
|
+
NAME: the person's full name
|
|
661
|
+
ROLE: their job title (CEO, President, etc.) or NONE
|
|
662
|
+
ORG: the company/organization name or NONE
|
|
614
663
|
|
|
615
|
-
|
|
664
|
+
Text: {context_text[:500]}
|
|
616
665
|
|
|
617
|
-
|
|
666
|
+
NAME:"""
|
|
618
667
|
|
|
619
|
-
|
|
620
|
-
<person>{person_name}</person>
|
|
621
|
-
"""
|
|
622
|
-
|
|
623
|
-
logger.debug(f"LLM request: {prompt}")
|
|
668
|
+
logger.debug(f"LLM extraction prompt for '{person_name}'")
|
|
624
669
|
response = self._llm.generate(prompt, max_tokens=100, stop=["\n\n", "</s>"])
|
|
625
670
|
logger.debug(f"LLM response: {response}")
|
|
626
671
|
|
|
627
|
-
#
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
if
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
672
|
+
# Parse line-based response
|
|
673
|
+
lines = response.strip().split("\n")
|
|
674
|
+
name = None
|
|
675
|
+
role = None
|
|
676
|
+
org = None
|
|
677
|
+
|
|
678
|
+
for line in lines:
|
|
679
|
+
line = line.strip()
|
|
680
|
+
if line.startswith("NAME:"):
|
|
681
|
+
name = line[5:].strip()
|
|
682
|
+
elif line.startswith("ROLE:"):
|
|
683
|
+
val = line[5:].strip()
|
|
684
|
+
if val.upper() != "NONE":
|
|
685
|
+
role = val
|
|
686
|
+
elif line.startswith("ORG:"):
|
|
687
|
+
val = line[4:].strip()
|
|
688
|
+
if val.upper() != "NONE":
|
|
689
|
+
org = val
|
|
690
|
+
# Handle case where first line is just the name (after our "NAME:" in prompt)
|
|
691
|
+
elif not name and line and not line.startswith(("ROLE", "ORG")):
|
|
692
|
+
name = line
|
|
693
|
+
|
|
694
|
+
logger.debug(f"LLM extracted: name={name!r}, role={role!r}, org={org!r}")
|
|
695
|
+
|
|
696
|
+
if role or org:
|
|
697
|
+
return EntityQualifiers(role=role, org=org)
|
|
641
698
|
|
|
642
699
|
except Exception as e:
|
|
643
700
|
logger.exception(f"LLM extraction failed: {e}")
|
|
File without changes
|
|
File without changes
|