corp-extractor 0.9.3__py3-none-any.whl → 0.9.4__py3-none-any.whl

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
@@ -9,7 +9,6 @@ Then searches the person database to find canonical matches for notable people
9
9
  (those in Wikipedia/Wikidata), using extracted role/org to help disambiguate.
10
10
  """
11
11
 
12
- import json
13
12
  import logging
14
13
  import re
15
14
  from typing import Optional
@@ -44,11 +43,12 @@ Candidates from database (with Wikipedia info):
44
43
 
45
44
  Task: Select the BEST match, or respond "NONE" if no candidate is a good match.
46
45
 
47
- Rules:
48
- - The match should refer to the same person
49
- - Consider whether the role and organization from the text match the Wikipedia info
50
- - Different people with similar names should NOT match
51
- - If the extracted name is too generic or ambiguous, respond "NONE"
46
+ IMPORTANT RULES:
47
+ 1. The candidate name must closely match the extracted name "{query_name}"
48
+ 2. Similar-sounding names are NOT matches (e.g., "Andy Vassies" does NOT match "Andy Jassy")
49
+ 3. If no candidate has a name that matches "{query_name}", respond "NONE"
50
+ 4. Consider role and organization context only AFTER confirming name match
51
+ 5. When in doubt, prefer "NONE" over a wrong match
52
52
 
53
53
  Respond with ONLY the number of the best match (1, 2, 3, etc.) or "NONE".
54
54
  """
@@ -260,7 +260,7 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
260
260
  if result and (result.role or result.org):
261
261
  qualifiers = result
262
262
 
263
- # Fallback to pattern matching
263
+ # Fallback to pattern matching (only if LLM extraction returned nothing)
264
264
  if qualifiers is None:
265
265
  qualifiers = self._extract_with_patterns(entity.text, full_text)
266
266
 
@@ -313,42 +313,79 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
313
313
  if embedder is None:
314
314
  return None
315
315
 
316
- # Embed the person name
317
- logger.debug(f" Embedding person name: '{person_name}'")
318
- query_embedding = embedder.embed(person_name)
316
+ # Log extracted context
317
+ logger.debug(f" Person search context:")
318
+ logger.debug(f" Name: '{person_name}'")
319
+ logger.debug(f" Extracted role: {extracted_role or '(none)'}")
320
+ logger.debug(f" Extracted org: {extracted_org or '(none)'}")
319
321
 
320
- # Search database with text pre-filtering
322
+ # Build query text with context for better embedding match
323
+ # This matches how PersonRecord.get_embedding_text() builds embedding text
324
+ query_parts = [person_name]
325
+ if extracted_role:
326
+ query_parts.append(extracted_role)
327
+ if extracted_org:
328
+ query_parts.append(extracted_org)
329
+ query_text = " | ".join(query_parts)
330
+
331
+ logger.debug(f" Embedding query: '{query_text}'")
332
+ query_embedding = embedder.embed(query_text)
333
+
334
+ # Search database with text pre-filtering on name only
321
335
  logger.debug(f" Searching person database...")
322
336
  results = database.search(
323
337
  query_embedding,
324
- top_k=self._top_k,
338
+ top_k=self._top_k * 3, # Fetch more to allow for org filtering
325
339
  query_text=person_name,
326
340
  )
327
341
 
342
+ logger.debug(f" Database returned {len(results)} raw results")
343
+
344
+ # If org was extracted, boost candidates that match the org
345
+ if extracted_org:
346
+ # Re-score with org preference
347
+ org_matched = []
348
+ org_unmatched = []
349
+ for record, sim in results:
350
+ if record.known_for_org and self._org_matches(extracted_org, record.known_for_org):
351
+ logger.debug(f" Org match: {record.name} at {record.known_for_org}")
352
+ org_matched.append((record, sim))
353
+ else:
354
+ org_unmatched.append((record, sim))
355
+ # Prioritize org matches
356
+ if org_matched:
357
+ logger.info(f" Found {len(org_matched)} candidates matching org '{extracted_org}'")
358
+ results = org_matched + org_unmatched
359
+ else:
360
+ logger.debug(f" No candidates match org '{extracted_org}'")
361
+
328
362
  # Filter by minimum similarity
329
363
  results = [(r, s) for r, s in results if s >= self._min_similarity]
364
+ logger.debug(f" After min_similarity filter ({self._min_similarity}): {len(results)} results")
330
365
 
331
366
  if not results:
332
367
  logger.debug(f" No person matches found above threshold {self._min_similarity}")
333
368
  return None
334
369
 
335
- # Boost scores based on role/org matching
370
+ # Boost scores based on name/role/org matching
336
371
  scored_results = []
337
372
  for record, similarity in results:
338
373
  boosted_score = self._compute_match_score(
339
- record, similarity, extracted_role, extracted_org
374
+ record, similarity, extracted_role, extracted_org, query_name=person_name
340
375
  )
341
376
  scored_results.append((record, similarity, boosted_score))
342
377
 
343
378
  # Sort by boosted score
344
379
  scored_results.sort(key=lambda x: x[2], reverse=True)
345
380
 
346
- # Log top candidates
381
+ # Log top candidates with detailed context
347
382
  logger.info(f" Found {len(scored_results)} candidates for '{person_name}':")
348
383
  for i, (record, sim, boosted) in enumerate(scored_results[:5], 1):
349
384
  role_str = f" ({record.known_for_role})" if record.known_for_role else ""
350
385
  org_str = f" at {record.known_for_org}" if record.known_for_org else ""
351
- logger.info(f" {i}. {record.name}{role_str}{org_str} (sim={sim:.3f}, boosted={boosted:.3f})")
386
+ boost_delta = boosted - sim
387
+ boost_info = f" [+{boost_delta:.3f} boost]" if boost_delta > 0 else ""
388
+ logger.info(f" {i}. {record.name}{role_str}{org_str} (sim={sim:.3f}, boosted={boosted:.3f}{boost_info})")
352
389
 
353
390
  # Select best match using LLM if available
354
391
  logger.info(f" Selecting best match (LLM={self._llm is not None})...")
@@ -385,6 +422,7 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
385
422
  embedding_similarity: float,
386
423
  extracted_role: Optional[str],
387
424
  extracted_org: Optional[str],
425
+ query_name: Optional[str] = None,
388
426
  ) -> float:
389
427
  """
390
428
  Compute boosted match score using role/org context.
@@ -393,6 +431,14 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
393
431
  """
394
432
  score = embedding_similarity
395
433
 
434
+ # Major boost for exact name match (normalized)
435
+ if query_name:
436
+ query_norm = self._normalize_person_name(query_name)
437
+ record_norm = self._normalize_person_name(record.name)
438
+ if query_norm == record_norm:
439
+ score += 0.25 # +25% boost for exact name match
440
+ logger.debug(f" Exact name match boost: '{query_name}' == '{record.name}'")
441
+
396
442
  # Boost if role matches (fuzzy)
397
443
  if extracted_role and record.known_for_role:
398
444
  if self._role_matches(extracted_role, record.known_for_role):
@@ -458,6 +504,18 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
458
504
 
459
505
  return False
460
506
 
507
+ def _normalize_person_name(self, name: str) -> str:
508
+ """Normalize person name for comparison."""
509
+ # Lowercase and strip
510
+ normalized = name.lower().strip()
511
+ # Remove common titles
512
+ for title in ["dr.", "dr ", "mr.", "mr ", "mrs.", "mrs ", "ms.", "ms ", "prof.", "prof "]:
513
+ if normalized.startswith(title):
514
+ normalized = normalized[len(title):]
515
+ # Remove extra whitespace
516
+ normalized = " ".join(normalized.split())
517
+ return normalized
518
+
461
519
  def _normalize_org_name(self, name: str) -> str:
462
520
  """Simple org name normalization."""
463
521
  # Lowercase
@@ -592,52 +650,51 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
592
650
  person_name: str,
593
651
  context_text: str,
594
652
  ) -> Optional[EntityQualifiers]:
595
- """Extract role and org using Gemma3."""
653
+ """Extract role and org using Gemma3 with simple line-based output."""
596
654
  if self._llm is None:
597
655
  return None
598
656
 
599
657
  try:
600
- prompt = f"""Extract qualifiers for a person from the given context.
601
- Instructions:
602
- - "role" = job title or position (e.g., "CEO", "President", "Director")
603
- - "org" = company or organization name (e.g., "Amazon", "Apple Inc", "Microsoft")
604
- - These are DIFFERENT things: role is a job title, org is a company name
605
- - Return null for fields not mentioned in the context
606
-
607
- Return ONLY valid JSON:
608
-
609
- E.g.
610
- <context>We interviewed Big Ducks Quacking Inc team. James is new in the role of the CEO</context>
611
- <person>James</person>
612
-
613
- Should return:
658
+ prompt = f"""Extract info about "{person_name}" from the text below.
659
+ Reply with exactly 3 lines:
660
+ NAME: the person's full name
661
+ ROLE: their job title (CEO, President, etc.) or NONE
662
+ ORG: the company/organization name or NONE
614
663
 
615
- {{"role": "CEO", "org": "Big Ducks Quacking Inc"}}
664
+ Text: {context_text[:500]}
616
665
 
617
- ---
666
+ NAME:"""
618
667
 
619
- <context>{context_text}</context>
620
- <person>{person_name}</person>
621
- """
622
-
623
- logger.debug(f"LLM request: {prompt}")
668
+ logger.debug(f"LLM extraction prompt for '{person_name}'")
624
669
  response = self._llm.generate(prompt, max_tokens=100, stop=["\n\n", "</s>"])
625
670
  logger.debug(f"LLM response: {response}")
626
671
 
627
- # Extract JSON from response
628
- json_match = re.search(r'\{[^}]+\}', response)
629
- if json_match:
630
- data = json.loads(json_match.group())
631
- role = data.get("role")
632
- org = data.get("org")
633
-
634
- # Validate: role and org should be different (reject if same)
635
- if role and org and role.lower() == org.lower():
636
- logger.debug(f"Rejected duplicate role/org: {role}")
637
- org = None # Clear org if it's same as role
638
-
639
- if role or org:
640
- return EntityQualifiers(role=role, org=org)
672
+ # Parse line-based response
673
+ lines = response.strip().split("\n")
674
+ name = None
675
+ role = None
676
+ org = None
677
+
678
+ for line in lines:
679
+ line = line.strip()
680
+ if line.startswith("NAME:"):
681
+ name = line[5:].strip()
682
+ elif line.startswith("ROLE:"):
683
+ val = line[5:].strip()
684
+ if val.upper() != "NONE":
685
+ role = val
686
+ elif line.startswith("ORG:"):
687
+ val = line[4:].strip()
688
+ if val.upper() != "NONE":
689
+ org = val
690
+ # Handle case where first line is just the name (after our "NAME:" in prompt)
691
+ elif not name and line and not line.startswith(("ROLE", "ORG")):
692
+ name = line
693
+
694
+ logger.debug(f"LLM extracted: name={name!r}, role={role!r}, org={org!r}")
695
+
696
+ if role or org:
697
+ return EntityQualifiers(role=role, org=org)
641
698
 
642
699
  except Exception as e:
643
700
  logger.exception(f"LLM extraction failed: {e}")