corp_extractor-0.9.0-py3-none-any.whl → corp_extractor-0.9.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/METADATA +72 -11
  2. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/RECORD +34 -27
  3. statement_extractor/cli.py +1317 -101
  4. statement_extractor/database/embeddings.py +45 -0
  5. statement_extractor/database/hub.py +86 -136
  6. statement_extractor/database/importers/__init__.py +10 -2
  7. statement_extractor/database/importers/companies_house.py +16 -2
  8. statement_extractor/database/importers/companies_house_officers.py +431 -0
  9. statement_extractor/database/importers/gleif.py +23 -0
  10. statement_extractor/database/importers/import_utils.py +264 -0
  11. statement_extractor/database/importers/sec_edgar.py +17 -0
  12. statement_extractor/database/importers/sec_form4.py +512 -0
  13. statement_extractor/database/importers/wikidata.py +151 -43
  14. statement_extractor/database/importers/wikidata_dump.py +2282 -0
  15. statement_extractor/database/importers/wikidata_people.py +867 -325
  16. statement_extractor/database/migrate_v2.py +852 -0
  17. statement_extractor/database/models.py +155 -7
  18. statement_extractor/database/schema_v2.py +409 -0
  19. statement_extractor/database/seed_data.py +359 -0
  20. statement_extractor/database/store.py +3449 -233
  21. statement_extractor/document/deduplicator.py +10 -12
  22. statement_extractor/extractor.py +1 -1
  23. statement_extractor/models/__init__.py +3 -2
  24. statement_extractor/models/statement.py +15 -17
  25. statement_extractor/models.py +1 -1
  26. statement_extractor/pipeline/context.py +5 -5
  27. statement_extractor/pipeline/orchestrator.py +12 -12
  28. statement_extractor/plugins/base.py +17 -17
  29. statement_extractor/plugins/extractors/gliner2.py +28 -28
  30. statement_extractor/plugins/qualifiers/embedding_company.py +7 -5
  31. statement_extractor/plugins/qualifiers/person.py +120 -53
  32. statement_extractor/plugins/splitters/t5_gemma.py +35 -39
  33. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/WHEEL +0 -0
  34. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/entry_points.txt +0 -0

statement_extractor/plugins/qualifiers/person.py
@@ -9,7 +9,6 @@ Then searches the person database to find canonical matches for notable people
  (those in Wikipedia/Wikidata), using extracted role/org to help disambiguate.
  """
 
- import json
  import logging
  import re
  from typing import Optional
@@ -44,11 +43,12 @@ Candidates from database (with Wikipedia info):
 
  Task: Select the BEST match, or respond "NONE" if no candidate is a good match.
 
- Rules:
- - The match should refer to the same person
- - Consider whether the role and organization from the text match the Wikipedia info
- - Different people with similar names should NOT match
- - If the extracted name is too generic or ambiguous, respond "NONE"
+ IMPORTANT RULES:
+ 1. The candidate name must closely match the extracted name "{query_name}"
+ 2. Similar-sounding names are NOT matches (e.g., "Andy Vassies" does NOT match "Andy Jassy")
+ 3. If no candidate has a name that matches "{query_name}", respond "NONE"
+ 4. Consider role and organization context only AFTER confirming name match
+ 5. When in doubt, prefer "NONE" over a wrong match
 
  Respond with ONLY the number of the best match (1, 2, 3, etc.) or "NONE".
  """
@@ -260,7 +260,7 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
  if result and (result.role or result.org):
  qualifiers = result
 
- # Fallback to pattern matching
+ # Fallback to pattern matching (only if LLM extraction returned nothing)
  if qualifiers is None:
  qualifiers = self._extract_with_patterns(entity.text, full_text)
 
@@ -313,42 +313,79 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
  if embedder is None:
  return None
 
- # Embed the person name
- logger.debug(f" Embedding person name: '{person_name}'")
- query_embedding = embedder.embed(person_name)
+ # Log extracted context
+ logger.debug(f" Person search context:")
+ logger.debug(f" Name: '{person_name}'")
+ logger.debug(f" Extracted role: {extracted_role or '(none)'}")
+ logger.debug(f" Extracted org: {extracted_org or '(none)'}")
 
- # Search database with text pre-filtering
+ # Build query text with context for better embedding match
+ # This matches how PersonRecord.get_embedding_text() builds embedding text
+ query_parts = [person_name]
+ if extracted_role:
+ query_parts.append(extracted_role)
+ if extracted_org:
+ query_parts.append(extracted_org)
+ query_text = " | ".join(query_parts)
+
+ logger.debug(f" Embedding query: '{query_text}'")
+ query_embedding = embedder.embed(query_text)
+
+ # Search database with text pre-filtering on name only
  logger.debug(f" Searching person database...")
  results = database.search(
  query_embedding,
- top_k=self._top_k,
+ top_k=self._top_k * 3, # Fetch more to allow for org filtering
  query_text=person_name,
  )
 
+ logger.debug(f" Database returned {len(results)} raw results")
+
+ # If org was extracted, boost candidates that match the org
+ if extracted_org:
+ # Re-score with org preference
+ org_matched = []
+ org_unmatched = []
+ for record, sim in results:
+ if record.known_for_org and self._org_matches(extracted_org, record.known_for_org):
+ logger.debug(f" Org match: {record.name} at {record.known_for_org}")
+ org_matched.append((record, sim))
+ else:
+ org_unmatched.append((record, sim))
+ # Prioritize org matches
+ if org_matched:
+ logger.info(f" Found {len(org_matched)} candidates matching org '{extracted_org}'")
+ results = org_matched + org_unmatched
+ else:
+ logger.debug(f" No candidates match org '{extracted_org}'")
+
  # Filter by minimum similarity
  results = [(r, s) for r, s in results if s >= self._min_similarity]
+ logger.debug(f" After min_similarity filter ({self._min_similarity}): {len(results)} results")
 
  if not results:
  logger.debug(f" No person matches found above threshold {self._min_similarity}")
  return None
 
- # Boost scores based on role/org matching
+ # Boost scores based on name/role/org matching
  scored_results = []
  for record, similarity in results:
  boosted_score = self._compute_match_score(
- record, similarity, extracted_role, extracted_org
+ record, similarity, extracted_role, extracted_org, query_name=person_name
  )
  scored_results.append((record, similarity, boosted_score))
 
  # Sort by boosted score
  scored_results.sort(key=lambda x: x[2], reverse=True)
 
- # Log top candidates
+ # Log top candidates with detailed context
  logger.info(f" Found {len(scored_results)} candidates for '{person_name}':")
  for i, (record, sim, boosted) in enumerate(scored_results[:5], 1):
  role_str = f" ({record.known_for_role})" if record.known_for_role else ""
  org_str = f" at {record.known_for_org}" if record.known_for_org else ""
- logger.info(f" {i}. {record.name}{role_str}{org_str} (sim={sim:.3f}, boosted={boosted:.3f})")
+ boost_delta = boosted - sim
+ boost_info = f" [+{boost_delta:.3f} boost]" if boost_delta > 0 else ""
+ logger.info(f" {i}. {record.name}{role_str}{org_str} (sim={sim:.3f}, boosted={boosted:.3f}{boost_info})")
 
  # Select best match using LLM if available
  logger.info(f" Selecting best match (LLM={self._llm is not None})...")
@@ -373,6 +410,9 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
  "similarity": similarity,
  "known_for_role": record.known_for_role,
  "known_for_org": record.known_for_org,
+ "birth_date": record.birth_date,
+ "death_date": record.death_date,
+ "is_historic": record.is_historic,
  },
  )
 
@@ -382,6 +422,7 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
  embedding_similarity: float,
  extracted_role: Optional[str],
  extracted_org: Optional[str],
+ query_name: Optional[str] = None,
  ) -> float:
  """
  Compute boosted match score using role/org context.
@@ -390,6 +431,14 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
  """
  score = embedding_similarity
 
+ # Major boost for exact name match (normalized)
+ if query_name:
+ query_norm = self._normalize_person_name(query_name)
+ record_norm = self._normalize_person_name(record.name)
+ if query_norm == record_norm:
+ score += 0.25 # +25% boost for exact name match
+ logger.debug(f" Exact name match boost: '{query_name}' == '{record.name}'")
+
  # Boost if role matches (fuzzy)
  if extracted_role and record.known_for_role:
  if self._role_matches(extracted_role, record.known_for_role):
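
Note: the name boost is additive, not multiplicative. Worked example: a candidate whose normalized name equals the query at sim=0.62 ends up at 0.87 and overtakes a near-miss name at sim=0.80 — presumably the intent, so that an exact name beats a slightly better embedding match.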
@@ -455,6 +504,18 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
 
  return False
 
+ def _normalize_person_name(self, name: str) -> str:
+ """Normalize person name for comparison."""
+ # Lowercase and strip
+ normalized = name.lower().strip()
+ # Remove common titles
+ for title in ["dr.", "dr ", "mr.", "mr ", "mrs.", "mrs ", "ms.", "ms ", "prof.", "prof "]:
+ if normalized.startswith(title):
+ normalized = normalized[len(title):]
+ # Remove extra whitespace
+ normalized = " ".join(normalized.split())
+ return normalized
+
  def _normalize_org_name(self, name: str) -> str:
  """Simple org name normalization."""
  # Lowercase
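
Note: a quick check of the new normalizer's behavior (assuming qualifier is a PersonQualifierPlugin instance; results worked out by hand from the code above):

    qualifier._normalize_person_name("Dr. Jane   Smith")    # -> 'jane smith'
    qualifier._normalize_person_name("  MR WERNER VOGELS")  # -> 'werner vogels'
    qualifier._normalize_person_name("Tim Cook")            # -> 'tim cook'

Titles are stripped only from the start of the string, so a suffix such as "Jane Smith, M.D." passes through intact.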
@@ -524,8 +585,15 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
  role_str = f", {record.known_for_role}" if record.known_for_role else ""
  org_str = f" at {record.known_for_org}" if record.known_for_org else ""
  country_str = f", {record.country}" if record.country else ""
+ # Include life dates for context (helps identify historic figures)
+ dates_parts = []
+ if record.birth_date:
+ dates_parts.append(f"b. {record.birth_date[:4]}") # Just year
+ if record.death_date:
+ dates_parts.append(f"d. {record.death_date[:4]}") # Just year
+ dates_str = f" [{' - '.join(dates_parts)}]" if dates_parts else ""
  candidate_lines.append(
- f"{i}. {record.name}{role_str}{org_str}{country_str}{dates_str} (score: {boosted:.2f})"
+ f"{i}. {record.name}{role_str}{org_str}{country_str}{dates_str} (score: {boosted:.2f})"
  )
 
  # Build context info from extracted role/org
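
Note: with these additions, a candidate line shown to the selection LLM looks like the following (illustrative record, not real data; the [:4] slice assumes birth_date/death_date are ISO-format date strings, so it yields the year):

    3. Henry Ford, Industrialist at Ford Motor Company, US [b. 1863 - d. 1947] (score: 0.81)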
@@ -582,52 +650,51 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
  person_name: str,
  context_text: str,
  ) -> Optional[EntityQualifiers]:
- """Extract role and org using Gemma3."""
+ """Extract role and org using Gemma3 with simple line-based output."""
  if self._llm is None:
  return None
 
  try:
- prompt = f"""Extract qualifiers for a person from the given context.
- Instructions:
- - "role" = job title or position (e.g., "CEO", "President", "Director")
- - "org" = company or organization name (e.g., "Amazon", "Apple Inc", "Microsoft")
- - These are DIFFERENT things: role is a job title, org is a company name
- - Return null for fields not mentioned in the context
-
- Return ONLY valid JSON:
-
- E.g.
- <context>We interviewed Big Ducks Quacking Inc team. James is new in the role of the CEO</context>
- <person>James</person>
-
- Should return:
+ prompt = f"""Extract info about "{person_name}" from the text below.
+ Reply with exactly 3 lines:
+ NAME: the person's full name
+ ROLE: their job title (CEO, President, etc.) or NONE
+ ORG: the company/organization name or NONE
 
- {{"role": "CEO", "org": "Big Ducks Quacking Inc"}}
+ Text: {context_text[:500]}
 
- ---
+ NAME:"""
 
- <context>{context_text}</context>
- <person>{person_name}</person>
- """
-
- logger.debug(f"LLM request: {prompt}")
+ logger.debug(f"LLM extraction prompt for '{person_name}'")
  response = self._llm.generate(prompt, max_tokens=100, stop=["\n\n", "</s>"])
  logger.debug(f"LLM response: {response}")
 
- # Extract JSON from response
- json_match = re.search(r'\{[^}]+\}', response)
- if json_match:
- data = json.loads(json_match.group())
- role = data.get("role")
- org = data.get("org")
-
- # Validate: role and org should be different (reject if same)
- if role and org and role.lower() == org.lower():
- logger.debug(f"Rejected duplicate role/org: {role}")
- org = None # Clear org if it's same as role
-
- if role or org:
- return EntityQualifiers(role=role, org=org)
+ # Parse line-based response
+ lines = response.strip().split("\n")
+ name = None
+ role = None
+ org = None
+
+ for line in lines:
+ line = line.strip()
+ if line.startswith("NAME:"):
+ name = line[5:].strip()
+ elif line.startswith("ROLE:"):
+ val = line[5:].strip()
+ if val.upper() != "NONE":
+ role = val
+ elif line.startswith("ORG:"):
+ val = line[4:].strip()
+ if val.upper() != "NONE":
+ org = val
+ # Handle case where first line is just the name (after our "NAME:" in prompt)
+ elif not name and line and not line.startswith(("ROLE", "ORG")):
+ name = line
+
+ logger.debug(f"LLM extracted: name={name!r}, role={role!r}, org={org!r}")
+
+ if role or org:
+ return EntityQualifiers(role=role, org=org)
 
  except Exception as e:
  logger.exception(f"LLM extraction failed: {e}")

statement_extractor/plugins/splitters/t5_gemma.py
@@ -1,8 +1,8 @@
  """
- T5GemmaSplitter - Stage 1 plugin that wraps the existing StatementExtractor.
+ T5GemmaSplitter - Stage 1 plugin that splits text into atomic sentences.
 
- Uses T5-Gemma2 model with Diverse Beam Search to generate high-quality
- subject-predicate-object triples from text.
+ Uses T5-Gemma2 model with Diverse Beam Search to split unstructured text
+ into atomic statements that can be converted to triples in Stage 2.
  """
 
  import logging
@@ -12,7 +12,7 @@ from typing import Optional
  from ..base import BaseSplitterPlugin, PluginCapability
  from ...pipeline.context import PipelineContext
  from ...pipeline.registry import PluginRegistry
- from ...models import RawTriple
+ from ...models import SplitSentence
 
  logger = logging.getLogger(__name__)
 
@@ -20,10 +20,11 @@ logger = logging.getLogger(__name__)
  @PluginRegistry.splitter
  class T5GemmaSplitter(BaseSplitterPlugin):
  """
- Splitter plugin that uses T5-Gemma2 for triple extraction.
+ Splitter plugin that uses T5-Gemma2 to split text into atomic sentences.
 
- Wraps the existing StatementExtractor from extractor.py to produce
- RawTriple objects for the pipeline.
+ Uses the T5-Gemma2 model to identify and extract atomic statements
+ from unstructured text. Each sentence can be converted to a
+ subject-predicate-object triple in Stage 2.
  """
 
  def __init__(
@@ -65,7 +66,7 @@ class T5GemmaSplitter(BaseSplitterPlugin):
 
  @property
  def description(self) -> str:
- return "T5-Gemma2 model for extracting triples using Diverse Beam Search"
+ return "T5-Gemma2 model for splitting text into atomic sentences"
 
  @property
  def model_vram_gb(self) -> float:
@@ -94,16 +95,16 @@ class T5GemmaSplitter(BaseSplitterPlugin):
  self,
  text: str,
  context: PipelineContext,
- ) -> list[RawTriple]:
+ ) -> list[SplitSentence]:
  """
- Split text into raw triples using T5-Gemma2.
+ Split text into atomic sentences using T5-Gemma2.
 
  Args:
  text: Input text to split
  context: Pipeline context
 
  Returns:
- List of RawTriple objects
+ List of SplitSentence objects
  """
  logger.debug(f"T5GemmaSplitter processing {len(text)} chars")
 
@@ -129,19 +130,19 @@ class T5GemmaSplitter(BaseSplitterPlugin):
  extractor = self._get_extractor()
  xml_output = extractor.extract_as_xml(text, options)
 
- # Parse XML to RawTriple objects
- raw_triples = self._parse_xml_to_raw_triples(xml_output)
+ # Parse XML to SplitSentence objects
+ sentences = self._parse_xml_to_sentences(xml_output)
 
- logger.info(f"T5GemmaSplitter produced {len(raw_triples)} raw triples")
- return raw_triples
+ logger.info(f"T5GemmaSplitter produced {len(sentences)} sentences")
+ return sentences
 
  def split_batch(
  self,
  texts: list[str],
  context: PipelineContext,
- ) -> list[list[RawTriple]]:
+ ) -> list[list[SplitSentence]]:
  """
- Split multiple texts into atomic triples using batch processing.
+ Split multiple texts into atomic sentences using batch processing.
 
  Processes all texts through the T5-Gemma2 model in batches
  sized for optimal GPU utilization.
@@ -151,7 +152,7 @@ class T5GemmaSplitter(BaseSplitterPlugin):
  context: Pipeline context
 
  Returns:
- List of RawTriple lists, one per input text
+ List of SplitSentence lists, one per input text
  """
  if not texts:
  return []
@@ -177,7 +178,7 @@ class T5GemmaSplitter(BaseSplitterPlugin):
  )
 
  extractor = self._get_extractor()
- all_results: list[list[RawTriple]] = []
+ all_results: list[list[SplitSentence]] = []
 
  # Process in batches
  for i in range(0, len(texts), batch_size):
@@ -187,8 +188,8 @@ class T5GemmaSplitter(BaseSplitterPlugin):
  batch_results = self._process_batch(batch_texts, extractor, options)
  all_results.extend(batch_results)
 
- total_triples = sum(len(r) for r in all_results)
- logger.info(f"T5GemmaSplitter batch produced {total_triples} total triples from {len(texts)} texts")
+ total_sentences = sum(len(r) for r in all_results)
+ logger.info(f"T5GemmaSplitter batch produced {total_sentences} total sentences from {len(texts)} texts")
  return all_results
 
  def _process_batch(
  def _process_batch(
@@ -196,7 +197,7 @@ class T5GemmaSplitter(BaseSplitterPlugin):
196
197
  texts: list[str],
197
198
  extractor,
198
199
  options,
199
- ) -> list[list[RawTriple]]:
200
+ ) -> list[list[SplitSentence]]:
200
201
  """
201
202
  Process a batch of texts through the model.
202
203
 
@@ -249,7 +250,7 @@ class T5GemmaSplitter(BaseSplitterPlugin):
  )
 
  # Decode and parse each output
- results: list[list[RawTriple]] = []
+ results: list[list[SplitSentence]] = []
  end_tag = "</statements>"
 
  for output in outputs:
@@ -260,33 +261,28 @@
  end_pos = decoded.find(end_tag) + len(end_tag)
  decoded = decoded[:end_pos]
 
- triples = self._parse_xml_to_raw_triples(decoded)
- results.append(triples)
+ sentences = self._parse_xml_to_sentences(decoded)
+ results.append(sentences)
 
  return results
 
  # Regex pattern to extract <text> content from <stmt> blocks
  _STMT_TEXT_PATTERN = re.compile(r'<stmt>.*?<text>(.*?)</text>.*?</stmt>', re.DOTALL)
 
- def _parse_xml_to_raw_triples(self, xml_output: str) -> list[RawTriple]:
- """Extract source sentences from <stmt><text>...</text></stmt> blocks."""
- raw_triples = []
+ def _parse_xml_to_sentences(self, xml_output: str) -> list[SplitSentence]:
+ """Extract atomic sentences from <stmt><text>...</text></stmt> blocks."""
+ sentences = []
 
  # Find all <text> content within <stmt> blocks
  text_matches = self._STMT_TEXT_PATTERN.findall(xml_output)
  logger.debug(f"Found {len(text_matches)} stmt text blocks via regex")
 
- for source_text in text_matches:
- source_text = source_text.strip()
- if source_text:
- raw_triples.append(RawTriple(
- subject_text="",
- predicate_text="",
- object_text="",
- source_sentence=source_text,
- ))
-
- return raw_triples
+ for sentence_text in text_matches:
+ sentence_text = sentence_text.strip()
+ if sentence_text:
+ sentences.append(SplitSentence(text=sentence_text))
+
+ return sentences
 
 
  # Allow importing without decorator for testing
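
Note: _STMT_TEXT_PATTERN is unchanged by this release; only the objects built from its matches changed. For reference, its behavior on a typical <statements> payload (illustrative XML, not captured model output):

    import re

    _STMT_TEXT_PATTERN = re.compile(r'<stmt>.*?<text>(.*?)</text>.*?</stmt>', re.DOTALL)

    xml = (
        "<statements>"
        "<stmt><text>Amazon acquired Whole Foods in 2017.</text></stmt>"
        "<stmt><text>Andy Jassy is the CEO of Amazon.</text></stmt>"
        "</statements>"
    )
    print(_STMT_TEXT_PATTERN.findall(xml))
    # ['Amazon acquired Whole Foods in 2017.', 'Andy Jassy is the CEO of Amazon.']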