corp-extractor 0.9.0-py3-none-any.whl → 0.9.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/METADATA +72 -11
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/RECORD +34 -27
- statement_extractor/cli.py +1317 -101
- statement_extractor/database/embeddings.py +45 -0
- statement_extractor/database/hub.py +86 -136
- statement_extractor/database/importers/__init__.py +10 -2
- statement_extractor/database/importers/companies_house.py +16 -2
- statement_extractor/database/importers/companies_house_officers.py +431 -0
- statement_extractor/database/importers/gleif.py +23 -0
- statement_extractor/database/importers/import_utils.py +264 -0
- statement_extractor/database/importers/sec_edgar.py +17 -0
- statement_extractor/database/importers/sec_form4.py +512 -0
- statement_extractor/database/importers/wikidata.py +151 -43
- statement_extractor/database/importers/wikidata_dump.py +2282 -0
- statement_extractor/database/importers/wikidata_people.py +867 -325
- statement_extractor/database/migrate_v2.py +852 -0
- statement_extractor/database/models.py +155 -7
- statement_extractor/database/schema_v2.py +409 -0
- statement_extractor/database/seed_data.py +359 -0
- statement_extractor/database/store.py +3449 -233
- statement_extractor/document/deduplicator.py +10 -12
- statement_extractor/extractor.py +1 -1
- statement_extractor/models/__init__.py +3 -2
- statement_extractor/models/statement.py +15 -17
- statement_extractor/models.py +1 -1
- statement_extractor/pipeline/context.py +5 -5
- statement_extractor/pipeline/orchestrator.py +12 -12
- statement_extractor/plugins/base.py +17 -17
- statement_extractor/plugins/extractors/gliner2.py +28 -28
- statement_extractor/plugins/qualifiers/embedding_company.py +7 -5
- statement_extractor/plugins/qualifiers/person.py +120 -53
- statement_extractor/plugins/splitters/t5_gemma.py +35 -39
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/WHEEL +0 -0
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/entry_points.txt +0 -0
statement_extractor/plugins/qualifiers/person.py:

@@ -9,7 +9,6 @@ Then searches the person database to find canonical matches for notable people
 (those in Wikipedia/Wikidata), using extracted role/org to help disambiguate.
 """
 
-import json
 import logging
 import re
 from typing import Optional

@@ -44,11 +43,12 @@ Candidates from database (with Wikipedia info):
 
 Task: Select the BEST match, or respond "NONE" if no candidate is a good match.
 
-
-
-
-
-
+IMPORTANT RULES:
+1. The candidate name must closely match the extracted name "{query_name}"
+2. Similar-sounding names are NOT matches (e.g., "Andy Vassies" does NOT match "Andy Jassy")
+3. If no candidate has a name that matches "{query_name}", respond "NONE"
+4. Consider role and organization context only AFTER confirming name match
+5. When in doubt, prefer "NONE" over a wrong match
 
 Respond with ONLY the number of the best match (1, 2, 3, etc.) or "NONE".
 """
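The rules block in this hunk is a template fragment keyed on `{query_name}`. As a rough illustration of how it renders for one query (`SELECTION_RULES` and the standalone rendering below are invented for the example, not taken from the package):

```python
# Hypothetical rendering of the rules template above; SELECTION_RULES is an
# invented name, and the real prompt embeds these lines in a larger f-string.
SELECTION_RULES = """IMPORTANT RULES:
1. The candidate name must closely match the extracted name "{query_name}"
2. Similar-sounding names are NOT matches (e.g., "Andy Vassies" does NOT match "Andy Jassy")
3. If no candidate has a name that matches "{query_name}", respond "NONE"
4. Consider role and organization context only AFTER confirming name match
5. When in doubt, prefer "NONE" over a wrong match"""

print(SELECTION_RULES.format(query_name="Andy Vassies"))
```

Rules 2 and 5 bias the model toward abstaining, so a near-miss such as "Andy Vassies" should yield NONE instead of the phonetically similar "Andy Jassy".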
@@ -260,7 +260,7 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
         if result and (result.role or result.org):
             qualifiers = result
 
-        # Fallback to pattern matching
+        # Fallback to pattern matching (only if LLM extraction returned nothing)
         if qualifiers is None:
             qualifiers = self._extract_with_patterns(entity.text, full_text)
 

@@ -313,42 +313,79 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
         if embedder is None:
             return None
 
-        #
-        logger.debug(f"
-
+        # Log extracted context
+        logger.debug(f" Person search context:")
+        logger.debug(f" Name: '{person_name}'")
+        logger.debug(f" Extracted role: {extracted_role or '(none)'}")
+        logger.debug(f" Extracted org: {extracted_org or '(none)'}")
 
-        #
+        # Build query text with context for better embedding match
+        # This matches how PersonRecord.get_embedding_text() builds embedding text
+        query_parts = [person_name]
+        if extracted_role:
+            query_parts.append(extracted_role)
+        if extracted_org:
+            query_parts.append(extracted_org)
+        query_text = " | ".join(query_parts)
+
+        logger.debug(f" Embedding query: '{query_text}'")
+        query_embedding = embedder.embed(query_text)
+
+        # Search database with text pre-filtering on name only
         logger.debug(f" Searching person database...")
         results = database.search(
             query_embedding,
-            top_k=self._top_k,
+            top_k=self._top_k * 3,  # Fetch more to allow for org filtering
             query_text=person_name,
         )
 
+        logger.debug(f" Database returned {len(results)} raw results")
+
+        # If org was extracted, boost candidates that match the org
+        if extracted_org:
+            # Re-score with org preference
+            org_matched = []
+            org_unmatched = []
+            for record, sim in results:
+                if record.known_for_org and self._org_matches(extracted_org, record.known_for_org):
+                    logger.debug(f" Org match: {record.name} at {record.known_for_org}")
+                    org_matched.append((record, sim))
+                else:
+                    org_unmatched.append((record, sim))
+            # Prioritize org matches
+            if org_matched:
+                logger.info(f" Found {len(org_matched)} candidates matching org '{extracted_org}'")
+                results = org_matched + org_unmatched
+            else:
+                logger.debug(f" No candidates match org '{extracted_org}'")
+
         # Filter by minimum similarity
         results = [(r, s) for r, s in results if s >= self._min_similarity]
+        logger.debug(f" After min_similarity filter ({self._min_similarity}): {len(results)} results")
 
         if not results:
             logger.debug(f" No person matches found above threshold {self._min_similarity}")
             return None
 
-        # Boost scores based on role/org matching
+        # Boost scores based on name/role/org matching
        scored_results = []
         for record, similarity in results:
             boosted_score = self._compute_match_score(
-                record, similarity, extracted_role, extracted_org
+                record, similarity, extracted_role, extracted_org, query_name=person_name
             )
             scored_results.append((record, similarity, boosted_score))
 
         # Sort by boosted score
         scored_results.sort(key=lambda x: x[2], reverse=True)
 
-        # Log top candidates
+        # Log top candidates with detailed context
         logger.info(f" Found {len(scored_results)} candidates for '{person_name}':")
         for i, (record, sim, boosted) in enumerate(scored_results[:5], 1):
             role_str = f" ({record.known_for_role})" if record.known_for_role else ""
             org_str = f" at {record.known_for_org}" if record.known_for_org else ""
-
+            boost_delta = boosted - sim
+            boost_info = f" [+{boost_delta:.3f} boost]" if boost_delta > 0 else ""
+            logger.info(f" {i}. {record.name}{role_str}{org_str} (sim={sim:.3f}, boosted={boosted:.3f}{boost_info})")
 
         # Select best match using LLM if available
         logger.info(f" Selecting best match (LLM={self._llm is not None})...")
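Two details in this hunk are worth calling out: the query text is built as `name | role | org`, the same shape the stored PersonRecord embeddings use, and the org re-ranking is a stable partition, so similarity order is preserved inside each group. A minimal sketch with invented values:

```python
# Sketch of the query construction above; the example values are invented.
person_name = "Andy Jassy"
extracted_role = "CEO"
extracted_org = "Amazon"

query_parts = [person_name]
if extracted_role:
    query_parts.append(extracted_role)
if extracted_org:
    query_parts.append(extracted_org)
query_text = " | ".join(query_parts)
print(query_text)  # Andy Jassy | CEO | Amazon

# The org re-ranking moves org matches to the front but keeps the
# similarity order within each group (a stable partition).
results = [("A", 0.9), ("B", 0.8), ("C", 0.7)]        # (record, sim)
org_matched = [r for r in results if r[0] == "B"]      # pretend only B matches the org
org_unmatched = [r for r in results if r[0] != "B"]
print(org_matched + org_unmatched)                     # [('B', 0.8), ('A', 0.9), ('C', 0.7)]
```

Fetching `top_k * 3` raw candidates gives the org filter something to reorder before the list is cut back down.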
@@ -373,6 +410,9 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
                 "similarity": similarity,
                 "known_for_role": record.known_for_role,
                 "known_for_org": record.known_for_org,
+                "birth_date": record.birth_date,
+                "death_date": record.death_date,
+                "is_historic": record.is_historic,
             },
         )
 

@@ -382,6 +422,7 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
         embedding_similarity: float,
         extracted_role: Optional[str],
         extracted_org: Optional[str],
+        query_name: Optional[str] = None,
     ) -> float:
         """
         Compute boosted match score using role/org context.

@@ -390,6 +431,14 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
         """
         score = embedding_similarity
 
+        # Major boost for exact name match (normalized)
+        if query_name:
+            query_norm = self._normalize_person_name(query_name)
+            record_norm = self._normalize_person_name(record.name)
+            if query_norm == record_norm:
+                score += 0.25  # +25% boost for exact name match
+                logger.debug(f" Exact name match boost: '{query_name}' == '{record.name}'")
+
         # Boost if role matches (fuzzy)
         if extracted_role and record.known_for_role:
             if self._role_matches(extracted_role, record.known_for_role):
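The exact-name boost is a flat +0.25 added to the raw embedding similarity, enough to lift an exact match past a near-miss with higher raw similarity. A worked example with invented numbers:

```python
# Invented similarities, illustrating the +0.25 exact-name boost above.
near_miss_sim = 0.84   # record "Andy Jassy" retrieved for query "Andy Vassies"
exact_sim = 0.78       # record "Andy Jassy" retrieved for query "Andy Jassy"

near_miss_score = near_miss_sim     # names differ after normalization: no boost
exact_score = exact_sim + 0.25      # normalized names equal: boosted to 1.03
assert exact_score > near_miss_score
```

Note that boosted scores can exceed 1.0; they are ranking scores, not probabilities.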
@@ -455,6 +504,18 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
 
         return False
 
+    def _normalize_person_name(self, name: str) -> str:
+        """Normalize person name for comparison."""
+        # Lowercase and strip
+        normalized = name.lower().strip()
+        # Remove common titles
+        for title in ["dr.", "dr ", "mr.", "mr ", "mrs.", "mrs ", "ms.", "ms ", "prof.", "prof "]:
+            if normalized.startswith(title):
+                normalized = normalized[len(title):]
+        # Remove extra whitespace
+        normalized = " ".join(normalized.split())
+        return normalized
+
     def _normalize_org_name(self, name: str) -> str:
         """Simple org name normalization."""
         # Lowercase
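Behaviour of the new helper on a few inputs, shown here as a free function with the body copied from the hunk above:

```python
def normalize_person_name(name: str) -> str:
    """Free-function copy of _normalize_person_name, for illustration only."""
    normalized = name.lower().strip()
    for title in ["dr.", "dr ", "mr.", "mr ", "mrs.", "mrs ", "ms.", "ms ", "prof.", "prof "]:
        if normalized.startswith(title):
            normalized = normalized[len(title):]
    return " ".join(normalized.split())

assert normalize_person_name("Dr. Angela  Merkel") == "angela merkel"
assert normalize_person_name("  MR SMITH ") == "smith"
assert normalize_person_name("Drew Houston") == "drew houston"  # bare "dr" prefix is not stripped
```

One quirk: the title list is scanned in a single pass, so stacked titles like "Prof. Dr." lose only the professor title.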
@@ -524,8 +585,15 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
             role_str = f", {record.known_for_role}" if record.known_for_role else ""
             org_str = f" at {record.known_for_org}" if record.known_for_org else ""
             country_str = f", {record.country}" if record.country else ""
+            # Include life dates for context (helps identify historic figures)
+            dates_parts = []
+            if record.birth_date:
+                dates_parts.append(f"b. {record.birth_date[:4]}")  # Just year
+            if record.death_date:
+                dates_parts.append(f"d. {record.death_date[:4]}")  # Just year
+            dates_str = f" [{' - '.join(dates_parts)}]" if dates_parts else ""
             candidate_lines.append(
-                f"{i}. {record.name}{role_str}{org_str}{country_str} (score: {boosted:.2f})"
+                f"{i}. {record.name}{role_str}{org_str}{country_str}{dates_str} (score: {boosted:.2f})"
             )
 
         # Build context info from extracted role/org
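The `[:4]` slice assumes ISO-style YYYY-MM-DD date strings, keeping just the year. A sketch of the resulting suffix with invented field values:

```python
# Invented record values; assumes dates are stored as ISO "YYYY-MM-DD" strings.
birth_date, death_date = "1955-02-24", "2011-10-05"

dates_parts = []
if birth_date:
    dates_parts.append(f"b. {birth_date[:4]}")
if death_date:
    dates_parts.append(f"d. {death_date[:4]}")
dates_str = f" [{' - '.join(dates_parts)}]" if dates_parts else ""
print(dates_str)  # " [b. 1955 - d. 2011]"
```

The suffix gives the LLM selector a cheap signal for telling a living executive apart from a historic figure with the same name.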
@@ -582,52 +650,51 @@ class PersonQualifierPlugin(BaseQualifierPlugin):
         person_name: str,
         context_text: str,
     ) -> Optional[EntityQualifiers]:
-        """Extract role and org using Gemma3."""
+        """Extract role and org using Gemma3 with simple line-based output."""
         if self._llm is None:
             return None
 
         try:
-            prompt = f"""Extract
-
-
-
-
-- Return null for fields not mentioned in the context
-
-Return ONLY valid JSON:
-
-E.g.
-<context>We interviewed Big Ducks Quacking Inc team. James is new in the role of the CEO</context>
-<person>James</person>
-
-Should return:
+            prompt = f"""Extract info about "{person_name}" from the text below.
+Reply with exactly 3 lines:
+NAME: the person's full name
+ROLE: their job title (CEO, President, etc.) or NONE
+ORG: the company/organization name or NONE
 
-
+Text: {context_text[:500]}
 
-
+NAME:"""
 
-
-<person>{person_name}</person>
-"""
-
-            logger.debug(f"LLM request: {prompt}")
+            logger.debug(f"LLM extraction prompt for '{person_name}'")
             response = self._llm.generate(prompt, max_tokens=100, stop=["\n\n", "</s>"])
             logger.debug(f"LLM response: {response}")
 
-            #
-
-
-
-
-
-
-
-            if
-
-
-
-
-
+            # Parse line-based response
+            lines = response.strip().split("\n")
+            name = None
+            role = None
+            org = None
+
+            for line in lines:
+                line = line.strip()
+                if line.startswith("NAME:"):
+                    name = line[5:].strip()
+                elif line.startswith("ROLE:"):
+                    val = line[5:].strip()
+                    if val.upper() != "NONE":
+                        role = val
+                elif line.startswith("ORG:"):
+                    val = line[4:].strip()
+                    if val.upper() != "NONE":
+                        org = val
+                # Handle case where first line is just the name (after our "NAME:" in prompt)
+                elif not name and line and not line.startswith(("ROLE", "ORG")):
+                    name = line
+
+            logger.debug(f"LLM extracted: name={name!r}, role={role!r}, org={org!r}")
+
+            if role or org:
+                return EntityQualifiers(role=role, org=org)
 
         except Exception as e:
             logger.exception(f"LLM extraction failed: {e}")
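Because the prompt ends with `NAME:`, the model's first output line is usually the bare name without the label, which is what the final `elif` branch handles. A standalone copy of the parsing loop, run on an invented reply:

```python
# Standalone copy of the line-based parsing above; the reply text is invented.
response = """ Andy Jassy
ROLE: CEO
ORG: Amazon"""

name = role = org = None
for line in response.strip().split("\n"):
    line = line.strip()
    if line.startswith("NAME:"):
        name = line[5:].strip()
    elif line.startswith("ROLE:"):
        val = line[5:].strip()
        if val.upper() != "NONE":
            role = val
    elif line.startswith("ORG:"):
        val = line[4:].strip()
        if val.upper() != "NONE":
            org = val
    elif not name and line and not line.startswith(("ROLE", "ORG")):
        name = line  # bare first line continues the trailing "NAME:" in the prompt

print(name, role, org)  # Andy Jassy CEO Amazon
```

This replaces the earlier JSON-based prompt; three labelled lines are far easier for a small model to emit without malformed output.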
statement_extractor/plugins/splitters/t5_gemma.py:

@@ -1,8 +1,8 @@
 """
-T5GemmaSplitter - Stage 1 plugin that
+T5GemmaSplitter - Stage 1 plugin that splits text into atomic sentences.
 
-Uses T5-Gemma2 model with Diverse Beam Search to
-
+Uses T5-Gemma2 model with Diverse Beam Search to split unstructured text
+into atomic statements that can be converted to triples in Stage 2.
 """
 
 import logging

@@ -12,7 +12,7 @@ from typing import Optional
 from ..base import BaseSplitterPlugin, PluginCapability
 from ...pipeline.context import PipelineContext
 from ...pipeline.registry import PluginRegistry
-from ...models import
+from ...models import SplitSentence
 
 logger = logging.getLogger(__name__)
 

@@ -20,10 +20,11 @@ logger = logging.getLogger(__name__)
 @PluginRegistry.splitter
 class T5GemmaSplitter(BaseSplitterPlugin):
     """
-    Splitter plugin that uses T5-Gemma2
+    Splitter plugin that uses T5-Gemma2 to split text into atomic sentences.
 
-
-
+    Uses the T5-Gemma2 model to identify and extract atomic statements
+    from unstructured text. Each sentence can be converted to a
+    subject-predicate-object triple in Stage 2.
     """
 
     def __init__(

@@ -65,7 +66,7 @@ class T5GemmaSplitter(BaseSplitterPlugin):
 
     @property
     def description(self) -> str:
-        return "T5-Gemma2 model for
+        return "T5-Gemma2 model for splitting text into atomic sentences"
 
     @property
     def model_vram_gb(self) -> float:

@@ -94,16 +95,16 @@ class T5GemmaSplitter(BaseSplitterPlugin):
         self,
         text: str,
         context: PipelineContext,
-    ) -> list[
+    ) -> list[SplitSentence]:
         """
-        Split text into
+        Split text into atomic sentences using T5-Gemma2.
 
         Args:
             text: Input text to split
             context: Pipeline context
 
         Returns:
-            List of
+            List of SplitSentence objects
         """
         logger.debug(f"T5GemmaSplitter processing {len(text)} chars")
 

@@ -129,19 +130,19 @@ class T5GemmaSplitter(BaseSplitterPlugin):
         extractor = self._get_extractor()
         xml_output = extractor.extract_as_xml(text, options)
 
-        # Parse XML to
-
+        # Parse XML to SplitSentence objects
+        sentences = self._parse_xml_to_sentences(xml_output)
 
-        logger.info(f"T5GemmaSplitter produced {len(
-        return
+        logger.info(f"T5GemmaSplitter produced {len(sentences)} sentences")
+        return sentences
 
     def split_batch(
         self,
         texts: list[str],
         context: PipelineContext,
-    ) -> list[list[
+    ) -> list[list[SplitSentence]]:
         """
-        Split multiple texts into atomic
+        Split multiple texts into atomic sentences using batch processing.
 
         Processes all texts through the T5-Gemma2 model in batches
         sized for optimal GPU utilization.

@@ -151,7 +152,7 @@ class T5GemmaSplitter(BaseSplitterPlugin):
             context: Pipeline context
 
         Returns:
-            List of
+            List of SplitSentence lists, one per input text
         """
         if not texts:
             return []

@@ -177,7 +178,7 @@ class T5GemmaSplitter(BaseSplitterPlugin):
         )
 
         extractor = self._get_extractor()
-        all_results: list[list[
+        all_results: list[list[SplitSentence]] = []
 
         # Process in batches
         for i in range(0, len(texts), batch_size):

@@ -187,8 +188,8 @@ class T5GemmaSplitter(BaseSplitterPlugin):
             batch_results = self._process_batch(batch_texts, extractor, options)
             all_results.extend(batch_results)
 
-
-        logger.info(f"T5GemmaSplitter batch produced {
+        total_sentences = sum(len(r) for r in all_results)
+        logger.info(f"T5GemmaSplitter batch produced {total_sentences} total sentences from {len(texts)} texts")
         return all_results
 
     def _process_batch(

@@ -196,7 +197,7 @@ class T5GemmaSplitter(BaseSplitterPlugin):
         texts: list[str],
         extractor,
         options,
-    ) -> list[list[
+    ) -> list[list[SplitSentence]]:
         """
         Process a batch of texts through the model.
 

@@ -249,7 +250,7 @@ class T5GemmaSplitter(BaseSplitterPlugin):
         )
 
         # Decode and parse each output
-        results: list[list[
+        results: list[list[SplitSentence]] = []
         end_tag = "</statements>"
 
         for output in outputs:

@@ -260,33 +261,28 @@ class T5GemmaSplitter(BaseSplitterPlugin):
                 end_pos = decoded.find(end_tag) + len(end_tag)
                 decoded = decoded[:end_pos]
 
-
-            results.append(
+            sentences = self._parse_xml_to_sentences(decoded)
+            results.append(sentences)
 
         return results
 
     # Regex pattern to extract <text> content from <stmt> blocks
     _STMT_TEXT_PATTERN = re.compile(r'<stmt>.*?<text>(.*?)</text>.*?</stmt>', re.DOTALL)
 
-    def
-    """Extract
-
+    def _parse_xml_to_sentences(self, xml_output: str) -> list[SplitSentence]:
+        """Extract atomic sentences from <stmt><text>...</text></stmt> blocks."""
+        sentences = []
 
         # Find all <text> content within <stmt> blocks
         text_matches = self._STMT_TEXT_PATTERN.findall(xml_output)
         logger.debug(f"Found {len(text_matches)} stmt text blocks via regex")
 
-        for
-
-        if
-
-
-
-                object_text="",
-                source_sentence=source_text,
-            ))
-
-        return raw_triples
+        for sentence_text in text_matches:
+            sentence_text = sentence_text.strip()
+            if sentence_text:
+                sentences.append(SplitSentence(text=sentence_text))
+
+        return sentences
 
 
 # Allow importing without decorator for testing
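The `_STMT_TEXT_PATTERN` regex does all the parsing here: non-greedy matching with `re.DOTALL` yields one capture per `<stmt>` block even when the payload spans lines. A standalone check on invented model output:

```python
import re

# The pattern from the diff, applied to an invented <statements> payload.
STMT_TEXT_PATTERN = re.compile(r'<stmt>.*?<text>(.*?)</text>.*?</stmt>', re.DOTALL)

xml_output = """<statements>
<stmt><text>Acme Corp acquired Widget Ltd in 2023.</text></stmt>
<stmt><text>Jane Doe is the CEO of Acme Corp.</text></stmt>
</statements>"""

print(STMT_TEXT_PATTERN.findall(xml_output))
# ['Acme Corp acquired Widget Ltd in 2023.', 'Jane Doe is the CEO of Acme Corp.']
```

This also explains the rename: the old version built raw triples with empty `object_text` fields, while the new `_parse_xml_to_sentences` returns plain `SplitSentence` objects and leaves triple construction to Stage 2.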
|