corp-extractor 0.9.0__py3-none-any.whl → 0.9.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.3.dist-info}/METADATA +40 -9
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.3.dist-info}/RECORD +29 -26
- statement_extractor/cli.py +866 -77
- statement_extractor/database/hub.py +35 -127
- statement_extractor/database/importers/__init__.py +10 -2
- statement_extractor/database/importers/companies_house.py +16 -2
- statement_extractor/database/importers/companies_house_officers.py +431 -0
- statement_extractor/database/importers/gleif.py +23 -0
- statement_extractor/database/importers/sec_edgar.py +17 -0
- statement_extractor/database/importers/sec_form4.py +512 -0
- statement_extractor/database/importers/wikidata.py +151 -43
- statement_extractor/database/importers/wikidata_dump.py +1951 -0
- statement_extractor/database/importers/wikidata_people.py +823 -325
- statement_extractor/database/models.py +30 -6
- statement_extractor/database/store.py +1485 -60
- statement_extractor/document/deduplicator.py +10 -12
- statement_extractor/extractor.py +1 -1
- statement_extractor/models/__init__.py +3 -2
- statement_extractor/models/statement.py +15 -17
- statement_extractor/models.py +1 -1
- statement_extractor/pipeline/context.py +5 -5
- statement_extractor/pipeline/orchestrator.py +12 -12
- statement_extractor/plugins/base.py +17 -17
- statement_extractor/plugins/extractors/gliner2.py +28 -28
- statement_extractor/plugins/qualifiers/embedding_company.py +7 -5
- statement_extractor/plugins/qualifiers/person.py +11 -1
- statement_extractor/plugins/splitters/t5_gemma.py +35 -39
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.3.dist-info}/WHEEL +0 -0
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.3.dist-info}/entry_points.txt +0 -0
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
"""
|
|
2
|
-
T5GemmaSplitter - Stage 1 plugin that
|
|
2
|
+
T5GemmaSplitter - Stage 1 plugin that splits text into atomic sentences.
|
|
3
3
|
|
|
4
|
-
Uses T5-Gemma2 model with Diverse Beam Search to
|
|
5
|
-
|
|
4
|
+
Uses T5-Gemma2 model with Diverse Beam Search to split unstructured text
|
|
5
|
+
into atomic statements that can be converted to triples in Stage 2.
|
|
6
6
|
"""
|
|
7
7
|
|
|
8
8
|
import logging
|
|
@@ -12,7 +12,7 @@ from typing import Optional
|
|
|
12
12
|
from ..base import BaseSplitterPlugin, PluginCapability
|
|
13
13
|
from ...pipeline.context import PipelineContext
|
|
14
14
|
from ...pipeline.registry import PluginRegistry
|
|
15
|
-
from ...models import
|
|
15
|
+
from ...models import SplitSentence
|
|
16
16
|
|
|
17
17
|
logger = logging.getLogger(__name__)
|
|
18
18
|
|
|
@@ -20,10 +20,11 @@ logger = logging.getLogger(__name__)
|
|
|
20
20
|
@PluginRegistry.splitter
|
|
21
21
|
class T5GemmaSplitter(BaseSplitterPlugin):
|
|
22
22
|
"""
|
|
23
|
-
Splitter plugin that uses T5-Gemma2
|
|
23
|
+
Splitter plugin that uses T5-Gemma2 to split text into atomic sentences.
|
|
24
24
|
|
|
25
|
-
|
|
26
|
-
|
|
25
|
+
Uses the T5-Gemma2 model to identify and extract atomic statements
|
|
26
|
+
from unstructured text. Each sentence can be converted to a
|
|
27
|
+
subject-predicate-object triple in Stage 2.
|
|
27
28
|
"""
|
|
28
29
|
|
|
29
30
|
def __init__(
|
|
@@ -65,7 +66,7 @@ class T5GemmaSplitter(BaseSplitterPlugin):
|
|
|
65
66
|
|
|
66
67
|
@property
|
|
67
68
|
def description(self) -> str:
|
|
68
|
-
return "T5-Gemma2 model for
|
|
69
|
+
return "T5-Gemma2 model for splitting text into atomic sentences"
|
|
69
70
|
|
|
70
71
|
@property
|
|
71
72
|
def model_vram_gb(self) -> float:
|
|
@@ -94,16 +95,16 @@ class T5GemmaSplitter(BaseSplitterPlugin):
|
|
|
94
95
|
self,
|
|
95
96
|
text: str,
|
|
96
97
|
context: PipelineContext,
|
|
97
|
-
) -> list[
|
|
98
|
+
) -> list[SplitSentence]:
|
|
98
99
|
"""
|
|
99
|
-
Split text into
|
|
100
|
+
Split text into atomic sentences using T5-Gemma2.
|
|
100
101
|
|
|
101
102
|
Args:
|
|
102
103
|
text: Input text to split
|
|
103
104
|
context: Pipeline context
|
|
104
105
|
|
|
105
106
|
Returns:
|
|
106
|
-
List of
|
|
107
|
+
List of SplitSentence objects
|
|
107
108
|
"""
|
|
108
109
|
logger.debug(f"T5GemmaSplitter processing {len(text)} chars")
|
|
109
110
|
|
|
@@ -129,19 +130,19 @@ class T5GemmaSplitter(BaseSplitterPlugin):
|
|
|
129
130
|
extractor = self._get_extractor()
|
|
130
131
|
xml_output = extractor.extract_as_xml(text, options)
|
|
131
132
|
|
|
132
|
-
# Parse XML to
|
|
133
|
-
|
|
133
|
+
# Parse XML to SplitSentence objects
|
|
134
|
+
sentences = self._parse_xml_to_sentences(xml_output)
|
|
134
135
|
|
|
135
|
-
logger.info(f"T5GemmaSplitter produced {len(
|
|
136
|
-
return
|
|
136
|
+
logger.info(f"T5GemmaSplitter produced {len(sentences)} sentences")
|
|
137
|
+
return sentences
|
|
137
138
|
|
|
138
139
|
def split_batch(
|
|
139
140
|
self,
|
|
140
141
|
texts: list[str],
|
|
141
142
|
context: PipelineContext,
|
|
142
|
-
) -> list[list[
|
|
143
|
+
) -> list[list[SplitSentence]]:
|
|
143
144
|
"""
|
|
144
|
-
Split multiple texts into atomic
|
|
145
|
+
Split multiple texts into atomic sentences using batch processing.
|
|
145
146
|
|
|
146
147
|
Processes all texts through the T5-Gemma2 model in batches
|
|
147
148
|
sized for optimal GPU utilization.
|
|
@@ -151,7 +152,7 @@ class T5GemmaSplitter(BaseSplitterPlugin):
|
|
|
151
152
|
context: Pipeline context
|
|
152
153
|
|
|
153
154
|
Returns:
|
|
154
|
-
List of
|
|
155
|
+
List of SplitSentence lists, one per input text
|
|
155
156
|
"""
|
|
156
157
|
if not texts:
|
|
157
158
|
return []
|
|
@@ -177,7 +178,7 @@ class T5GemmaSplitter(BaseSplitterPlugin):
|
|
|
177
178
|
)
|
|
178
179
|
|
|
179
180
|
extractor = self._get_extractor()
|
|
180
|
-
all_results: list[list[
|
|
181
|
+
all_results: list[list[SplitSentence]] = []
|
|
181
182
|
|
|
182
183
|
# Process in batches
|
|
183
184
|
for i in range(0, len(texts), batch_size):
|
|
@@ -187,8 +188,8 @@ class T5GemmaSplitter(BaseSplitterPlugin):
|
|
|
187
188
|
batch_results = self._process_batch(batch_texts, extractor, options)
|
|
188
189
|
all_results.extend(batch_results)
|
|
189
190
|
|
|
190
|
-
|
|
191
|
-
logger.info(f"T5GemmaSplitter batch produced {
|
|
191
|
+
total_sentences = sum(len(r) for r in all_results)
|
|
192
|
+
logger.info(f"T5GemmaSplitter batch produced {total_sentences} total sentences from {len(texts)} texts")
|
|
192
193
|
return all_results
|
|
193
194
|
|
|
194
195
|
def _process_batch(
|
|
@@ -196,7 +197,7 @@ class T5GemmaSplitter(BaseSplitterPlugin):
|
|
|
196
197
|
texts: list[str],
|
|
197
198
|
extractor,
|
|
198
199
|
options,
|
|
199
|
-
) -> list[list[
|
|
200
|
+
) -> list[list[SplitSentence]]:
|
|
200
201
|
"""
|
|
201
202
|
Process a batch of texts through the model.
|
|
202
203
|
|
|
@@ -249,7 +250,7 @@ class T5GemmaSplitter(BaseSplitterPlugin):
|
|
|
249
250
|
)
|
|
250
251
|
|
|
251
252
|
# Decode and parse each output
|
|
252
|
-
results: list[list[
|
|
253
|
+
results: list[list[SplitSentence]] = []
|
|
253
254
|
end_tag = "</statements>"
|
|
254
255
|
|
|
255
256
|
for output in outputs:
|
|
@@ -260,33 +261,28 @@ class T5GemmaSplitter(BaseSplitterPlugin):
|
|
|
260
261
|
end_pos = decoded.find(end_tag) + len(end_tag)
|
|
261
262
|
decoded = decoded[:end_pos]
|
|
262
263
|
|
|
263
|
-
|
|
264
|
-
results.append(
|
|
264
|
+
sentences = self._parse_xml_to_sentences(decoded)
|
|
265
|
+
results.append(sentences)
|
|
265
266
|
|
|
266
267
|
return results
|
|
267
268
|
|
|
268
269
|
# Regex pattern to extract <text> content from <stmt> blocks
|
|
269
270
|
_STMT_TEXT_PATTERN = re.compile(r'<stmt>.*?<text>(.*?)</text>.*?</stmt>', re.DOTALL)
|
|
270
271
|
|
|
271
|
-
def
|
|
272
|
-
"""Extract
|
|
273
|
-
|
|
272
|
+
def _parse_xml_to_sentences(self, xml_output: str) -> list[SplitSentence]:
|
|
273
|
+
"""Extract atomic sentences from <stmt><text>...</text></stmt> blocks."""
|
|
274
|
+
sentences = []
|
|
274
275
|
|
|
275
276
|
# Find all <text> content within <stmt> blocks
|
|
276
277
|
text_matches = self._STMT_TEXT_PATTERN.findall(xml_output)
|
|
277
278
|
logger.debug(f"Found {len(text_matches)} stmt text blocks via regex")
|
|
278
279
|
|
|
279
|
-
for
|
|
280
|
-
|
|
281
|
-
if
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
object_text="",
|
|
286
|
-
source_sentence=source_text,
|
|
287
|
-
))
|
|
288
|
-
|
|
289
|
-
return raw_triples
|
|
280
|
+
for sentence_text in text_matches:
|
|
281
|
+
sentence_text = sentence_text.strip()
|
|
282
|
+
if sentence_text:
|
|
283
|
+
sentences.append(SplitSentence(text=sentence_text))
|
|
284
|
+
|
|
285
|
+
return sentences
|
|
290
286
|
|
|
291
287
|
|
|
292
288
|
# Allow importing without decorator for testing
|
|
File without changes
|
|
File without changes
|