corp-extractor 0.9.0__py3-none-any.whl → 0.9.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.3.dist-info}/METADATA +40 -9
  2. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.3.dist-info}/RECORD +29 -26
  3. statement_extractor/cli.py +866 -77
  4. statement_extractor/database/hub.py +35 -127
  5. statement_extractor/database/importers/__init__.py +10 -2
  6. statement_extractor/database/importers/companies_house.py +16 -2
  7. statement_extractor/database/importers/companies_house_officers.py +431 -0
  8. statement_extractor/database/importers/gleif.py +23 -0
  9. statement_extractor/database/importers/sec_edgar.py +17 -0
  10. statement_extractor/database/importers/sec_form4.py +512 -0
  11. statement_extractor/database/importers/wikidata.py +151 -43
  12. statement_extractor/database/importers/wikidata_dump.py +1951 -0
  13. statement_extractor/database/importers/wikidata_people.py +823 -325
  14. statement_extractor/database/models.py +30 -6
  15. statement_extractor/database/store.py +1485 -60
  16. statement_extractor/document/deduplicator.py +10 -12
  17. statement_extractor/extractor.py +1 -1
  18. statement_extractor/models/__init__.py +3 -2
  19. statement_extractor/models/statement.py +15 -17
  20. statement_extractor/models.py +1 -1
  21. statement_extractor/pipeline/context.py +5 -5
  22. statement_extractor/pipeline/orchestrator.py +12 -12
  23. statement_extractor/plugins/base.py +17 -17
  24. statement_extractor/plugins/extractors/gliner2.py +28 -28
  25. statement_extractor/plugins/qualifiers/embedding_company.py +7 -5
  26. statement_extractor/plugins/qualifiers/person.py +11 -1
  27. statement_extractor/plugins/splitters/t5_gemma.py +35 -39
  28. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.3.dist-info}/WHEEL +0 -0
  29. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.3.dist-info}/entry_points.txt +0 -0
@@ -1,8 +1,8 @@
1
1
  """
2
- T5GemmaSplitter - Stage 1 plugin that wraps the existing StatementExtractor.
2
+ T5GemmaSplitter - Stage 1 plugin that splits text into atomic sentences.
3
3
 
4
- Uses T5-Gemma2 model with Diverse Beam Search to generate high-quality
5
- subject-predicate-object triples from text.
4
+ Uses T5-Gemma2 model with Diverse Beam Search to split unstructured text
5
+ into atomic statements that can be converted to triples in Stage 2.
6
6
  """
7
7
 
8
8
  import logging
@@ -12,7 +12,7 @@ from typing import Optional
12
12
  from ..base import BaseSplitterPlugin, PluginCapability
13
13
  from ...pipeline.context import PipelineContext
14
14
  from ...pipeline.registry import PluginRegistry
15
- from ...models import RawTriple
15
+ from ...models import SplitSentence
16
16
 
17
17
  logger = logging.getLogger(__name__)
18
18
 
@@ -20,10 +20,11 @@ logger = logging.getLogger(__name__)
20
20
  @PluginRegistry.splitter
21
21
  class T5GemmaSplitter(BaseSplitterPlugin):
22
22
  """
23
- Splitter plugin that uses T5-Gemma2 for triple extraction.
23
+ Splitter plugin that uses T5-Gemma2 to split text into atomic sentences.
24
24
 
25
- Wraps the existing StatementExtractor from extractor.py to produce
26
- RawTriple objects for the pipeline.
25
+ Uses the T5-Gemma2 model to identify and extract atomic statements
26
+ from unstructured text. Each sentence can be converted to a
27
+ subject-predicate-object triple in Stage 2.
27
28
  """
28
29
 
29
30
  def __init__(
@@ -65,7 +66,7 @@ class T5GemmaSplitter(BaseSplitterPlugin):
65
66
 
66
67
  @property
67
68
  def description(self) -> str:
68
- return "T5-Gemma2 model for extracting triples using Diverse Beam Search"
69
+ return "T5-Gemma2 model for splitting text into atomic sentences"
69
70
 
70
71
  @property
71
72
  def model_vram_gb(self) -> float:
@@ -94,16 +95,16 @@ class T5GemmaSplitter(BaseSplitterPlugin):
94
95
  self,
95
96
  text: str,
96
97
  context: PipelineContext,
97
- ) -> list[RawTriple]:
98
+ ) -> list[SplitSentence]:
98
99
  """
99
- Split text into raw triples using T5-Gemma2.
100
+ Split text into atomic sentences using T5-Gemma2.
100
101
 
101
102
  Args:
102
103
  text: Input text to split
103
104
  context: Pipeline context
104
105
 
105
106
  Returns:
106
- List of RawTriple objects
107
+ List of SplitSentence objects
107
108
  """
108
109
  logger.debug(f"T5GemmaSplitter processing {len(text)} chars")
109
110
 
@@ -129,19 +130,19 @@ class T5GemmaSplitter(BaseSplitterPlugin):
129
130
  extractor = self._get_extractor()
130
131
  xml_output = extractor.extract_as_xml(text, options)
131
132
 
132
- # Parse XML to RawTriple objects
133
- raw_triples = self._parse_xml_to_raw_triples(xml_output)
133
+ # Parse XML to SplitSentence objects
134
+ sentences = self._parse_xml_to_sentences(xml_output)
134
135
 
135
- logger.info(f"T5GemmaSplitter produced {len(raw_triples)} raw triples")
136
- return raw_triples
136
+ logger.info(f"T5GemmaSplitter produced {len(sentences)} sentences")
137
+ return sentences
137
138
 
138
139
  def split_batch(
139
140
  self,
140
141
  texts: list[str],
141
142
  context: PipelineContext,
142
- ) -> list[list[RawTriple]]:
143
+ ) -> list[list[SplitSentence]]:
143
144
  """
144
- Split multiple texts into atomic triples using batch processing.
145
+ Split multiple texts into atomic sentences using batch processing.
145
146
 
146
147
  Processes all texts through the T5-Gemma2 model in batches
147
148
  sized for optimal GPU utilization.
@@ -151,7 +152,7 @@ class T5GemmaSplitter(BaseSplitterPlugin):
151
152
  context: Pipeline context
152
153
 
153
154
  Returns:
154
- List of RawTriple lists, one per input text
155
+ List of SplitSentence lists, one per input text
155
156
  """
156
157
  if not texts:
157
158
  return []
@@ -177,7 +178,7 @@ class T5GemmaSplitter(BaseSplitterPlugin):
177
178
  )
178
179
 
179
180
  extractor = self._get_extractor()
180
- all_results: list[list[RawTriple]] = []
181
+ all_results: list[list[SplitSentence]] = []
181
182
 
182
183
  # Process in batches
183
184
  for i in range(0, len(texts), batch_size):
@@ -187,8 +188,8 @@ class T5GemmaSplitter(BaseSplitterPlugin):
187
188
  batch_results = self._process_batch(batch_texts, extractor, options)
188
189
  all_results.extend(batch_results)
189
190
 
190
- total_triples = sum(len(r) for r in all_results)
191
- logger.info(f"T5GemmaSplitter batch produced {total_triples} total triples from {len(texts)} texts")
191
+ total_sentences = sum(len(r) for r in all_results)
192
+ logger.info(f"T5GemmaSplitter batch produced {total_sentences} total sentences from {len(texts)} texts")
192
193
  return all_results
193
194
 
194
195
  def _process_batch(
@@ -196,7 +197,7 @@ class T5GemmaSplitter(BaseSplitterPlugin):
196
197
  texts: list[str],
197
198
  extractor,
198
199
  options,
199
- ) -> list[list[RawTriple]]:
200
+ ) -> list[list[SplitSentence]]:
200
201
  """
201
202
  Process a batch of texts through the model.
202
203
 
@@ -249,7 +250,7 @@ class T5GemmaSplitter(BaseSplitterPlugin):
249
250
  )
250
251
 
251
252
  # Decode and parse each output
252
- results: list[list[RawTriple]] = []
253
+ results: list[list[SplitSentence]] = []
253
254
  end_tag = "</statements>"
254
255
 
255
256
  for output in outputs:
@@ -260,33 +261,28 @@ class T5GemmaSplitter(BaseSplitterPlugin):
260
261
  end_pos = decoded.find(end_tag) + len(end_tag)
261
262
  decoded = decoded[:end_pos]
262
263
 
263
- triples = self._parse_xml_to_raw_triples(decoded)
264
- results.append(triples)
264
+ sentences = self._parse_xml_to_sentences(decoded)
265
+ results.append(sentences)
265
266
 
266
267
  return results
267
268
 
268
269
  # Regex pattern to extract <text> content from <stmt> blocks
269
270
  _STMT_TEXT_PATTERN = re.compile(r'<stmt>.*?<text>(.*?)</text>.*?</stmt>', re.DOTALL)
270
271
 
271
- def _parse_xml_to_raw_triples(self, xml_output: str) -> list[RawTriple]:
272
- """Extract source sentences from <stmt><text>...</text></stmt> blocks."""
273
- raw_triples = []
272
+ def _parse_xml_to_sentences(self, xml_output: str) -> list[SplitSentence]:
273
+ """Extract atomic sentences from <stmt><text>...</text></stmt> blocks."""
274
+ sentences = []
274
275
 
275
276
  # Find all <text> content within <stmt> blocks
276
277
  text_matches = self._STMT_TEXT_PATTERN.findall(xml_output)
277
278
  logger.debug(f"Found {len(text_matches)} stmt text blocks via regex")
278
279
 
279
- for source_text in text_matches:
280
- source_text = source_text.strip()
281
- if source_text:
282
- raw_triples.append(RawTriple(
283
- subject_text="",
284
- predicate_text="",
285
- object_text="",
286
- source_sentence=source_text,
287
- ))
288
-
289
- return raw_triples
280
+ for sentence_text in text_matches:
281
+ sentence_text = sentence_text.strip()
282
+ if sentence_text:
283
+ sentences.append(SplitSentence(text=sentence_text))
284
+
285
+ return sentences
290
286
 
291
287
 
292
288
  # Allow importing without decorator for testing