corp-extractor 0.5.0-py3-none-any.whl → 0.9.3-py3-none-any.whl
This diff compares the contents of two package versions publicly released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/METADATA +228 -30
- corp_extractor-0.9.3.dist-info/RECORD +79 -0
- statement_extractor/__init__.py +1 -1
- statement_extractor/cli.py +2030 -24
- statement_extractor/data/statement_taxonomy.json +6949 -1159
- statement_extractor/database/__init__.py +52 -0
- statement_extractor/database/embeddings.py +186 -0
- statement_extractor/database/hub.py +428 -0
- statement_extractor/database/importers/__init__.py +32 -0
- statement_extractor/database/importers/companies_house.py +559 -0
- statement_extractor/database/importers/companies_house_officers.py +431 -0
- statement_extractor/database/importers/gleif.py +561 -0
- statement_extractor/database/importers/sec_edgar.py +392 -0
- statement_extractor/database/importers/sec_form4.py +512 -0
- statement_extractor/database/importers/wikidata.py +1120 -0
- statement_extractor/database/importers/wikidata_dump.py +1951 -0
- statement_extractor/database/importers/wikidata_people.py +1130 -0
- statement_extractor/database/models.py +254 -0
- statement_extractor/database/resolver.py +245 -0
- statement_extractor/database/store.py +3034 -0
- statement_extractor/document/__init__.py +62 -0
- statement_extractor/document/chunker.py +410 -0
- statement_extractor/document/context.py +171 -0
- statement_extractor/document/deduplicator.py +171 -0
- statement_extractor/document/html_extractor.py +246 -0
- statement_extractor/document/loader.py +303 -0
- statement_extractor/document/pipeline.py +388 -0
- statement_extractor/document/summarizer.py +195 -0
- statement_extractor/extractor.py +1 -1
- statement_extractor/models/__init__.py +19 -3
- statement_extractor/models/canonical.py +44 -1
- statement_extractor/models/document.py +308 -0
- statement_extractor/models/labels.py +47 -18
- statement_extractor/models/qualifiers.py +51 -3
- statement_extractor/models/statement.py +39 -15
- statement_extractor/models.py +1 -1
- statement_extractor/pipeline/config.py +6 -11
- statement_extractor/pipeline/context.py +5 -5
- statement_extractor/pipeline/orchestrator.py +90 -121
- statement_extractor/pipeline/registry.py +52 -46
- statement_extractor/plugins/__init__.py +20 -8
- statement_extractor/plugins/base.py +348 -78
- statement_extractor/plugins/extractors/gliner2.py +38 -28
- statement_extractor/plugins/labelers/taxonomy.py +18 -5
- statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
- statement_extractor/plugins/pdf/__init__.py +10 -0
- statement_extractor/plugins/pdf/pypdf.py +291 -0
- statement_extractor/plugins/qualifiers/__init__.py +11 -0
- statement_extractor/plugins/qualifiers/companies_house.py +14 -3
- statement_extractor/plugins/qualifiers/embedding_company.py +422 -0
- statement_extractor/plugins/qualifiers/gleif.py +14 -3
- statement_extractor/plugins/qualifiers/person.py +588 -14
- statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
- statement_extractor/plugins/scrapers/__init__.py +10 -0
- statement_extractor/plugins/scrapers/http.py +236 -0
- statement_extractor/plugins/splitters/t5_gemma.py +176 -75
- statement_extractor/plugins/taxonomy/embedding.py +193 -46
- statement_extractor/plugins/taxonomy/mnli.py +16 -4
- statement_extractor/scoring.py +8 -8
- corp_extractor-0.5.0.dist-info/RECORD +0 -55
- statement_extractor/plugins/canonicalizers/__init__.py +0 -17
- statement_extractor/plugins/canonicalizers/base.py +0 -9
- statement_extractor/plugins/canonicalizers/location.py +0 -219
- statement_extractor/plugins/canonicalizers/organization.py +0 -230
- statement_extractor/plugins/canonicalizers/person.py +0 -242
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/WHEEL +0 -0
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/entry_points.txt +0 -0
statement_extractor/plugins/taxonomy/embedding.py CHANGED

@@ -11,10 +11,18 @@ import json
 import logging
 import time
 from pathlib import Path
-from typing import Optional
+from typing import Optional, TypedDict

 import numpy as np

+
+class TaxonomyEntry(TypedDict):
+    """Structure for each taxonomy label entry."""
+    description: str
+    id: int
+    mnli_label: str
+    embedding_label: str
+
 from ..base import BaseTaxonomyPlugin, TaxonomySchema, PluginCapability
 from ...pipeline.context import PipelineContext
 from ...pipeline.registry import PluginRegistry
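The `TaxonomyEntry` TypedDict above gives a static shape to the taxonomy JSON entries that both classifiers consume. A minimal sketch of a dict matching the `dict[str, dict[str, TaxonomyEntry]]` annotation used throughout this diff (the category and label below are hypothetical, not taken from the package's taxonomy):

```python
from typing import TypedDict


class TaxonomyEntry(TypedDict):
    """Structure for each taxonomy label entry."""
    description: str
    id: int
    mnli_label: str
    embedding_label: str


# Hypothetical taxonomy mapping category -> label -> entry, mirroring
# the dict[str, dict[str, TaxonomyEntry]] annotation in the diff.
taxonomy: dict[str, dict[str, TaxonomyEntry]] = {
    "environment": {
        "emissions_reduction": TaxonomyEntry(
            description="Cutting greenhouse gas emissions",
            id=101,
            mnli_label="this text is about reducing emissions",
            embedding_label="greenhouse gas emissions reduction",
        ),
    },
}

entry = taxonomy["environment"]["emissions_reduction"]
print(entry["id"])  # 101 -- plain dict at runtime; TypedDict only adds static checks
```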
@@ -96,7 +104,7 @@ class EmbeddingClassifier:

     def precompute_label_embeddings(
         self,
-        taxonomy: dict[str, dict[str,
+        taxonomy: dict[str, dict[str, TaxonomyEntry]],
         categories: Optional[list[str]] = None,
     ) -> None:
         """Pre-compute embeddings for all label names."""
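`precompute_label_embeddings` now takes the typed taxonomy. Conceptually, precomputation encodes each `embedding_label` once and caches unit-normalized vectors so that later classification reduces to dot products. A rough sketch with sentence-transformers (the model name and dict layout are assumptions, not the package's internals):

```python
# Sketch of precomputing normalized label embeddings; the model name
# and taxonomy layout are illustrative assumptions.
import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")  # placeholder model

taxonomy = {
    "environment": {
        "emissions_reduction": {"embedding_label": "greenhouse gas emissions reduction"},
    },
}

label_embeddings: dict[str, dict[str, np.ndarray]] = {}
for category, labels in taxonomy.items():
    names = [entry["embedding_label"] for entry in labels.values()]
    vecs = model.encode(names, convert_to_numpy=True, show_progress_bar=False)
    # Unit-normalize so cosine similarity becomes a plain dot product
    vecs = vecs / (np.linalg.norm(vecs, axis=1, keepdims=True) + 1e-8)
    label_embeddings[category] = dict(zip(labels.keys(), vecs))
```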
@@ -137,68 +145,127 @@ class EmbeddingClassifier:
         exponent = -self.CALIBRATION_STEEPNESS * (normalized - self.SIMILARITY_THRESHOLD)
         return 1.0 / (1.0 + np.exp(exponent))

-    def classify_hierarchical(
+    def encode_batch(self, texts: list[str]) -> np.ndarray:
+        """
+        Encode multiple texts into normalized embeddings in a single batch.
+
+        Uses caching to avoid re-encoding previously seen texts.
+
+        Args:
+            texts: List of texts to encode
+
+        Returns:
+            2D numpy array of shape (len(texts), embedding_dim) with normalized embeddings
+        """
+        self._load_model()
+
+        # Separate cached from uncached texts
+        uncached_indices = []
+        uncached_texts = []
+        for i, text in enumerate(texts):
+            if text not in self._text_embedding_cache:
+                uncached_indices.append(i)
+                uncached_texts.append(text)
+
+        # Batch encode uncached texts
+        if uncached_texts:
+            embeddings = self._model.encode(uncached_texts, convert_to_numpy=True, show_progress_bar=False)
+            for i, (text, embedding) in enumerate(zip(uncached_texts, embeddings)):
+                norm = np.linalg.norm(embedding)
+                normalized = (embedding / (norm + 1e-8)).astype(np.float32)
+                self._text_embedding_cache[text] = normalized
+
+            logger.debug(f"Batch encoded {len(uncached_texts)} texts (cache size: {len(self._text_embedding_cache)})")
+
+        # Build result array from cache
+        result = np.stack([self._text_embedding_cache[text] for text in texts])
+        return result
+
+    def classify_batch(
         self,
-        text: str,
+        texts: list[str],
         top_k_categories: int = 3,
         min_score: float = 0.3,
-    ) -> list[tuple[str, str, float]]:
-        """
-        Hierarchical classification: find categories, then all labels above threshold.
-        Returns all labels above the threshold, not just the best match.
+    ) -> list[list[tuple[str, str, float]]]:
+        """
+        Classify multiple texts in a single batch for efficiency.

         Args:
-            text: Text to classify
-            top_k_categories: Number of top categories to consider
+            texts: List of texts to classify
+            top_k_categories: Number of top categories to consider per text
             min_score: Minimum calibrated score to include in results

         Returns:
-            List of (category, label, confidence) tuples above threshold
+            List of classification results, one list per input text
         """
+        if not texts:
+            return []
+
         self._load_model()

         if not self._label_embeddings:
             raise RuntimeError("Label embeddings not pre-computed.")

-        #
-        if text in self._text_embedding_cache:
-            input_normalized = self._text_embedding_cache[text]
-        else:
-            input_embedding = self._model.encode(text, convert_to_numpy=True, show_progress_bar=False)
-            input_norm = np.linalg.norm(input_embedding)
-            input_normalized = (input_embedding / (input_norm + 1e-8)).astype(np.float32)
-            self._text_embedding_cache[text] = input_normalized
-            logger.debug(f"Cached embedding for text: '{text[:50]}...' (cache size: {len(self._text_embedding_cache)})")
-
-        # Compute average similarity to each category
-        category_scores: list[tuple[str, float]] = []
-        for category, labels in self._label_embeddings.items():
-            if not labels:
-                continue
+        # Batch encode all texts
+        input_embeddings = self.encode_batch(texts)

-            sims = []
-            for label_embedding in labels.values():
-                sim = float(np.dot(input_normalized, label_embedding))
-                sims.append(sim)
+        # Prepare label embeddings as matrices for vectorized similarity
+        all_results: list[list[tuple[str, str, float]]] = []

-            avg_sim = np.mean(sims)
-            category_scores.append((category, avg_sim))
+        for input_normalized in input_embeddings:
+            # Compute average similarity to each category
+            category_scores: list[tuple[str, float]] = []
+            for category, labels in self._label_embeddings.items():
+                if not labels:
+                    continue

-        category_scores.sort(key=lambda x: x[1], reverse=True)
+                sims = []
+                for label_embedding in labels.values():
+                    sim = float(np.dot(input_normalized, label_embedding))
+                    sims.append(sim)

-        results: list[tuple[str, str, float]] = []
+                avg_sim = np.mean(sims)
+                category_scores.append((category, avg_sim))

-        for category, _ in category_scores[:top_k_categories]:
-            for label, label_embedding in self._label_embeddings[category].items():
-                raw_sim = float(np.dot(input_normalized, label_embedding))
-                calibrated_score = self._calibrate_score(raw_sim)
+            category_scores.sort(key=lambda x: x[1], reverse=True)

-                if calibrated_score >= min_score:
-                    results.append((category, label, calibrated_score))
+            results: list[tuple[str, str, float]] = []

-        # Sort by confidence descending
-        results.sort(key=lambda x: x[2], reverse=True)
-        return results
+            for category, _ in category_scores[:top_k_categories]:
+                for label, label_embedding in self._label_embeddings[category].items():
+                    raw_sim = float(np.dot(input_normalized, label_embedding))
+                    calibrated_score = self._calibrate_score(raw_sim)
+
+                    if calibrated_score >= min_score:
+                        results.append((category, label, calibrated_score))
+
+            # Sort by confidence descending
+            results.sort(key=lambda x: x[2], reverse=True)
+            all_results.append(results)
+
+        return all_results
+
+    def classify_hierarchical(
+        self,
+        text: str,
+        top_k_categories: int = 3,
+        min_score: float = 0.3,
+    ) -> list[tuple[str, str, float]]:
+        """Hierarchical classification: find categories, then all labels above threshold.
+
+        Returns all labels above the threshold, not just the best match.
+
+        Args:
+            text: Text to classify
+            top_k_categories: Number of top categories to consider
+            min_score: Minimum calibrated score to include in results
+
+        Returns:
+            List of (category, label, confidence) tuples above threshold
+        """
+        # Use batch method for single text
+        results = self.classify_batch([text], top_k_categories, min_score)
+        return results[0] if results else []


 @PluginRegistry.taxonomy
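Both the old single-text path and the new `classify_batch` funnel raw cosine similarities through `_calibrate_score`, the logistic curve visible at the top of this hunk. A standalone sketch of that calibration, with placeholder constants since the class's actual `CALIBRATION_STEEPNESS` and `SIMILARITY_THRESHOLD` values are not shown in the diff:

```python
import numpy as np

# Illustrative placeholder constants, not the package's actual values.
CALIBRATION_STEEPNESS = 10.0
SIMILARITY_THRESHOLD = 0.5


def calibrate_score(raw_sim: float) -> float:
    """Map a raw cosine similarity to (0, 1) with a logistic curve.

    Similarities at the threshold map to 0.5; the steepness controls how
    sharply scores saturate toward 0 or 1 on either side of it.
    """
    exponent = -CALIBRATION_STEEPNESS * (raw_sim - SIMILARITY_THRESHOLD)
    return 1.0 / (1.0 + np.exp(exponent))


print(calibrate_score(0.50))  # 0.5 exactly at the threshold
print(calibrate_score(0.72))  # well above the threshold -> about 0.9
```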
@@ -223,7 +290,7 @@ class EmbeddingTaxonomyClassifier(BaseTaxonomyPlugin):
         self._top_k_categories = top_k_categories
         self._min_confidence = min_confidence

-        self._taxonomy: Optional[dict[str, dict[str,
+        self._taxonomy: Optional[dict[str, dict[str, TaxonomyEntry]]] = None
         self._classifier: Optional[EmbeddingClassifier] = None
         self._embeddings_computed = False
@@ -243,6 +310,16 @@ class EmbeddingTaxonomyClassifier(BaseTaxonomyPlugin):
     def description(self) -> str:
         return "Classifies statements using embedding similarity (faster than MNLI)"

+    @property
+    def model_vram_gb(self) -> float:
+        """EmbeddingGemma model weights ~1.2GB."""
+        return 1.2
+
+    @property
+    def per_item_vram_gb(self) -> float:
+        """Each text embedding ~0.05GB (embeddings are small)."""
+        return 0.05
+
     @property
     def taxonomy_name(self) -> str:
         return "esg_topics_embedding"
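The two new properties advertise a fixed model cost and a marginal per-item cost, presumably so the pipeline can budget GPU memory when sizing batches. A sketch of that kind of budgeting; the `VRAMAware` protocol and `max_batch_size` helper are illustrative assumptions, not the package's actual API:

```python
from typing import Protocol


class VRAMAware(Protocol):
    """Hypothetical protocol matching the two properties added above."""
    @property
    def model_vram_gb(self) -> float: ...
    @property
    def per_item_vram_gb(self) -> float: ...


def max_batch_size(plugin: VRAMAware, free_vram_gb: float, cap: int = 256) -> int:
    """Budget: model weights are a fixed cost; each item adds a marginal cost."""
    headroom = free_vram_gb - plugin.model_vram_gb
    if headroom <= 0:
        return 1  # not enough room for the weights plus a batch; go one at a time
    return max(1, min(cap, int(headroom / plugin.per_item_vram_gb)))
```

With the values in this diff (1.2 GB fixed, 0.05 GB per item), an 8 GB budget would allow a batch of 136 under this scheme.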
@@ -262,7 +339,7 @@ class EmbeddingTaxonomyClassifier(BaseTaxonomyPlugin):
     def supported_categories(self) -> list[str]:
         return self._categories.copy()

-    def _load_taxonomy(self) -> dict[str, dict[str,
+    def _load_taxonomy(self) -> dict[str, dict[str, TaxonomyEntry]]:
         if self._taxonomy is not None:
             return self._taxonomy
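`_load_taxonomy` follows a parse-once, cache-thereafter pattern. A sketch of that pattern in isolation, assuming the taxonomy ships as the `statement_extractor/data/statement_taxonomy.json` file listed above (the module-level cache here is illustrative; the real method caches on `self._taxonomy`):

```python
import json
from pathlib import Path
from typing import Optional

# Path taken from the package's file list; the JSON layout is assumed
# to be {category: {label: entry}} as the type annotations suggest.
_TAXONOMY_PATH = Path("statement_extractor/data/statement_taxonomy.json")
_taxonomy_cache: Optional[dict] = None


def load_taxonomy() -> dict:
    global _taxonomy_cache
    if _taxonomy_cache is not None:
        return _taxonomy_cache  # parse the JSON once, reuse thereafter
    with _TAXONOMY_PATH.open(encoding="utf-8") as f:
        _taxonomy_cache = json.load(f)
    return _taxonomy_cache
```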
@@ -329,9 +406,79 @@ class EmbeddingTaxonomyClassifier(BaseTaxonomyPlugin):
     def _get_label_id(self, category: str, label: str) -> Optional[int]:
         taxonomy = self._load_taxonomy()
         if category in taxonomy:
-
+            entry = taxonomy[category].get(label)
+            if entry:
+                return entry.get("id")
         return None

+    def classify_batch(
+        self,
+        items: list[tuple[PipelineStatement, CanonicalEntity, CanonicalEntity]],
+        context: PipelineContext,
+    ) -> list[list[TaxonomyResult]]:
+        """
+        Classify multiple statements in a single batch for efficiency.
+
+        Batch encodes all source texts, then classifies each against the taxonomy.
+
+        Args:
+            items: List of (statement, subject_canonical, object_canonical) tuples
+            context: Pipeline context
+
+        Returns:
+            List of TaxonomyResult lists, one per input statement
+        """
+        if not items:
+            return []
+
+        # Extract unique source texts (may have duplicates across statements)
+        texts = [stmt.source_text for stmt, _, _ in items]
+        unique_texts = list(set(texts))
+
+        logger.info(f"Batch classifying {len(items)} statements ({len(unique_texts)} unique texts)")
+
+        try:
+            classifier = self._get_classifier()
+
+            # Batch classify all unique texts
+            batch_results = classifier.classify_batch(
+                unique_texts,
+                top_k_categories=self._top_k_categories,
+                min_score=self._min_confidence,
+            )
+
+            # Map unique texts to their classifications
+            text_to_results: dict[str, list[tuple[str, str, float]]] = {
+                text: results for text, results in zip(unique_texts, batch_results)
+            }
+
+            # Build results for each input statement
+            all_results: list[list[TaxonomyResult]] = []
+            for stmt, _, _ in items:
+                classifications = text_to_results.get(stmt.source_text, [])
+
+                results: list[TaxonomyResult] = []
+                for category, label, confidence in classifications:
+                    label_id = self._get_label_id(category, label)
+
+                    results.append(TaxonomyResult(
+                        taxonomy_name=self.taxonomy_name,
+                        category=category,
+                        label=label,
+                        label_id=label_id,
+                        confidence=round(confidence, 4),
+                        classifier=self.name,
+                    ))
+
+                all_results.append(results)
+
+            return all_results
+
+        except Exception as e:
+            logger.warning(f"Batch taxonomy classification failed: {e}")
+            # Return empty results for all items
+            return [[] for _ in items]
+

 # For testing without decorator
 EmbeddingTaxonomyClassifierClass = EmbeddingTaxonomyClassifier
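The plugin-level `classify_batch` above classifies each distinct `source_text` once and fans the results back out to every statement that shares it. That pattern is easy to isolate; a minimal sketch with a toy classifier standing in for `EmbeddingClassifier` (the names here are hypothetical):

```python
from typing import Callable


def classify_unique(
    texts: list[str],
    classify: Callable[[list[str]], list[list[tuple[str, str, float]]]],
) -> list[list[tuple[str, str, float]]]:
    """Classify each distinct text once, then fan results back out by text."""
    unique_texts = list(set(texts))
    by_text = dict(zip(unique_texts, classify(unique_texts)))
    return [by_text.get(t, []) for t in texts]


def toy_classifier(batch: list[str]) -> list[list[tuple[str, str, float]]]:
    # Stand-in: every text gets one fixed (category, label, confidence) result.
    return [[("category", "label", 0.9)] for _ in batch]


print(classify_unique(["a", "b", "a"], toy_classifier))
# The result for "a" is computed once but appears twice in the output.
```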
statement_extractor/plugins/taxonomy/mnli.py CHANGED

@@ -8,9 +8,19 @@ where there are too many possible values for simple multi-choice classification.
 import json
 import logging
 from pathlib import Path
-from typing import Optional
+from typing import Optional, TypedDict

 from ..base import BaseTaxonomyPlugin, TaxonomySchema, PluginCapability
+
+
+class TaxonomyEntry(TypedDict):
+    """Structure for each taxonomy label entry."""
+    description: str
+    id: int
+    mnli_label: str
+    embedding_label: str
+
+
 from ...pipeline.context import PipelineContext
 from ...pipeline.registry import PluginRegistry
 from ...models import (

@@ -160,7 +170,7 @@ class MNLITaxonomyClassifier(BaseTaxonomyPlugin):
         self._top_k_categories = top_k_categories
         self._min_confidence = min_confidence

-        self._taxonomy: Optional[dict[str, dict[str,
+        self._taxonomy: Optional[dict[str, dict[str, TaxonomyEntry]]] = None
         self._classifier: Optional[MNLIClassifier] = None

     @property

@@ -198,7 +208,7 @@ class MNLITaxonomyClassifier(BaseTaxonomyPlugin):
     def supported_categories(self) -> list[str]:
         return self._categories.copy()

-    def _load_taxonomy(self) -> dict[str, dict[str,
+    def _load_taxonomy(self) -> dict[str, dict[str, TaxonomyEntry]]:
         """Load taxonomy from JSON file."""
         if self._taxonomy is not None:
             return self._taxonomy

@@ -271,7 +281,9 @@ class MNLITaxonomyClassifier(BaseTaxonomyPlugin):
     def _get_label_id(self, category: str, label: str) -> Optional[int]:
         taxonomy = self._load_taxonomy()
         if category in taxonomy:
-
+            entry = taxonomy[category].get(label)
+            if entry:
+                return entry.get("id")
         return None

statement_extractor/scoring.py CHANGED

@@ -409,18 +409,18 @@ class BeamScorer:
         filtered = [s for s in all_statements if (s.confidence_score or 0) >= min_conf]
         logger.debug(f"  After confidence filter (>={min_conf}): {len(filtered)} statements")

-        #
-        #
-
-
-
-
-
+        # Filter out statements where source_text doesn't support the predicate
+        # This catches model hallucinations where predicate doesn't match the evidence
+        consistent = [
+            s for s in filtered
+            if self._source_text_supports_predicate(s)
+        ]
+        logger.debug(f"  After predicate consistency filter: {len(consistent)} statements")

         # Deduplicate - keep highest confidence for each (subject, predicate, object)
         # Note: Same subject+predicate with different objects is valid (e.g., "Apple announced X and Y")
         seen: dict[tuple[str, str, str], Statement] = {}
-        for stmt in
+        for stmt in consistent:
             key = (
                 stmt.subject.text.lower(),
                 stmt.predicate.lower(),
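The deduplication step keeps the highest-confidence statement per (subject, predicate, object) key, per the comment in the hunk; the diff only shows the key being built, so the sketch below fills in the keep-highest rule it describes, with toy stand-ins for the package's `Statement` model:

```python
from dataclasses import dataclass


# Toy stand-ins for the package's models, just to make the pattern runnable;
# the real Statement presumably names its object field differently.
@dataclass
class Span:
    text: str


@dataclass
class Statement:
    subject: Span
    predicate: str
    obj: Span
    confidence_score: float


def dedupe(statements: list[Statement]) -> list[Statement]:
    """Keep the highest-confidence statement per (subject, predicate, object)."""
    seen: dict[tuple[str, str, str], Statement] = {}
    for stmt in statements:
        key = (stmt.subject.text.lower(), stmt.predicate.lower(), stmt.obj.text.lower())
        if key not in seen or stmt.confidence_score > seen[key].confidence_score:
            seen[key] = stmt
    return list(seen.values())
```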
corp_extractor-0.5.0.dist-info/RECORD DELETED

@@ -1,55 +0,0 @@
-statement_extractor/__init__.py,sha256=Lmgw3jtwrfu09mXSfNFCB5AN0J6tsEQ2uOrrQciMrtI,3215
-statement_extractor/canonicalization.py,sha256=ZMLs6RLWJa_rOJ8XZ7PoHFU13-zeJkOMDnvK-ZaFa5s,5991
-statement_extractor/cli.py,sha256=iqsqvLAN0FMRoE4KskEoW-4DE5_7Tll8xeHA1t04KJg,25028
-statement_extractor/extractor.py,sha256=CGJCmAMiIoDsPtjIdvOHYBcz8058eYpfLMngjELMJhI,38403
-statement_extractor/gliner_extraction.py,sha256=OL4w-0_rZc6XCojaVsbGY4VdIXRJ6j8ZmeUeTOL0Ue0,8118
-statement_extractor/llm.py,sha256=1eBrYs-bUPqzbpiiif_hH-gE_DeM-l3cmddrIoLHFXo,8010
-statement_extractor/models.py,sha256=fXTT7qxPqynnrrpb77nCgs3K2yn_YgbSugSXv12boX4,12312
-statement_extractor/predicate_comparer.py,sha256=jcuaBi5BYqD3TKoyj3pR9dxtX5ihfDJvjdhEd2LHCwc,26184
-statement_extractor/scoring.py,sha256=s_8nhavBNzPPFmGf2FyBummH4tgP7YGpXoMhl2Jh3Xw,16650
-statement_extractor/data/default_predicates.json,sha256=7rhFXWHvStDj4sLYfBXKS50xGChPaqMKUaKTkMEJRGk,32870
-statement_extractor/data/statement_taxonomy.json,sha256=XhCeVBC4aQB-7NR40Niu4yN2BmL0c2Gd-RKkUpsYK24,37981
-statement_extractor/models/__init__.py,sha256=gjTu450FPe9dvhIVQXqBwF8u0hgSnPORGXzxmSEuCnM,2564
-statement_extractor/models/canonical.py,sha256=ld6z6RtK03iOs_aUk8Rftcm0pUoaFpLUfyfbKI26N_o,4354
-statement_extractor/models/entity.py,sha256=l2ny91BnnWwPo9zx1_Fb8WMKPNuIQFN0H7ILncylmcY,3214
-statement_extractor/models/labels.py,sha256=e-mFDuzb42oJ69gLZTWCdg5_MNqRftQ2La5x8y9Cv-Y,6236
-statement_extractor/models/qualifiers.py,sha256=YkvyWh2p1fK5iMRDC2Dq1r-XJOmJ1rvWFTFUIkQ9zcc,3495
-statement_extractor/models/statement.py,sha256=cOgabA7IJxHYjlH5AksJRNf2Rv5VScMPqZdfjQyXRN0,2733
-statement_extractor/pipeline/__init__.py,sha256=Q3M2Arx9BWH_APZxM-P0G-C3ISguG1whiA5QhxDHQCA,1071
-statement_extractor/pipeline/config.py,sha256=rxZN27OWp05F-NaatwrYkjp56zbzHZ0hMtNU1mvBxgw,4130
-statement_extractor/pipeline/context.py,sha256=wURDYtzDrmbHu40Af_C_oTtN55wnULKHNZjUx6O8t-0,6126
-statement_extractor/pipeline/orchestrator.py,sha256=oHegnsDzXj87q8iAoi-QZj2ZyB1rX5qmg57BdIjvKo0,17617
-statement_extractor/pipeline/registry.py,sha256=qj5M5tMm9GmNCguy8dWBXMT8XmhemiZjJMktZsRlevw,11415
-statement_extractor/plugins/__init__.py,sha256=8k3lQGQNQSMUzxCmk4nAH8dIc1DqEnMyiqHlZZv81q0,1099
-statement_extractor/plugins/base.py,sha256=GZ4WT5S2mH3C_uN6nyBz-nGlAn_Z2o2A51FSRu6gCEo,12797
-statement_extractor/plugins/canonicalizers/__init__.py,sha256=LDb9NodyuLSoLzrLnNzMeviK79GHnyaLGU0J_02BBgM,421
-statement_extractor/plugins/canonicalizers/base.py,sha256=dbreQuEPB48eBJmah7hpl67azVU4QLhbvSrjXr0vT88,195
-statement_extractor/plugins/canonicalizers/location.py,sha256=Rz5SCM4bb0p0gsnHPzsQJv-RN59yoj9Z1NmF8yLQNv0,6590
-statement_extractor/plugins/canonicalizers/organization.py,sha256=L-mhdctkRXuu84RsNHp80M_tDIiMumYaHAG6WfxpH4c,7482
-statement_extractor/plugins/canonicalizers/person.py,sha256=Nw8FuJOBmg-cTaOTd2BJ1TZtydprfzIKL25wJa_VJek,6944
-statement_extractor/plugins/extractors/__init__.py,sha256=sqxTI7WwDLVQKwOiQXqWS72gjJnwb76Gs9N3LGetBnI,253
-statement_extractor/plugins/extractors/base.py,sha256=kNRsQ7BL84lXPXREm7CihrprDUaFwDDvMpBcbZlwSGA,179
-statement_extractor/plugins/extractors/gliner2.py,sha256=rgfY8l9v8EWCxfB3g6hLnmLCIekTBkfWMG8dgSAZu-E,21627
-statement_extractor/plugins/labelers/__init__.py,sha256=flHEoBvnzQ3vAKkIUHyezpYi2H3KJvYGRerCVnc80r0,965
-statement_extractor/plugins/labelers/base.py,sha256=hIgJKq2LU00OcL0Zjy1L9hP8K2onlM_xtZ63XcH8qDE,171
-statement_extractor/plugins/labelers/confidence.py,sha256=XiXjBYe-8ch_SCKnz0sAwTT1mJ_XKMsuzXBbwAW_OK0,4083
-statement_extractor/plugins/labelers/relation_type.py,sha256=e5ASwVqJGMSCrx5GtyNk85q_-19D7W_4jI-J-Pv_kxY,2506
-statement_extractor/plugins/labelers/sentiment.py,sha256=nlWv9ymb7hlDIcFa-gjbIvZlJY1VrHrXhKMD-udmIzM,5027
-statement_extractor/plugins/labelers/taxonomy.py,sha256=jQp5emgWf6XgmOx7arh-owF_-TjVxiPKSJ2OGkTPbBs,12427
-statement_extractor/plugins/labelers/taxonomy_embedding.py,sha256=grvC_R_sg05hR6l0DgaELy2wmf6OkbvV1pRuNU0FVk4,16027
-statement_extractor/plugins/qualifiers/__init__.py,sha256=kefjGunlVDKLy2NXmtr5ZXyYi-swyQdPLkB-tHV_0vk,495
-statement_extractor/plugins/qualifiers/base.py,sha256=Kx--OdIh77mnjSkTl1NvUeekItRiG8AnBUcuznOZeBI,179
-statement_extractor/plugins/qualifiers/companies_house.py,sha256=_6ExJCjD0V4eZNYXtfBY99obqLpRaSv-G-V7N6R1wLg,5376
-statement_extractor/plugins/qualifiers/gleif.py,sha256=WZqcNT_Yq4yVe4rdkWO59C9yZ4geV2ZTDk9wxLlOeTg,5645
-statement_extractor/plugins/qualifiers/person.py,sha256=si_9CLjHsH9jYFugej4t0HMnsivclh-Yi70U6NglfIU,7101
-statement_extractor/plugins/qualifiers/sec_edgar.py,sha256=3XDbizlR9YQgLrC7p-owV8Td-3TYaJlMb4B7saha3vw,6288
-statement_extractor/plugins/splitters/__init__.py,sha256=05CYeAEO0lZsapK5pjxZJbOCLI1kjeK6IQjftxqqg5g,224
-statement_extractor/plugins/splitters/base.py,sha256=GeIBchFTr8icRSfYR8bGSb4-GoEZ1N0IGN6Kl5W2mL0,175
-statement_extractor/plugins/splitters/t5_gemma.py,sha256=8joOzlMKXhSyJaq5c3F8t-gdPcZEDiVAzNcMlgJAqsE,6733
-statement_extractor/plugins/taxonomy/__init__.py,sha256=8N0tW7pm95DSCqM-s99ea0Tigbi9bZMyTkKblR1qmLw,307
-statement_extractor/plugins/taxonomy/embedding.py,sha256=QW1RR07JoE8Ah97gDZ_w_ATEe6-z2t2nl1zeTDAgFjM,11347
-statement_extractor/plugins/taxonomy/mnli.py,sha256=IzLjHXUFgVAgEvYI5EzOBs19UxvpcbJa8HjqI__tYII,8905
-corp_extractor-0.5.0.dist-info/METADATA,sha256=H4Z8ExZFdbknpHg-EZ1P9B137hCPwKXBezHSF7X9EOE,21567
-corp_extractor-0.5.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-corp_extractor-0.5.0.dist-info/entry_points.txt,sha256=i0iKFqPIusvb-QTQ1zNnFgAqatgVah-jIhahbs5TToQ,115
-corp_extractor-0.5.0.dist-info/RECORD,,
statement_extractor/plugins/canonicalizers/__init__.py DELETED

@@ -1,17 +0,0 @@
-"""
-Canonicalizer plugins for Stage 4 (Canonicalization).
-
-Resolves entities to their canonical forms.
-"""
-
-from .base import BaseCanonicalizerPlugin
-from .organization import OrganizationCanonicalizer
-from .person import PersonCanonicalizer
-from .location import LocationCanonicalizer
-
-__all__ = [
-    "BaseCanonicalizerPlugin",
-    "OrganizationCanonicalizer",
-    "PersonCanonicalizer",
-    "LocationCanonicalizer",
-]