corp-extractor 0.5.0__py3-none-any.whl → 0.9.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/METADATA +228 -30
  2. corp_extractor-0.9.3.dist-info/RECORD +79 -0
  3. statement_extractor/__init__.py +1 -1
  4. statement_extractor/cli.py +2030 -24
  5. statement_extractor/data/statement_taxonomy.json +6949 -1159
  6. statement_extractor/database/__init__.py +52 -0
  7. statement_extractor/database/embeddings.py +186 -0
  8. statement_extractor/database/hub.py +428 -0
  9. statement_extractor/database/importers/__init__.py +32 -0
  10. statement_extractor/database/importers/companies_house.py +559 -0
  11. statement_extractor/database/importers/companies_house_officers.py +431 -0
  12. statement_extractor/database/importers/gleif.py +561 -0
  13. statement_extractor/database/importers/sec_edgar.py +392 -0
  14. statement_extractor/database/importers/sec_form4.py +512 -0
  15. statement_extractor/database/importers/wikidata.py +1120 -0
  16. statement_extractor/database/importers/wikidata_dump.py +1951 -0
  17. statement_extractor/database/importers/wikidata_people.py +1130 -0
  18. statement_extractor/database/models.py +254 -0
  19. statement_extractor/database/resolver.py +245 -0
  20. statement_extractor/database/store.py +3034 -0
  21. statement_extractor/document/__init__.py +62 -0
  22. statement_extractor/document/chunker.py +410 -0
  23. statement_extractor/document/context.py +171 -0
  24. statement_extractor/document/deduplicator.py +171 -0
  25. statement_extractor/document/html_extractor.py +246 -0
  26. statement_extractor/document/loader.py +303 -0
  27. statement_extractor/document/pipeline.py +388 -0
  28. statement_extractor/document/summarizer.py +195 -0
  29. statement_extractor/extractor.py +1 -1
  30. statement_extractor/models/__init__.py +19 -3
  31. statement_extractor/models/canonical.py +44 -1
  32. statement_extractor/models/document.py +308 -0
  33. statement_extractor/models/labels.py +47 -18
  34. statement_extractor/models/qualifiers.py +51 -3
  35. statement_extractor/models/statement.py +39 -15
  36. statement_extractor/models.py +1 -1
  37. statement_extractor/pipeline/config.py +6 -11
  38. statement_extractor/pipeline/context.py +5 -5
  39. statement_extractor/pipeline/orchestrator.py +90 -121
  40. statement_extractor/pipeline/registry.py +52 -46
  41. statement_extractor/plugins/__init__.py +20 -8
  42. statement_extractor/plugins/base.py +348 -78
  43. statement_extractor/plugins/extractors/gliner2.py +38 -28
  44. statement_extractor/plugins/labelers/taxonomy.py +18 -5
  45. statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
  46. statement_extractor/plugins/pdf/__init__.py +10 -0
  47. statement_extractor/plugins/pdf/pypdf.py +291 -0
  48. statement_extractor/plugins/qualifiers/__init__.py +11 -0
  49. statement_extractor/plugins/qualifiers/companies_house.py +14 -3
  50. statement_extractor/plugins/qualifiers/embedding_company.py +422 -0
  51. statement_extractor/plugins/qualifiers/gleif.py +14 -3
  52. statement_extractor/plugins/qualifiers/person.py +588 -14
  53. statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
  54. statement_extractor/plugins/scrapers/__init__.py +10 -0
  55. statement_extractor/plugins/scrapers/http.py +236 -0
  56. statement_extractor/plugins/splitters/t5_gemma.py +176 -75
  57. statement_extractor/plugins/taxonomy/embedding.py +193 -46
  58. statement_extractor/plugins/taxonomy/mnli.py +16 -4
  59. statement_extractor/scoring.py +8 -8
  60. corp_extractor-0.5.0.dist-info/RECORD +0 -55
  61. statement_extractor/plugins/canonicalizers/__init__.py +0 -17
  62. statement_extractor/plugins/canonicalizers/base.py +0 -9
  63. statement_extractor/plugins/canonicalizers/location.py +0 -219
  64. statement_extractor/plugins/canonicalizers/organization.py +0 -230
  65. statement_extractor/plugins/canonicalizers/person.py +0 -242
  66. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/WHEEL +0 -0
  67. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/entry_points.txt +0 -0
@@ -11,10 +11,18 @@ import json
  import logging
  import time
  from pathlib import Path
- from typing import Optional
+ from typing import Optional, TypedDict

  import numpy as np

+
+ class TaxonomyEntry(TypedDict):
+     """Structure for each taxonomy label entry."""
+     description: str
+     id: int
+     mnli_label: str
+     embedding_label: str
+
  from ..base import BaseTaxonomyPlugin, TaxonomySchema, PluginCapability
  from ...pipeline.context import PipelineContext
  from ...pipeline.registry import PluginRegistry
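For context, the TaxonomyEntry change means each label in statement_taxonomy.json now maps to a structured entry rather than a bare integer id. A minimal sketch of the expected shape, with an invented category, label, and field values for illustration only:

taxonomy: dict[str, dict[str, TaxonomyEntry]] = {
    "Environment": {                      # category
        "Emissions reduction": {          # label
            "description": "Commitments to cut greenhouse gas emissions",
            "id": 101,
            "mnli_label": "This statement is about reducing emissions",
            "embedding_label": "emissions reduction commitment",
        },
    },
}
# The old format mapped labels straight to ids, e.g. {"Environment": {"Emissions reduction": 101}}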
@@ -96,7 +104,7 @@ class EmbeddingClassifier:

      def precompute_label_embeddings(
          self,
-         taxonomy: dict[str, dict[str, int]],
+         taxonomy: dict[str, dict[str, TaxonomyEntry]],
          categories: Optional[list[str]] = None,
      ) -> None:
          """Pre-compute embeddings for all label names."""
@@ -137,68 +145,127 @@ class EmbeddingClassifier:
          exponent = -self.CALIBRATION_STEEPNESS * (normalized - self.SIMILARITY_THRESHOLD)
          return 1.0 / (1.0 + np.exp(exponent))

-     def classify_hierarchical(
+     def encode_batch(self, texts: list[str]) -> np.ndarray:
+         """
+         Encode multiple texts into normalized embeddings in a single batch.
+
+         Uses caching to avoid re-encoding previously seen texts.
+
+         Args:
+             texts: List of texts to encode
+
+         Returns:
+             2D numpy array of shape (len(texts), embedding_dim) with normalized embeddings
+         """
+         self._load_model()
+
+         # Separate cached from uncached texts
+         uncached_indices = []
+         uncached_texts = []
+         for i, text in enumerate(texts):
+             if text not in self._text_embedding_cache:
+                 uncached_indices.append(i)
+                 uncached_texts.append(text)
+
+         # Batch encode uncached texts
+         if uncached_texts:
+             embeddings = self._model.encode(uncached_texts, convert_to_numpy=True, show_progress_bar=False)
+             for i, (text, embedding) in enumerate(zip(uncached_texts, embeddings)):
+                 norm = np.linalg.norm(embedding)
+                 normalized = (embedding / (norm + 1e-8)).astype(np.float32)
+                 self._text_embedding_cache[text] = normalized
+
+             logger.debug(f"Batch encoded {len(uncached_texts)} texts (cache size: {len(self._text_embedding_cache)})")
+
+         # Build result array from cache
+         result = np.stack([self._text_embedding_cache[text] for text in texts])
+         return result
+
+     def classify_batch(
          self,
-         text: str,
+         texts: list[str],
          top_k_categories: int = 3,
          min_score: float = 0.3,
-     ) -> list[tuple[str, str, float]]:
-         """Hierarchical classification: find categories, then all labels above threshold.
-
-         Returns all labels above the threshold, not just the best match.
+     ) -> list[list[tuple[str, str, float]]]:
+         """
+         Classify multiple texts in a single batch for efficiency.

          Args:
-             text: Text to classify
-             top_k_categories: Number of top categories to consider
+             texts: List of texts to classify
+             top_k_categories: Number of top categories to consider per text
              min_score: Minimum calibrated score to include in results

          Returns:
-             List of (category, label, confidence) tuples above threshold
+             List of classification results, one list per input text
          """
+         if not texts:
+             return []
+
          self._load_model()

          if not self._label_embeddings:
              raise RuntimeError("Label embeddings not pre-computed.")

-         # Check cache for input text embedding
-         if text in self._text_embedding_cache:
-             input_normalized = self._text_embedding_cache[text]
-         else:
-             input_embedding = self._model.encode(text, convert_to_numpy=True, show_progress_bar=False)
-             input_norm = np.linalg.norm(input_embedding)
-             input_normalized = (input_embedding / (input_norm + 1e-8)).astype(np.float32)
-             self._text_embedding_cache[text] = input_normalized
-             logger.debug(f"Cached embedding for text: '{text[:50]}...' (cache size: {len(self._text_embedding_cache)})")
-
-         # Compute average similarity to each category
-         category_scores: list[tuple[str, float]] = []
-         for category, labels in self._label_embeddings.items():
-             if not labels:
-                 continue
+         # Batch encode all texts
+         input_embeddings = self.encode_batch(texts)

-             sims = []
-             for label_embedding in labels.values():
-                 sim = float(np.dot(input_normalized, label_embedding))
-                 sims.append(sim)
+         # Prepare label embeddings as matrices for vectorized similarity
+         all_results: list[list[tuple[str, str, float]]] = []

-             avg_sim = np.mean(sims)
-             category_scores.append((category, avg_sim))
+         for input_normalized in input_embeddings:
+             # Compute average similarity to each category
+             category_scores: list[tuple[str, float]] = []
+             for category, labels in self._label_embeddings.items():
+                 if not labels:
+                     continue

-         category_scores.sort(key=lambda x: x[1], reverse=True)
+                 sims = []
+                 for label_embedding in labels.values():
+                     sim = float(np.dot(input_normalized, label_embedding))
+                     sims.append(sim)

-         results: list[tuple[str, str, float]] = []
+                 avg_sim = np.mean(sims)
+                 category_scores.append((category, avg_sim))

-         for category, _ in category_scores[:top_k_categories]:
-             for label, label_embedding in self._label_embeddings[category].items():
-                 raw_sim = float(np.dot(input_normalized, label_embedding))
-                 calibrated_score = self._calibrate_score(raw_sim)
+             category_scores.sort(key=lambda x: x[1], reverse=True)

-                 if calibrated_score >= min_score:
-                     results.append((category, label, calibrated_score))
+             results: list[tuple[str, str, float]] = []

-         # Sort by confidence descending
-         results.sort(key=lambda x: x[2], reverse=True)
-         return results
+             for category, _ in category_scores[:top_k_categories]:
+                 for label, label_embedding in self._label_embeddings[category].items():
+                     raw_sim = float(np.dot(input_normalized, label_embedding))
+                     calibrated_score = self._calibrate_score(raw_sim)
+
+                     if calibrated_score >= min_score:
+                         results.append((category, label, calibrated_score))
+
+             # Sort by confidence descending
+             results.sort(key=lambda x: x[2], reverse=True)
+             all_results.append(results)
+
+         return all_results
+
+     def classify_hierarchical(
+         self,
+         text: str,
+         top_k_categories: int = 3,
+         min_score: float = 0.3,
+     ) -> list[tuple[str, str, float]]:
+         """Hierarchical classification: find categories, then all labels above threshold.
+
+         Returns all labels above the threshold, not just the best match.
+
+         Args:
+             text: Text to classify
+             top_k_categories: Number of top categories to consider
+             min_score: Minimum calibrated score to include in results
+
+         Returns:
+             List of (category, label, confidence) tuples above threshold
+         """
+         # Use batch method for single text
+         results = self.classify_batch([text], top_k_categories, min_score)
+         return results[0] if results else []


@@ -223,7 +290,7 @@ class EmbeddingTaxonomyClassifier(BaseTaxonomyPlugin):
          self._top_k_categories = top_k_categories
          self._min_confidence = min_confidence

-         self._taxonomy: Optional[dict[str, dict[str, int]]] = None
+         self._taxonomy: Optional[dict[str, dict[str, TaxonomyEntry]]] = None
          self._classifier: Optional[EmbeddingClassifier] = None
          self._embeddings_computed = False

@@ -243,6 +310,16 @@ class EmbeddingTaxonomyClassifier(BaseTaxonomyPlugin):
      def description(self) -> str:
          return "Classifies statements using embedding similarity (faster than MNLI)"

+     @property
+     def model_vram_gb(self) -> float:
+         """EmbeddingGemma model weights ~1.2GB."""
+         return 1.2
+
+     @property
+     def per_item_vram_gb(self) -> float:
+         """Each text embedding ~0.05GB (embeddings are small)."""
+         return 0.05
+
      @property
      def taxonomy_name(self) -> str:
          return "esg_topics_embedding"
@@ -262,7 +339,7 @@ class EmbeddingTaxonomyClassifier(BaseTaxonomyPlugin):
      def supported_categories(self) -> list[str]:
          return self._categories.copy()

-     def _load_taxonomy(self) -> dict[str, dict[str, int]]:
+     def _load_taxonomy(self) -> dict[str, dict[str, TaxonomyEntry]]:
          if self._taxonomy is not None:
              return self._taxonomy

@@ -329,9 +406,79 @@ class EmbeddingTaxonomyClassifier(BaseTaxonomyPlugin):
      def _get_label_id(self, category: str, label: str) -> Optional[int]:
          taxonomy = self._load_taxonomy()
          if category in taxonomy:
-             return taxonomy[category].get(label)
+             entry = taxonomy[category].get(label)
+             if entry:
+                 return entry.get("id")
          return None

+     def classify_batch(
+         self,
+         items: list[tuple[PipelineStatement, CanonicalEntity, CanonicalEntity]],
+         context: PipelineContext,
+     ) -> list[list[TaxonomyResult]]:
+         """
+         Classify multiple statements in a single batch for efficiency.
+
+         Batch encodes all source texts, then classifies each against the taxonomy.
+
+         Args:
+             items: List of (statement, subject_canonical, object_canonical) tuples
+             context: Pipeline context
+
+         Returns:
+             List of TaxonomyResult lists, one per input statement
+         """
+         if not items:
+             return []
+
+         # Extract unique source texts (may have duplicates across statements)
+         texts = [stmt.source_text for stmt, _, _ in items]
+         unique_texts = list(set(texts))
+
+         logger.info(f"Batch classifying {len(items)} statements ({len(unique_texts)} unique texts)")
+
+         try:
+             classifier = self._get_classifier()
+
+             # Batch classify all unique texts
+             batch_results = classifier.classify_batch(
+                 unique_texts,
+                 top_k_categories=self._top_k_categories,
+                 min_score=self._min_confidence,
+             )
+
+             # Map unique texts to their classifications
+             text_to_results: dict[str, list[tuple[str, str, float]]] = {
+                 text: results for text, results in zip(unique_texts, batch_results)
+             }
+
+             # Build results for each input statement
+             all_results: list[list[TaxonomyResult]] = []
+             for stmt, _, _ in items:
+                 classifications = text_to_results.get(stmt.source_text, [])
+
+                 results: list[TaxonomyResult] = []
+                 for category, label, confidence in classifications:
+                     label_id = self._get_label_id(category, label)
+
+                     results.append(TaxonomyResult(
+                         taxonomy_name=self.taxonomy_name,
+                         category=category,
+                         label=label,
+                         label_id=label_id,
+                         confidence=round(confidence, 4),
+                         classifier=self.name,
+                     ))
+
+                 all_results.append(results)
+
+             return all_results
+
+         except Exception as e:
+             logger.warning(f"Batch taxonomy classification failed: {e}")
+             # Return empty results for all items
+             return [[] for _ in items]
+

  # For testing without decorator
  EmbeddingTaxonomyClassifierClass = EmbeddingTaxonomyClassifier
@@ -8,9 +8,19 @@ where there are too many possible values for simple multi-choice classification.
  import json
  import logging
  from pathlib import Path
- from typing import Optional
+ from typing import Optional, TypedDict

  from ..base import BaseTaxonomyPlugin, TaxonomySchema, PluginCapability
+
+
+ class TaxonomyEntry(TypedDict):
+     """Structure for each taxonomy label entry."""
+     description: str
+     id: int
+     mnli_label: str
+     embedding_label: str
+
+
  from ...pipeline.context import PipelineContext
  from ...pipeline.registry import PluginRegistry
  from ...models import (
@@ -160,7 +170,7 @@ class MNLITaxonomyClassifier(BaseTaxonomyPlugin):
          self._top_k_categories = top_k_categories
          self._min_confidence = min_confidence

-         self._taxonomy: Optional[dict[str, dict[str, int]]] = None
+         self._taxonomy: Optional[dict[str, dict[str, TaxonomyEntry]]] = None
          self._classifier: Optional[MNLIClassifier] = None

      @property
@@ -198,7 +208,7 @@ class MNLITaxonomyClassifier(BaseTaxonomyPlugin):
      def supported_categories(self) -> list[str]:
          return self._categories.copy()

-     def _load_taxonomy(self) -> dict[str, dict[str, int]]:
+     def _load_taxonomy(self) -> dict[str, dict[str, TaxonomyEntry]]:
          """Load taxonomy from JSON file."""
          if self._taxonomy is not None:
              return self._taxonomy
@@ -271,7 +281,9 @@ class MNLITaxonomyClassifier(BaseTaxonomyPlugin):
      def _get_label_id(self, category: str, label: str) -> Optional[int]:
          taxonomy = self._load_taxonomy()
          if category in taxonomy:
-             return taxonomy[category].get(label)
+             entry = taxonomy[category].get(label)
+             if entry:
+                 return entry.get("id")
          return None


@@ -409,18 +409,18 @@ class BeamScorer:
          filtered = [s for s in all_statements if (s.confidence_score or 0) >= min_conf]
          logger.debug(f" After confidence filter (>={min_conf}): {len(filtered)} statements")

-         # # Filter out statements where source_text doesn't support the predicate
-         # # This catches model hallucinations where predicate doesn't match the evidence
-         # consistent = [
-         #     s for s in filtered
-         #     if self._source_text_supports_predicate(s)
-         # ]
-         # logger.debug(f" After predicate consistency filter: {len(consistent)} statements")
+         # Filter out statements where source_text doesn't support the predicate
+         # This catches model hallucinations where predicate doesn't match the evidence
+         consistent = [
+             s for s in filtered
+             if self._source_text_supports_predicate(s)
+         ]
+         logger.debug(f" After predicate consistency filter: {len(consistent)} statements")

          # Deduplicate - keep highest confidence for each (subject, predicate, object)
          # Note: Same subject+predicate with different objects is valid (e.g., "Apple announced X and Y")
          seen: dict[tuple[str, str, str], Statement] = {}
-         for stmt in all_statements:
+         for stmt in consistent:
              key = (
                  stmt.subject.text.lower(),
                  stmt.predicate.lower(),
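For reference, the re-enabled consistency filter now feeds the deduplication step, which keeps the highest-confidence statement per (subject, predicate, object) key. The hunk ends mid-statement, so the sketch below assumes the object key component and the confidence comparison mirror the code that is shown; it is an illustration, not the package's exact continuation:

seen: dict[tuple[str, str, str], Statement] = {}
for stmt in consistent:
    key = (
        stmt.subject.text.lower(),
        stmt.predicate.lower(),
        stmt.object.text.lower(),  # assumed third key component
    )
    prev = seen.get(key)
    if prev is None or (stmt.confidence_score or 0) > (prev.confidence_score or 0):
        seen[key] = stmt
deduplicated = list(seen.values())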
@@ -1,55 +0,0 @@
- statement_extractor/__init__.py,sha256=Lmgw3jtwrfu09mXSfNFCB5AN0J6tsEQ2uOrrQciMrtI,3215
- statement_extractor/canonicalization.py,sha256=ZMLs6RLWJa_rOJ8XZ7PoHFU13-zeJkOMDnvK-ZaFa5s,5991
- statement_extractor/cli.py,sha256=iqsqvLAN0FMRoE4KskEoW-4DE5_7Tll8xeHA1t04KJg,25028
- statement_extractor/extractor.py,sha256=CGJCmAMiIoDsPtjIdvOHYBcz8058eYpfLMngjELMJhI,38403
- statement_extractor/gliner_extraction.py,sha256=OL4w-0_rZc6XCojaVsbGY4VdIXRJ6j8ZmeUeTOL0Ue0,8118
- statement_extractor/llm.py,sha256=1eBrYs-bUPqzbpiiif_hH-gE_DeM-l3cmddrIoLHFXo,8010
- statement_extractor/models.py,sha256=fXTT7qxPqynnrrpb77nCgs3K2yn_YgbSugSXv12boX4,12312
- statement_extractor/predicate_comparer.py,sha256=jcuaBi5BYqD3TKoyj3pR9dxtX5ihfDJvjdhEd2LHCwc,26184
- statement_extractor/scoring.py,sha256=s_8nhavBNzPPFmGf2FyBummH4tgP7YGpXoMhl2Jh3Xw,16650
- statement_extractor/data/default_predicates.json,sha256=7rhFXWHvStDj4sLYfBXKS50xGChPaqMKUaKTkMEJRGk,32870
- statement_extractor/data/statement_taxonomy.json,sha256=XhCeVBC4aQB-7NR40Niu4yN2BmL0c2Gd-RKkUpsYK24,37981
- statement_extractor/models/__init__.py,sha256=gjTu450FPe9dvhIVQXqBwF8u0hgSnPORGXzxmSEuCnM,2564
- statement_extractor/models/canonical.py,sha256=ld6z6RtK03iOs_aUk8Rftcm0pUoaFpLUfyfbKI26N_o,4354
- statement_extractor/models/entity.py,sha256=l2ny91BnnWwPo9zx1_Fb8WMKPNuIQFN0H7ILncylmcY,3214
- statement_extractor/models/labels.py,sha256=e-mFDuzb42oJ69gLZTWCdg5_MNqRftQ2La5x8y9Cv-Y,6236
- statement_extractor/models/qualifiers.py,sha256=YkvyWh2p1fK5iMRDC2Dq1r-XJOmJ1rvWFTFUIkQ9zcc,3495
- statement_extractor/models/statement.py,sha256=cOgabA7IJxHYjlH5AksJRNf2Rv5VScMPqZdfjQyXRN0,2733
- statement_extractor/pipeline/__init__.py,sha256=Q3M2Arx9BWH_APZxM-P0G-C3ISguG1whiA5QhxDHQCA,1071
- statement_extractor/pipeline/config.py,sha256=rxZN27OWp05F-NaatwrYkjp56zbzHZ0hMtNU1mvBxgw,4130
- statement_extractor/pipeline/context.py,sha256=wURDYtzDrmbHu40Af_C_oTtN55wnULKHNZjUx6O8t-0,6126
- statement_extractor/pipeline/orchestrator.py,sha256=oHegnsDzXj87q8iAoi-QZj2ZyB1rX5qmg57BdIjvKo0,17617
- statement_extractor/pipeline/registry.py,sha256=qj5M5tMm9GmNCguy8dWBXMT8XmhemiZjJMktZsRlevw,11415
- statement_extractor/plugins/__init__.py,sha256=8k3lQGQNQSMUzxCmk4nAH8dIc1DqEnMyiqHlZZv81q0,1099
- statement_extractor/plugins/base.py,sha256=GZ4WT5S2mH3C_uN6nyBz-nGlAn_Z2o2A51FSRu6gCEo,12797
- statement_extractor/plugins/canonicalizers/__init__.py,sha256=LDb9NodyuLSoLzrLnNzMeviK79GHnyaLGU0J_02BBgM,421
- statement_extractor/plugins/canonicalizers/base.py,sha256=dbreQuEPB48eBJmah7hpl67azVU4QLhbvSrjXr0vT88,195
- statement_extractor/plugins/canonicalizers/location.py,sha256=Rz5SCM4bb0p0gsnHPzsQJv-RN59yoj9Z1NmF8yLQNv0,6590
- statement_extractor/plugins/canonicalizers/organization.py,sha256=L-mhdctkRXuu84RsNHp80M_tDIiMumYaHAG6WfxpH4c,7482
- statement_extractor/plugins/canonicalizers/person.py,sha256=Nw8FuJOBmg-cTaOTd2BJ1TZtydprfzIKL25wJa_VJek,6944
- statement_extractor/plugins/extractors/__init__.py,sha256=sqxTI7WwDLVQKwOiQXqWS72gjJnwb76Gs9N3LGetBnI,253
- statement_extractor/plugins/extractors/base.py,sha256=kNRsQ7BL84lXPXREm7CihrprDUaFwDDvMpBcbZlwSGA,179
- statement_extractor/plugins/extractors/gliner2.py,sha256=rgfY8l9v8EWCxfB3g6hLnmLCIekTBkfWMG8dgSAZu-E,21627
- statement_extractor/plugins/labelers/__init__.py,sha256=flHEoBvnzQ3vAKkIUHyezpYi2H3KJvYGRerCVnc80r0,965
- statement_extractor/plugins/labelers/base.py,sha256=hIgJKq2LU00OcL0Zjy1L9hP8K2onlM_xtZ63XcH8qDE,171
- statement_extractor/plugins/labelers/confidence.py,sha256=XiXjBYe-8ch_SCKnz0sAwTT1mJ_XKMsuzXBbwAW_OK0,4083
- statement_extractor/plugins/labelers/relation_type.py,sha256=e5ASwVqJGMSCrx5GtyNk85q_-19D7W_4jI-J-Pv_kxY,2506
- statement_extractor/plugins/labelers/sentiment.py,sha256=nlWv9ymb7hlDIcFa-gjbIvZlJY1VrHrXhKMD-udmIzM,5027
- statement_extractor/plugins/labelers/taxonomy.py,sha256=jQp5emgWf6XgmOx7arh-owF_-TjVxiPKSJ2OGkTPbBs,12427
- statement_extractor/plugins/labelers/taxonomy_embedding.py,sha256=grvC_R_sg05hR6l0DgaELy2wmf6OkbvV1pRuNU0FVk4,16027
- statement_extractor/plugins/qualifiers/__init__.py,sha256=kefjGunlVDKLy2NXmtr5ZXyYi-swyQdPLkB-tHV_0vk,495
- statement_extractor/plugins/qualifiers/base.py,sha256=Kx--OdIh77mnjSkTl1NvUeekItRiG8AnBUcuznOZeBI,179
- statement_extractor/plugins/qualifiers/companies_house.py,sha256=_6ExJCjD0V4eZNYXtfBY99obqLpRaSv-G-V7N6R1wLg,5376
- statement_extractor/plugins/qualifiers/gleif.py,sha256=WZqcNT_Yq4yVe4rdkWO59C9yZ4geV2ZTDk9wxLlOeTg,5645
- statement_extractor/plugins/qualifiers/person.py,sha256=si_9CLjHsH9jYFugej4t0HMnsivclh-Yi70U6NglfIU,7101
- statement_extractor/plugins/qualifiers/sec_edgar.py,sha256=3XDbizlR9YQgLrC7p-owV8Td-3TYaJlMb4B7saha3vw,6288
- statement_extractor/plugins/splitters/__init__.py,sha256=05CYeAEO0lZsapK5pjxZJbOCLI1kjeK6IQjftxqqg5g,224
- statement_extractor/plugins/splitters/base.py,sha256=GeIBchFTr8icRSfYR8bGSb4-GoEZ1N0IGN6Kl5W2mL0,175
- statement_extractor/plugins/splitters/t5_gemma.py,sha256=8joOzlMKXhSyJaq5c3F8t-gdPcZEDiVAzNcMlgJAqsE,6733
- statement_extractor/plugins/taxonomy/__init__.py,sha256=8N0tW7pm95DSCqM-s99ea0Tigbi9bZMyTkKblR1qmLw,307
- statement_extractor/plugins/taxonomy/embedding.py,sha256=QW1RR07JoE8Ah97gDZ_w_ATEe6-z2t2nl1zeTDAgFjM,11347
- statement_extractor/plugins/taxonomy/mnli.py,sha256=IzLjHXUFgVAgEvYI5EzOBs19UxvpcbJa8HjqI__tYII,8905
- corp_extractor-0.5.0.dist-info/METADATA,sha256=H4Z8ExZFdbknpHg-EZ1P9B137hCPwKXBezHSF7X9EOE,21567
- corp_extractor-0.5.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
- corp_extractor-0.5.0.dist-info/entry_points.txt,sha256=i0iKFqPIusvb-QTQ1zNnFgAqatgVah-jIhahbs5TToQ,115
- corp_extractor-0.5.0.dist-info/RECORD,,
@@ -1,17 +0,0 @@
- """
- Canonicalizer plugins for Stage 4 (Canonicalization).
-
- Resolves entities to their canonical forms.
- """
-
- from .base import BaseCanonicalizerPlugin
- from .organization import OrganizationCanonicalizer
- from .person import PersonCanonicalizer
- from .location import LocationCanonicalizer
-
- __all__ = [
-     "BaseCanonicalizerPlugin",
-     "OrganizationCanonicalizer",
-     "PersonCanonicalizer",
-     "LocationCanonicalizer",
- ]
@@ -1,9 +0,0 @@
- """
- Base class for canonicalizer plugins.
-
- Re-exports BaseCanonicalizerPlugin from the main plugins module.
- """
-
- from ..base import BaseCanonicalizerPlugin
-
- __all__ = ["BaseCanonicalizerPlugin"]