corp-extractor 0.4.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +348 -64
  2. corp_extractor-0.9.0.dist-info/RECORD +76 -0
  3. statement_extractor/__init__.py +10 -1
  4. statement_extractor/cli.py +1663 -17
  5. statement_extractor/data/default_predicates.json +368 -0
  6. statement_extractor/data/statement_taxonomy.json +6972 -0
  7. statement_extractor/database/__init__.py +52 -0
  8. statement_extractor/database/embeddings.py +186 -0
  9. statement_extractor/database/hub.py +520 -0
  10. statement_extractor/database/importers/__init__.py +24 -0
  11. statement_extractor/database/importers/companies_house.py +545 -0
  12. statement_extractor/database/importers/gleif.py +538 -0
  13. statement_extractor/database/importers/sec_edgar.py +375 -0
  14. statement_extractor/database/importers/wikidata.py +1012 -0
  15. statement_extractor/database/importers/wikidata_people.py +632 -0
  16. statement_extractor/database/models.py +230 -0
  17. statement_extractor/database/resolver.py +245 -0
  18. statement_extractor/database/store.py +1609 -0
  19. statement_extractor/document/__init__.py +62 -0
  20. statement_extractor/document/chunker.py +410 -0
  21. statement_extractor/document/context.py +171 -0
  22. statement_extractor/document/deduplicator.py +173 -0
  23. statement_extractor/document/html_extractor.py +246 -0
  24. statement_extractor/document/loader.py +303 -0
  25. statement_extractor/document/pipeline.py +388 -0
  26. statement_extractor/document/summarizer.py +195 -0
  27. statement_extractor/extractor.py +1 -23
  28. statement_extractor/gliner_extraction.py +4 -74
  29. statement_extractor/llm.py +255 -0
  30. statement_extractor/models/__init__.py +89 -0
  31. statement_extractor/models/canonical.py +182 -0
  32. statement_extractor/models/document.py +308 -0
  33. statement_extractor/models/entity.py +102 -0
  34. statement_extractor/models/labels.py +220 -0
  35. statement_extractor/models/qualifiers.py +139 -0
  36. statement_extractor/models/statement.py +101 -0
  37. statement_extractor/models.py +4 -1
  38. statement_extractor/pipeline/__init__.py +39 -0
  39. statement_extractor/pipeline/config.py +129 -0
  40. statement_extractor/pipeline/context.py +177 -0
  41. statement_extractor/pipeline/orchestrator.py +416 -0
  42. statement_extractor/pipeline/registry.py +303 -0
  43. statement_extractor/plugins/__init__.py +55 -0
  44. statement_extractor/plugins/base.py +716 -0
  45. statement_extractor/plugins/extractors/__init__.py +13 -0
  46. statement_extractor/plugins/extractors/base.py +9 -0
  47. statement_extractor/plugins/extractors/gliner2.py +546 -0
  48. statement_extractor/plugins/labelers/__init__.py +29 -0
  49. statement_extractor/plugins/labelers/base.py +9 -0
  50. statement_extractor/plugins/labelers/confidence.py +138 -0
  51. statement_extractor/plugins/labelers/relation_type.py +87 -0
  52. statement_extractor/plugins/labelers/sentiment.py +159 -0
  53. statement_extractor/plugins/labelers/taxonomy.py +386 -0
  54. statement_extractor/plugins/labelers/taxonomy_embedding.py +477 -0
  55. statement_extractor/plugins/pdf/__init__.py +10 -0
  56. statement_extractor/plugins/pdf/pypdf.py +291 -0
  57. statement_extractor/plugins/qualifiers/__init__.py +30 -0
  58. statement_extractor/plugins/qualifiers/base.py +9 -0
  59. statement_extractor/plugins/qualifiers/companies_house.py +185 -0
  60. statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
  61. statement_extractor/plugins/qualifiers/gleif.py +197 -0
  62. statement_extractor/plugins/qualifiers/person.py +785 -0
  63. statement_extractor/plugins/qualifiers/sec_edgar.py +209 -0
  64. statement_extractor/plugins/scrapers/__init__.py +10 -0
  65. statement_extractor/plugins/scrapers/http.py +236 -0
  66. statement_extractor/plugins/splitters/__init__.py +13 -0
  67. statement_extractor/plugins/splitters/base.py +9 -0
  68. statement_extractor/plugins/splitters/t5_gemma.py +293 -0
  69. statement_extractor/plugins/taxonomy/__init__.py +13 -0
  70. statement_extractor/plugins/taxonomy/embedding.py +484 -0
  71. statement_extractor/plugins/taxonomy/mnli.py +291 -0
  72. statement_extractor/scoring.py +8 -8
  73. corp_extractor-0.4.0.dist-info/RECORD +0 -12
  74. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
  75. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,477 @@
1
+ """
2
+ EmbeddingTaxonomyLabeler - Classifies statements using embedding similarity.
3
+
4
+ Uses sentence-transformers to embed text and compare to pre-computed label
5
+ embeddings using cosine similarity with sigmoid calibration.
6
+
7
+ This is faster than MNLI but may be less accurate for nuanced classification.
8
+ """
9
+
10
+ import json
11
+ import logging
12
+ import time
13
+ from pathlib import Path
14
+ from typing import Optional, TypedDict
15
+
16
+ import numpy as np
17
+
18
+
19
class TaxonomyEntry(TypedDict):
    """Schema of a single label entry in the taxonomy JSON file.

    Keys mirror the on-disk format: a human-readable description, a
    stable numeric id, and the phrasings used by the MNLI and the
    embedding classifier respectively.
    """

    description: str      # human-readable explanation of the label
    id: int               # stable numeric identifier for reproducibility
    mnli_label: str       # hypothesis phrasing used by the MNLI classifier
    embedding_label: str  # phrasing embedded for similarity matching
25
+
26
+
27
+ from ..base import BaseLabelerPlugin, TaxonomySchema, PluginCapability
28
+ from ...pipeline.context import PipelineContext
29
+ from ...models import (
30
+ PipelineStatement,
31
+ CanonicalEntity,
32
+ StatementLabel,
33
+ )
34
+
35
# Module-level logger for this plugin.
logger = logging.getLogger(__name__)

# Built-in taxonomy JSON shipped with the package:
# <package root>/data/statement_taxonomy.json
DEFAULT_TAXONOMY_PATH = Path(__file__).parents[2] / "data" / "statement_taxonomy.json"

# Taxonomy categories classified when the caller does not pass its own list.
DEFAULT_CATEGORIES = [
    "environment",
    "society",
    "governance",
    "animals",
    "industry",
    "human_harm",
    "human_benefit",
    "animal_harm",
    "animal_benefit",
    "environment_harm",
    "environment_benefit",
]
54
+
55
+
56
class EmbeddingClassifier:
    """
    Embedding-based classifier using cosine similarity.

    Pre-computes L2-normalized embeddings for every label name and scores
    input text with a dot product (cosine similarity on unit vectors),
    followed by a sigmoid calibration that spreads the raw similarities
    over a wider, more discriminative range.
    """

    # Sigmoid calibration: a similarity is rescaled from [-1, 1] to [0, 1]
    # and mapped through 1 / (1 + exp(-STEEPNESS * (sim01 - THRESHOLD))).
    # Cosine similarities cluster in a narrow band (~0.5-0.9); this spreads
    # them out for better discrimination.
    SIMILARITY_THRESHOLD = 0.65
    CALIBRATION_STEEPNESS = 25.0

    def __init__(
        self,
        model_name: str = "all-MiniLM-L6-v2",
        device: Optional[str] = None,
    ):
        """
        Initialize the classifier.

        Args:
            model_name: sentence-transformers model ID
            device: Device to use ('cuda', 'mps', 'cpu', or None for auto)
        """
        self._model_name = model_name
        self._device = device
        self._model = None  # lazily created SentenceTransformer

        # Pre-computed, L2-normalized label embeddings:
        # {category: {label: float32 unit vector}}
        self._label_embeddings: dict[str, dict[str, np.ndarray]] = {}

    def _load_model(self) -> None:
        """Lazy-load the embedding model (no-op when already loaded).

        Raises:
            ImportError: if sentence-transformers is not installed.
        """
        if self._model is not None:
            return

        try:
            from sentence_transformers import SentenceTransformer
            import torch

            # Auto-detect the best available device unless one was given.
            device = self._device
            if device is None:
                if torch.cuda.is_available():
                    device = "cuda"
                elif torch.backends.mps.is_available():
                    device = "mps"
                else:
                    device = "cpu"

            logger.info(f"Loading embedding model '{self._model_name}' on {device}...")
            self._model = SentenceTransformer(self._model_name, device=device)
            logger.debug("Embedding model loaded")

        except ImportError as e:
            raise ImportError(
                "sentence-transformers is required for embedding classification. "
                "Install with: pip install sentence-transformers"
            ) from e

    def precompute_label_embeddings(
        self,
        # String annotation: forward-reference so this class does not depend
        # on TaxonomyEntry's definition order at import time.
        taxonomy: "dict[str, dict[str, TaxonomyEntry]]",
        categories: Optional[list[str]] = None,
    ) -> None:
        """
        Pre-compute L2-normalized embeddings for all label names.

        Args:
            taxonomy: Taxonomy dict {category: {label: TaxonomyEntry, ...}, ...}
            categories: Categories to include (default: all)
        """
        self._load_model()

        start_time = time.perf_counter()
        total_labels = 0

        categories_to_process = categories or list(taxonomy.keys())

        for category in categories_to_process:
            if category not in taxonomy:
                continue

            labels = taxonomy[category]
            label_names = list(labels.keys())

            if not label_names:
                continue

            # Batch-embed every label name in this category at once.
            embeddings = self._model.encode(label_names, convert_to_numpy=True)

            # Normalize to unit length so a dot product equals cosine
            # similarity; the epsilon guards against zero vectors.
            self._label_embeddings[category] = {}
            for label_name, embedding in zip(label_names, embeddings):
                norm = np.linalg.norm(embedding)
                normalized = embedding / (norm + 1e-8)
                self._label_embeddings[category][label_name] = normalized.astype(np.float32)
                total_labels += 1

        elapsed = time.perf_counter() - start_time
        logger.info(
            f"Pre-computed embeddings for {total_labels} labels "
            f"across {len(self._label_embeddings)} categories in {elapsed:.2f}s"
        )

    def _calibrate_score(self, raw_similarity: float) -> float:
        """
        Apply sigmoid calibration to amplify score differences.

        Cosine similarities cluster in a narrow range (0.5-0.9).
        This transformation spreads them out for better discrimination.
        """
        # Rescale from [-1, 1] to [0, 1].
        normalized = (raw_similarity + 1) / 2

        # Sigmoid centered at SIMILARITY_THRESHOLD. Cast to a plain Python
        # float so the declared return type holds (np.exp yields np.float64).
        exponent = -self.CALIBRATION_STEEPNESS * (normalized - self.SIMILARITY_THRESHOLD)
        return float(1.0 / (1.0 + np.exp(exponent)))

    def classify(
        self,
        text: str,
        categories: Optional[list[str]] = None,
        top_k: int = 5,
        min_score: float = 0.3,
    ) -> list[tuple[str, str, float]]:
        """
        Classify text against pre-computed label embeddings.

        Args:
            text: Text to classify
            categories: Categories to search (default: all pre-computed)
            top_k: Number of top results to return
            min_score: Minimum calibrated score threshold

        Returns:
            List of (category, label, score) tuples, sorted by score descending

        Raises:
            RuntimeError: if precompute_label_embeddings was never called.
        """
        self._load_model()

        if not self._label_embeddings:
            raise RuntimeError("Label embeddings not pre-computed. Call precompute_label_embeddings first.")

        # Embed and normalize the input text.
        input_embedding = self._model.encode(text, convert_to_numpy=True)
        input_norm = np.linalg.norm(input_embedding)
        input_normalized = input_embedding / (input_norm + 1e-8)

        # Score against every label in the requested categories.
        categories_to_process = categories or list(self._label_embeddings.keys())
        all_results: list[tuple[str, str, float]] = []

        for category in categories_to_process:
            if category not in self._label_embeddings:
                continue

            for label, label_embedding in self._label_embeddings[category].items():
                # Both vectors are unit length, so the dot product is the
                # cosine similarity.
                raw_sim = float(np.dot(input_normalized, label_embedding))
                calibrated_score = self._calibrate_score(raw_sim)

                if calibrated_score >= min_score:
                    all_results.append((category, label, calibrated_score))

        # Sort by score descending and return top-k.
        all_results.sort(key=lambda x: x[2], reverse=True)
        return all_results[:top_k]

    def classify_hierarchical(
        self,
        text: str,
        top_k_categories: int = 3,
        top_k_labels: int = 3,
        min_score: float = 0.3,
    ) -> tuple[Optional[str], Optional[str], float]:
        """
        Hierarchical classification: find best categories, then best label.

        More efficient for very large taxonomies: only the labels of the
        top-scoring categories are scored individually.

        Args:
            text: Text to classify
            top_k_categories: Number of top categories to consider
            top_k_labels: Number of labels per category to consider
                (currently unused; all labels of a chosen category are scored)
            min_score: Minimum score threshold

        Returns:
            Tuple of (category, label, score) for the best match, or
            (None, None, 0.0) when nothing clears ``min_score``.

        Raises:
            RuntimeError: if precompute_label_embeddings was never called.
        """
        self._load_model()

        if not self._label_embeddings:
            raise RuntimeError("Label embeddings not pre-computed.")

        # Embed and normalize the input text.
        input_embedding = self._model.encode(text, convert_to_numpy=True)
        input_norm = np.linalg.norm(input_embedding)
        input_normalized = input_embedding / (input_norm + 1e-8)

        # Stage 1: rank categories by mean similarity over their labels.
        category_scores: list[tuple[str, float]] = []
        for category, labels in self._label_embeddings.items():
            if not labels:
                continue

            sims = [
                float(np.dot(input_normalized, label_embedding))
                for label_embedding in labels.values()
            ]

            # float() keeps the declared list[tuple[str, float]] element type
            # (np.mean returns np.float64).
            category_scores.append((category, float(np.mean(sims))))

        # Sort categories by average similarity, best first.
        category_scores.sort(key=lambda x: x[1], reverse=True)

        # Stage 2: score every label within the top categories.
        best_result: tuple[Optional[str], Optional[str], float] = (None, None, 0.0)

        for category, _ in category_scores[:top_k_categories]:
            for label, label_embedding in self._label_embeddings[category].items():
                raw_sim = float(np.dot(input_normalized, label_embedding))
                calibrated_score = self._calibrate_score(raw_sim)

                if calibrated_score > best_result[2]:
                    best_result = (category, label, calibrated_score)

        if best_result[0] and best_result[2] >= min_score:
            return best_result

        return (None, None, 0.0)
289
+
290
+
291
class EmbeddingTaxonomyLabeler(BaseLabelerPlugin):
    """
    Labeler that classifies statements using embedding similarity.

    Faster than MNLI but may be less accurate for nuanced classification.
    Good for high-throughput scenarios.
    """

    def __init__(
        self,
        taxonomy_path: Optional[str | Path] = None,
        categories: Optional[list[str]] = None,
        model_name: str = "all-MiniLM-L6-v2",
        use_hierarchical: bool = True,
        top_k_categories: int = 3,
        min_confidence: float = 0.3,
    ):
        """
        Initialize the embedding taxonomy labeler.

        Args:
            taxonomy_path: Path to taxonomy JSON file (default: built-in taxonomy)
            categories: List of categories to use (default: all categories)
            model_name: sentence-transformers model ID
            use_hierarchical: Use hierarchical classification for efficiency
            top_k_categories: Number of top categories to consider in hierarchical mode
            min_confidence: Minimum confidence threshold for returning a label
        """
        self._taxonomy_path = Path(taxonomy_path) if taxonomy_path else DEFAULT_TAXONOMY_PATH
        self._categories = categories or DEFAULT_CATEGORIES
        self._model_name = model_name
        self._use_hierarchical = use_hierarchical
        self._top_k_categories = top_k_categories
        self._min_confidence = min_confidence

        # Lazily populated caches: taxonomy JSON, classifier, and a flag so
        # label embeddings are only computed once per labeler instance.
        self._taxonomy: Optional[dict[str, dict[str, TaxonomyEntry]]] = None
        self._classifier: Optional[EmbeddingClassifier] = None
        self._embeddings_computed = False

    @property
    def name(self) -> str:
        return "embedding_taxonomy_labeler"

    @property
    def priority(self) -> int:
        return 45  # Higher priority than MNLI - default taxonomy labeler (faster)

    @property
    def capabilities(self) -> PluginCapability:
        # NOTE(review): LLM_REQUIRED is used here to signal the embedding-model
        # dependency — confirm this matches the flag's intended semantics.
        return PluginCapability.LLM_REQUIRED | PluginCapability.BATCH_PROCESSING

    @property
    def description(self) -> str:
        return "Classifies statements using embedding similarity (faster than MNLI)"

    @property
    def label_type(self) -> str:
        return "taxonomy_embedding"

    @property
    def taxonomy_schema(self) -> TaxonomySchema:
        """Provide taxonomy schema (for documentation/introspection)."""
        taxonomy = self._load_taxonomy()
        # Restrict the advertised labels to the configured categories.
        filtered = {cat: list(labels.keys()) for cat, labels in taxonomy.items() if cat in self._categories}
        return TaxonomySchema(
            label_type=self.label_type,
            values=filtered,
            description="Statement topic classification using embedding similarity",
            scope="statement",
        )

    def _load_taxonomy(self) -> dict[str, dict[str, TaxonomyEntry]]:
        """Load (and cache) the taxonomy from its JSON file.

        Raises:
            FileNotFoundError: if the configured taxonomy path does not exist.
        """
        if self._taxonomy is not None:
            return self._taxonomy

        if not self._taxonomy_path.exists():
            raise FileNotFoundError(f"Taxonomy file not found: {self._taxonomy_path}")

        # Explicit encoding: the taxonomy JSON may contain non-ASCII text and
        # must not depend on the platform's default encoding.
        with open(self._taxonomy_path, encoding="utf-8") as f:
            self._taxonomy = json.load(f)

        logger.debug(f"Loaded taxonomy with {len(self._taxonomy)} categories")
        return self._taxonomy

    def _get_classifier(self) -> EmbeddingClassifier:
        """Get or create the embedding classifier, precomputing label embeddings once."""
        if self._classifier is None:
            self._classifier = EmbeddingClassifier(model_name=self._model_name)

        if not self._embeddings_computed:
            taxonomy = self._load_taxonomy()
            self._classifier.precompute_label_embeddings(taxonomy, self._categories)
            self._embeddings_computed = True

        return self._classifier

    def label(
        self,
        statement: PipelineStatement,
        subject_canonical: CanonicalEntity,
        object_canonical: CanonicalEntity,
        context: PipelineContext,
    ) -> Optional[StatementLabel]:
        """
        Classify statement using embedding similarity.

        Args:
            statement: The statement to label
            subject_canonical: Canonicalized subject
            object_canonical: Canonicalized object
            context: Pipeline context

        Returns:
            StatementLabel with taxonomy classification, or None if below threshold
        """
        # Check for a pre-computed classification in the pipeline context.
        # NOTE(review): a cached result below the threshold short-circuits to
        # None without re-running classification — presumably intentional to
        # avoid recomputation; confirm.
        result = context.get_classification(statement.source_text, self.label_type)
        if result:
            label_value, confidence = result
            if confidence >= self._min_confidence:
                return StatementLabel(
                    label_type=self.label_type,
                    label_value=label_value,
                    confidence=confidence,
                    labeler=self.name,
                )
            return None

        # Run embedding classification; failures degrade to "no label".
        try:
            classifier = self._get_classifier()
            text = statement.source_text

            if self._use_hierarchical:
                category, label, confidence = classifier.classify_hierarchical(
                    text,
                    top_k_categories=self._top_k_categories,
                    min_score=self._min_confidence,
                )
                if category and label:
                    full_label = f"{category}:{label}"
                else:
                    return None
            else:
                results = classifier.classify(
                    text,
                    top_k=1,
                    min_score=self._min_confidence,
                )
                if results:
                    category, label, confidence = results[0]
                    full_label = f"{category}:{label}"
                else:
                    return None

            # Attach the numeric ID for reproducibility across taxonomy edits.
            label_id = self._get_label_id(category, label)

            return StatementLabel(
                label_type=self.label_type,
                label_value=full_label,
                confidence=round(confidence, 4),
                labeler=self.name,
                metadata={"label_id": label_id, "category": category},
            )

        except Exception as e:
            logger.warning(f"Embedding taxonomy classification failed: {e}")

        return None

    def _get_label_id(self, category: str, label: str) -> Optional[int]:
        """Get the numeric ID for a label, or None when it is not in the taxonomy."""
        taxonomy = self._load_taxonomy()

        if category in taxonomy:
            entry = taxonomy[category].get(label)
            if entry:
                return entry.get("id")

        return None
473
+
474
+
475
# Aliases kept so tests can import the classes directly, without going
# through any plugin-registration decorator machinery.
EmbeddingTaxonomyLabelerClass = EmbeddingTaxonomyLabeler
EmbeddingClassifierClass = EmbeddingClassifier
@@ -0,0 +1,10 @@
1
+ """
2
+ PDF parser plugins for extracting text from PDF files.
3
+
4
+ Built-in parsers:
5
+ - pypdf_parser: Default PDF parser using PyMuPDF with optional OCR
6
+ """
7
+
8
+ from .pypdf import PyPDFParserPlugin
9
+
10
+ __all__ = ["PyPDFParserPlugin"]