corp-extractor 0.5.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +191 -24
  2. corp_extractor-0.9.0.dist-info/RECORD +76 -0
  3. statement_extractor/__init__.py +1 -1
  4. statement_extractor/cli.py +1227 -10
  5. statement_extractor/data/statement_taxonomy.json +6949 -1159
  6. statement_extractor/database/__init__.py +52 -0
  7. statement_extractor/database/embeddings.py +186 -0
  8. statement_extractor/database/hub.py +520 -0
  9. statement_extractor/database/importers/__init__.py +24 -0
  10. statement_extractor/database/importers/companies_house.py +545 -0
  11. statement_extractor/database/importers/gleif.py +538 -0
  12. statement_extractor/database/importers/sec_edgar.py +375 -0
  13. statement_extractor/database/importers/wikidata.py +1012 -0
  14. statement_extractor/database/importers/wikidata_people.py +632 -0
  15. statement_extractor/database/models.py +230 -0
  16. statement_extractor/database/resolver.py +245 -0
  17. statement_extractor/database/store.py +1609 -0
  18. statement_extractor/document/__init__.py +62 -0
  19. statement_extractor/document/chunker.py +410 -0
  20. statement_extractor/document/context.py +171 -0
  21. statement_extractor/document/deduplicator.py +173 -0
  22. statement_extractor/document/html_extractor.py +246 -0
  23. statement_extractor/document/loader.py +303 -0
  24. statement_extractor/document/pipeline.py +388 -0
  25. statement_extractor/document/summarizer.py +195 -0
  26. statement_extractor/models/__init__.py +16 -1
  27. statement_extractor/models/canonical.py +44 -1
  28. statement_extractor/models/document.py +308 -0
  29. statement_extractor/models/labels.py +47 -18
  30. statement_extractor/models/qualifiers.py +51 -3
  31. statement_extractor/models/statement.py +26 -0
  32. statement_extractor/pipeline/config.py +6 -11
  33. statement_extractor/pipeline/orchestrator.py +80 -111
  34. statement_extractor/pipeline/registry.py +52 -46
  35. statement_extractor/plugins/__init__.py +20 -8
  36. statement_extractor/plugins/base.py +334 -64
  37. statement_extractor/plugins/extractors/gliner2.py +10 -0
  38. statement_extractor/plugins/labelers/taxonomy.py +18 -5
  39. statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
  40. statement_extractor/plugins/pdf/__init__.py +10 -0
  41. statement_extractor/plugins/pdf/pypdf.py +291 -0
  42. statement_extractor/plugins/qualifiers/__init__.py +11 -0
  43. statement_extractor/plugins/qualifiers/companies_house.py +14 -3
  44. statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
  45. statement_extractor/plugins/qualifiers/gleif.py +14 -3
  46. statement_extractor/plugins/qualifiers/person.py +578 -14
  47. statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
  48. statement_extractor/plugins/scrapers/__init__.py +10 -0
  49. statement_extractor/plugins/scrapers/http.py +236 -0
  50. statement_extractor/plugins/splitters/t5_gemma.py +158 -53
  51. statement_extractor/plugins/taxonomy/embedding.py +193 -46
  52. statement_extractor/plugins/taxonomy/mnli.py +16 -4
  53. statement_extractor/scoring.py +8 -8
  54. corp_extractor-0.5.0.dist-info/RECORD +0 -55
  55. statement_extractor/plugins/canonicalizers/__init__.py +0 -17
  56. statement_extractor/plugins/canonicalizers/base.py +0 -9
  57. statement_extractor/plugins/canonicalizers/location.py +0 -219
  58. statement_extractor/plugins/canonicalizers/organization.py +0 -230
  59. statement_extractor/plugins/canonicalizers/person.py +0 -242
  60. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
  61. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,420 @@
1
"""
EmbeddingCompanyQualifier - Qualifies ORG entities using embedding similarity.

Uses a local embedding database to:
1. Find similar company names by embedding
2. Use LLM to confirm the best match
3. Return CanonicalEntity with FQN and qualifiers
"""

import logging
from typing import Optional

from ..base import BaseQualifierPlugin, PluginCapability
from ...pipeline.context import PipelineContext
from ...pipeline.registry import PluginRegistry
from ...models import (
    CanonicalEntity,
    CanonicalMatch,
    EntityQualifiers,
    EntityType,
    ExtractedEntity,
    QualifiedEntity,
)

logger = logging.getLogger(__name__)


# Prompt sent to the confirmation LLM: it must answer with a candidate
# number or the literal "NONE" (parsed by EmbeddingCompanyQualifier).
COMPANY_MATCH_PROMPT = """You are matching a company name extracted from text to a database of known companies.

Extracted name: "{query_name}"
{context_line}
Candidate matches (sorted by similarity):
{candidates}

Task: Select the BEST match, or respond "NONE" if no candidate is a good match.

Rules:
- The match should refer to the same legal entity
- Minor spelling differences or abbreviations are OK (e.g., "Apple" matches "Apple Inc.")
- Different companies with similar names should NOT match
- Consider the REGION when matching - prefer companies from regions mentioned in or relevant to the context
- If the extracted name is too generic or ambiguous, respond "NONE"

Respond with ONLY the number of the best match (1, 2, 3, etc.) or "NONE".
"""
@PluginRegistry.qualifier
class EmbeddingCompanyQualifier(BaseQualifierPlugin):
    """
    Qualifier plugin for ORG entities using embedding similarity.

    Uses a pre-built embedding database to find and confirm company matches.
    This runs before API-based qualifiers (GLEIF, Companies House, SEC Edgar)
    and provides faster, offline matching when the database is available.
    """

    def __init__(
        self,
        db_path: Optional[str] = None,
        top_k: int = 20,
        min_similarity: float = 0.5,
        use_llm_confirmation: bool = True,
        auto_download_db: bool = True,
    ):
        """
        Initialize the embedding company qualifier.

        Args:
            db_path: Path to company database (auto-detects if None)
            top_k: Number of candidates to retrieve
            min_similarity: Minimum similarity threshold
            use_llm_confirmation: Whether to use LLM for match confirmation
            auto_download_db: Whether to auto-download database from HuggingFace
        """
        self._db_path = db_path
        self._top_k = top_k
        self._min_similarity = min_similarity
        self._use_llm_confirmation = use_llm_confirmation
        self._auto_download_db = auto_download_db

        # Lazy-loaded components: created on first use so constructing the
        # plugin never triggers downloads or model loads.
        self._database = None
        self._embedder = None
        self._llm = None
        # Normalized entity text -> resolved entity; misses are cached as None
        # so repeated unknown names don't re-run search/LLM confirmation.
        self._cache: dict[str, Optional[CanonicalEntity]] = {}

    @property
    def name(self) -> str:
        return "embedding_company_qualifier"

    @property
    def priority(self) -> int:
        return 5  # Runs before API-based qualifiers (GLEIF=10, CH=20, SEC=30)

    @property
    def capabilities(self) -> PluginCapability:
        # LLM_REQUIRED is advertised only when confirmation is enabled.
        caps = PluginCapability.CACHING | PluginCapability.BATCH_PROCESSING
        if self._use_llm_confirmation:
            caps |= PluginCapability.LLM_REQUIRED
        return caps

    @property
    def description(self) -> str:
        return "Qualifies ORG entities using embedding similarity search with optional LLM confirmation"

    @property
    def supported_entity_types(self) -> set[EntityType]:
        return {EntityType.ORG}

    @property
    def supported_identifier_types(self) -> list[str]:
        return ["lei", "sec_cik", "ch_number"]

    @property
    def provided_identifier_types(self) -> list[str]:
        return ["lei", "sec_cik", "ch_number", "canonical_id"]

    def _get_database(self):
        """Get or initialize the company database (None if unavailable)."""
        if self._database is not None:
            return self._database

        from ...database.store import get_database
        from ...database.hub import get_database_path

        # Explicit path wins; otherwise auto-detect (optionally downloading).
        db_path = self._db_path
        if db_path is None:
            db_path = get_database_path(auto_download=self._auto_download_db)

        if db_path is None:
            logger.warning("Company database not available. Skipping embedding qualification.")
            return None

        # Use singleton to ensure index is only loaded once
        self._database = get_database(db_path=db_path)
        logger.info(f"Loaded company database from {db_path}")
        return self._database

    def _get_embedder(self):
        """Get or initialize the embedder."""
        if self._embedder is not None:
            return self._embedder

        from ...database import CompanyEmbedder
        self._embedder = CompanyEmbedder()
        return self._embedder

    def _get_llm(self):
        """Get or initialize the LLM for confirmation (None if disabled/unavailable)."""
        if self._llm is not None:
            return self._llm

        if not self._use_llm_confirmation:
            return None

        try:
            from ...llm import get_llm
            self._llm = get_llm()
            return self._llm
        except Exception as e:
            # Degrade gracefully: similarity-threshold fallback is used instead.
            logger.warning(f"LLM not available for confirmation: {e}")
            return None

    def qualify(
        self,
        entity: ExtractedEntity,
        context: PipelineContext,
    ) -> Optional[CanonicalEntity]:
        """
        Qualify an ORG entity using embedding similarity.

        Args:
            entity: The ORG entity to qualify
            context: Pipeline context

        Returns:
            CanonicalEntity with qualifiers, FQN, and canonical match, or None if no match
        """
        if entity.type != EntityType.ORG:
            return None

        # Check cache (normalized key); empty/whitespace-only names cannot
        # be matched, so skip the embedding search entirely.
        cache_key = entity.text.lower().strip()
        if not cache_key:
            return None
        if cache_key in self._cache:
            return self._cache[cache_key]

        # Get database
        database = self._get_database()
        if database is None:
            return None

        # Get embedder
        embedder = self._get_embedder()

        # Embed query name
        logger.debug(f"  Embedding query: '{entity.text}'")
        query_embedding = embedder.embed(entity.text)

        # Search for similar companies using hybrid text + vector search
        logger.debug("  Searching database for similar companies...")
        results = database.search(
            query_embedding,
            top_k=self._top_k,
            query_text=entity.text,  # Enable text-based pre-filtering
        )

        # Filter by minimum similarity
        results = [(r, s) for r, s in results if s >= self._min_similarity]

        if not results:
            logger.debug(f"  No matches found above threshold {self._min_similarity}")
            self._cache[cache_key] = None
            return None

        # Log all candidates
        logger.info(f"  Found {len(results)} candidates for '{entity.text}':")
        for i, (record, sim) in enumerate(results[:10], 1):
            region_str = f" [{record.region}]" if record.region else ""
            logger.info(f"    {i}. {record.name}{region_str} (sim={sim:.3f}, source={record.source})")

        # Get best match (optionally with LLM confirmation)
        logger.info(f"  Selecting best match (LLM={self._use_llm_confirmation})...")
        best_match = self._select_best_match(entity.text, results, context)

        if best_match is None:
            logger.info(f"  No confident match for '{entity.text}'")
            self._cache[cache_key] = None
            return None

        record, similarity = best_match
        logger.info(f"  Matched: '{record.name}' (source={record.source}, similarity={similarity:.3f})")

        # Build CanonicalEntity from matched record
        canonical = self._build_canonical_entity(entity, record, similarity)

        self._cache[cache_key] = canonical
        return canonical

    def _select_best_match(
        self,
        query_name: str,
        candidates: list[tuple],
        context: "PipelineContext",
    ) -> Optional[tuple]:
        """
        Select the best (record, similarity) pair from candidates.

        Uses LLM if available and configured, otherwise returns top match
        only when its similarity clears a conservative 0.85 threshold.
        """
        if not candidates:
            return None

        # If only one strong match, use it directly
        if len(candidates) == 1 and candidates[0][1] >= 0.9:
            logger.info(f"    Single strong match: '{candidates[0][0].name}' (sim={candidates[0][1]:.3f})")
            return candidates[0]

        # Try LLM confirmation
        llm = self._get_llm()
        if llm is not None:
            try:
                return self._llm_select_match(query_name, candidates, context)
            except Exception as e:
                logger.warning(f"    LLM confirmation failed: {e}")

        # Fallback: use top match if similarity is high enough
        top_record, top_similarity = candidates[0]
        if top_similarity >= 0.85:
            logger.info(f"    No LLM, using top match: '{top_record.name}' (sim={top_similarity:.3f})")
            return candidates[0]

        logger.info(f"    No confident match for '{query_name}' (top sim={top_similarity:.3f} < 0.85)")
        return None

    def _llm_select_match(
        self,
        query_name: str,
        candidates: list[tuple],
        context: "PipelineContext",
    ) -> Optional[tuple]:
        """Use LLM to select the best match; None means the LLM rejected all."""
        # Format candidates for prompt with region info
        candidate_lines = []
        for i, (record, similarity) in enumerate(candidates[:10], 1):  # Limit to top 10
            region_str = f", region: {record.region}" if record.region else ""
            candidate_lines.append(
                f"{i}. {record.name} (source: {record.source}{region_str}, similarity: {similarity:.3f})"
            )

        # Build context line from source text if available
        context_line = ""
        if context.source_text:
            # Truncate source text for prompt
            source_preview = context.source_text[:500] + "..." if len(context.source_text) > 500 else context.source_text
            context_line = f"Source text context: \"{source_preview}\"\n"

        prompt = COMPANY_MATCH_PROMPT.format(
            query_name=query_name,
            context_line=context_line,
            candidates="\n".join(candidate_lines),
        )

        # Get LLM response. Models often add quotes or trailing punctuation
        # (e.g. '"NONE"', '1.'), so strip that noise before parsing instead
        # of silently falling through to the similarity fallback.
        response = self._llm.generate(prompt, max_tokens=10, stop=["\n"])
        response = response.strip().strip('."\'')

        logger.info(f"    LLM response for '{query_name}': {response}")

        # Parse response
        if response.upper() == "NONE":
            logger.info(f"    LLM chose: NONE (no match)")
            return None

        try:
            idx = int(response) - 1
            if 0 <= idx < len(candidates):
                chosen = candidates[idx]
                logger.info(f"    LLM chose: #{idx + 1} '{chosen[0].name}' (sim={chosen[1]:.3f})")
                return chosen
        except ValueError:
            logger.warning(f"    LLM response '{response}' could not be parsed as number")

        # Fallback to top match if LLM response is unclear
        if candidates[0][1] >= 0.8:
            logger.info(f"    Fallback to top match: '{candidates[0][0].name}' (sim={candidates[0][1]:.3f})")
            return candidates[0]

        logger.info(f"    No confident match (top sim={candidates[0][1]:.3f} < 0.8)")
        return None

    def _build_canonical_entity(
        self,
        entity: ExtractedEntity,
        record,
        similarity: float,
    ) -> CanonicalEntity:
        """Build CanonicalEntity from a matched company record."""
        # Map source names to identifier prefixes
        source = record.source
        source_id = record.source_id
        source_prefix_map = {
            "gleif": "LEI",
            "sec_edgar": "SEC-CIK",
            "companies_house": "UK-CH",
            "wikidata": "WIKIDATA",
        }
        source_prefix = source_prefix_map.get(source, source.upper())

        # Build identifiers dict
        identifiers = {
            "source": source_prefix,
            "source_id": source_id,
            "canonical_id": f"{source_prefix}:{source_id}",
        }

        # Add source-specific identifiers for compatibility
        if source == "gleif":
            identifiers["lei"] = source_id
        elif source == "sec_edgar":
            identifiers["sec_cik"] = source_id
            if record.record.get("ticker"):
                identifiers["ticker"] = record.record["ticker"]
        elif source == "companies_house":
            identifiers["ch_number"] = source_id

        # Extract location info from record
        record_data = record.record
        jurisdiction = record_data.get("jurisdiction")
        country = record_data.get("country")
        city = record_data.get("city")
        region = record.region  # From CompanyRecord

        # Build qualifiers
        qualifiers = EntityQualifiers(
            legal_name=record.name,
            region=region,
            jurisdiction=jurisdiction,
            country=country,
            city=city,
            identifiers=identifiers,
        )

        # Create QualifiedEntity
        qualified = QualifiedEntity(
            entity_ref=entity.entity_ref,
            original_text=entity.text,
            entity_type=entity.type,
            qualifiers=qualifiers,
            qualification_sources=[self.name],
        )

        # Build FQN: "LEGAL_NAME (SOURCE,REGION)"
        fqn_parts = [source_prefix]
        if region:
            fqn_parts.append(region)
        fqn = f"{record.name} ({','.join(fqn_parts)})"

        # Create canonical match (clamp confidence to [0, 1] for float precision)
        clamped_confidence = min(max(similarity, 0.0), 1.0)
        canonical_match = CanonicalMatch(
            canonical_id=f"{source_prefix}:{source_id}",
            canonical_name=record.name,
            match_method="embedding",
            match_confidence=clamped_confidence,
            match_details={"source": source, "similarity": similarity},
        )

        return CanonicalEntity(
            entity_ref=entity.entity_ref,
            qualified_entity=qualified,
            canonical_match=canonical_match,
            fqn=fqn,
        )
417
+
418
+
419
# Plain alias so tests can reference the class without going through the
# registry decorator.
EmbeddingCompanyQualifierClass = EmbeddingCompanyQualifier
@@ -1,18 +1,21 @@
1
1
  """
2
2
  GLEIFQualifierPlugin - Qualifies ORG entities with LEI and related data.
3
3
 
4
+ DEPRECATED: Use EmbeddingCompanyQualifier instead, which uses a local
5
+ embedding database with pre-loaded GLEIF data for faster, offline matching.
6
+
4
7
  Uses the GLEIF (Global Legal Entity Identifier Foundation) API to:
5
8
  - Look up LEI by organization name
6
9
  - Retrieve legal name, jurisdiction, parent company info
7
10
  """
8
11
 
9
12
  import logging
13
+ import warnings
10
14
  from typing import Optional
11
15
  from urllib.parse import quote
12
16
 
13
17
  from ..base import BaseQualifierPlugin, PluginCapability
14
18
  from ...pipeline.context import PipelineContext
15
- from ...pipeline.registry import PluginRegistry
16
19
  from ...models import ExtractedEntity, EntityQualifiers, EntityType
17
20
 
18
21
  logger = logging.getLogger(__name__)
@@ -21,11 +24,12 @@ logger = logging.getLogger(__name__)
21
24
  GLEIF_API_BASE = "https://api.gleif.org/api/v1"
22
25
 
23
26
 
24
- @PluginRegistry.qualifier
27
+ # DEPRECATED: Not auto-registered. Use EmbeddingCompanyQualifier instead.
25
28
  class GLEIFQualifierPlugin(BaseQualifierPlugin):
26
29
  """
27
- Qualifier plugin for ORG entities using GLEIF API.
30
+ DEPRECATED: Use EmbeddingCompanyQualifier instead.
28
31
 
32
+ Qualifier plugin for ORG entities using GLEIF API.
29
33
  Looks up Legal Entity Identifiers (LEI) and related corporate data.
30
34
  """
31
35
 
@@ -37,10 +41,17 @@ class GLEIFQualifierPlugin(BaseQualifierPlugin):
37
41
  """
38
42
  Initialize the GLEIF qualifier.
39
43
 
44
+ DEPRECATED: Use EmbeddingCompanyQualifier instead.
45
+
40
46
  Args:
41
47
  timeout: API request timeout in seconds
42
48
  cache_results: Whether to cache API results
43
49
  """
50
+ warnings.warn(
51
+ "GLEIFQualifierPlugin is deprecated. Use EmbeddingCompanyQualifier instead.",
52
+ DeprecationWarning,
53
+ stacklevel=2,
54
+ )
44
55
  self._timeout = timeout
45
56
  self._cache_results = cache_results
46
57
  self._cache: dict[str, Optional[dict]] = {}