corp-extractor 0.4.0-py3-none-any.whl → 0.9.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +348 -64
  2. corp_extractor-0.9.0.dist-info/RECORD +76 -0
  3. statement_extractor/__init__.py +10 -1
  4. statement_extractor/cli.py +1663 -17
  5. statement_extractor/data/default_predicates.json +368 -0
  6. statement_extractor/data/statement_taxonomy.json +6972 -0
  7. statement_extractor/database/__init__.py +52 -0
  8. statement_extractor/database/embeddings.py +186 -0
  9. statement_extractor/database/hub.py +520 -0
  10. statement_extractor/database/importers/__init__.py +24 -0
  11. statement_extractor/database/importers/companies_house.py +545 -0
  12. statement_extractor/database/importers/gleif.py +538 -0
  13. statement_extractor/database/importers/sec_edgar.py +375 -0
  14. statement_extractor/database/importers/wikidata.py +1012 -0
  15. statement_extractor/database/importers/wikidata_people.py +632 -0
  16. statement_extractor/database/models.py +230 -0
  17. statement_extractor/database/resolver.py +245 -0
  18. statement_extractor/database/store.py +1609 -0
  19. statement_extractor/document/__init__.py +62 -0
  20. statement_extractor/document/chunker.py +410 -0
  21. statement_extractor/document/context.py +171 -0
  22. statement_extractor/document/deduplicator.py +173 -0
  23. statement_extractor/document/html_extractor.py +246 -0
  24. statement_extractor/document/loader.py +303 -0
  25. statement_extractor/document/pipeline.py +388 -0
  26. statement_extractor/document/summarizer.py +195 -0
  27. statement_extractor/extractor.py +1 -23
  28. statement_extractor/gliner_extraction.py +4 -74
  29. statement_extractor/llm.py +255 -0
  30. statement_extractor/models/__init__.py +89 -0
  31. statement_extractor/models/canonical.py +182 -0
  32. statement_extractor/models/document.py +308 -0
  33. statement_extractor/models/entity.py +102 -0
  34. statement_extractor/models/labels.py +220 -0
  35. statement_extractor/models/qualifiers.py +139 -0
  36. statement_extractor/models/statement.py +101 -0
  37. statement_extractor/models.py +4 -1
  38. statement_extractor/pipeline/__init__.py +39 -0
  39. statement_extractor/pipeline/config.py +129 -0
  40. statement_extractor/pipeline/context.py +177 -0
  41. statement_extractor/pipeline/orchestrator.py +416 -0
  42. statement_extractor/pipeline/registry.py +303 -0
  43. statement_extractor/plugins/__init__.py +55 -0
  44. statement_extractor/plugins/base.py +716 -0
  45. statement_extractor/plugins/extractors/__init__.py +13 -0
  46. statement_extractor/plugins/extractors/base.py +9 -0
  47. statement_extractor/plugins/extractors/gliner2.py +546 -0
  48. statement_extractor/plugins/labelers/__init__.py +29 -0
  49. statement_extractor/plugins/labelers/base.py +9 -0
  50. statement_extractor/plugins/labelers/confidence.py +138 -0
  51. statement_extractor/plugins/labelers/relation_type.py +87 -0
  52. statement_extractor/plugins/labelers/sentiment.py +159 -0
  53. statement_extractor/plugins/labelers/taxonomy.py +386 -0
  54. statement_extractor/plugins/labelers/taxonomy_embedding.py +477 -0
  55. statement_extractor/plugins/pdf/__init__.py +10 -0
  56. statement_extractor/plugins/pdf/pypdf.py +291 -0
  57. statement_extractor/plugins/qualifiers/__init__.py +30 -0
  58. statement_extractor/plugins/qualifiers/base.py +9 -0
  59. statement_extractor/plugins/qualifiers/companies_house.py +185 -0
  60. statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
  61. statement_extractor/plugins/qualifiers/gleif.py +197 -0
  62. statement_extractor/plugins/qualifiers/person.py +785 -0
  63. statement_extractor/plugins/qualifiers/sec_edgar.py +209 -0
  64. statement_extractor/plugins/scrapers/__init__.py +10 -0
  65. statement_extractor/plugins/scrapers/http.py +236 -0
  66. statement_extractor/plugins/splitters/__init__.py +13 -0
  67. statement_extractor/plugins/splitters/base.py +9 -0
  68. statement_extractor/plugins/splitters/t5_gemma.py +293 -0
  69. statement_extractor/plugins/taxonomy/__init__.py +13 -0
  70. statement_extractor/plugins/taxonomy/embedding.py +484 -0
  71. statement_extractor/plugins/taxonomy/mnli.py +291 -0
  72. statement_extractor/scoring.py +8 -8
  73. corp_extractor-0.4.0.dist-info/RECORD +0 -12
  74. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
  75. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
statement_extractor/plugins/qualifiers/embedding_company.py
@@ -0,0 +1,420 @@
+"""
+EmbeddingCompanyQualifier - Qualifies ORG entities using embedding similarity.
+
+Uses a local embedding database to:
+1. Find similar company names by embedding
+2. Use LLM to confirm the best match
+3. Return CanonicalEntity with FQN and qualifiers
+"""
+
+import logging
+from typing import Optional
+
+from ..base import BaseQualifierPlugin, PluginCapability
+from ...pipeline.context import PipelineContext
+from ...pipeline.registry import PluginRegistry
+from ...models import (
+    ExtractedEntity,
+    EntityQualifiers,
+    EntityType,
+    QualifiedEntity,
+    CanonicalEntity,
+    CanonicalMatch,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# LLM prompt template for company matching confirmation
+COMPANY_MATCH_PROMPT = """You are matching a company name extracted from text to a database of known companies.
+
+Extracted name: "{query_name}"
+{context_line}
+Candidate matches (sorted by similarity):
+{candidates}
+
+Task: Select the BEST match, or respond "NONE" if no candidate is a good match.
+
+Rules:
+- The match should refer to the same legal entity
+- Minor spelling differences or abbreviations are OK (e.g., "Apple" matches "Apple Inc.")
+- Different companies with similar names should NOT match
+- Consider the REGION when matching - prefer companies from regions mentioned in or relevant to the context
+- If the extracted name is too generic or ambiguous, respond "NONE"
+
+Respond with ONLY the number of the best match (1, 2, 3, etc.) or "NONE".
+"""
+
+
+@PluginRegistry.qualifier
+class EmbeddingCompanyQualifier(BaseQualifierPlugin):
+    """
+    Qualifier plugin for ORG entities using embedding similarity.
+
+    Uses a pre-built embedding database to find and confirm company matches.
+    This runs before API-based qualifiers (GLEIF, Companies House, SEC Edgar)
+    and provides faster, offline matching when the database is available.
+    """
+
+    def __init__(
+        self,
+        db_path: Optional[str] = None,
+        top_k: int = 20,
+        min_similarity: float = 0.5,
+        use_llm_confirmation: bool = True,
+        auto_download_db: bool = True,
+    ):
+        """
+        Initialize the embedding company qualifier.
+
+        Args:
+            db_path: Path to company database (auto-detects if None)
+            top_k: Number of candidates to retrieve
+            min_similarity: Minimum similarity threshold
+            use_llm_confirmation: Whether to use LLM for match confirmation
+            auto_download_db: Whether to auto-download database from HuggingFace
+        """
+        self._db_path = db_path
+        self._top_k = top_k
+        self._min_similarity = min_similarity
+        self._use_llm_confirmation = use_llm_confirmation
+        self._auto_download_db = auto_download_db
+
+        # Lazy-loaded components
+        self._database = None
+        self._embedder = None
+        self._llm = None
+        self._cache: dict[str, Optional[CanonicalEntity]] = {}
+
+    @property
+    def name(self) -> str:
+        return "embedding_company_qualifier"
+
+    @property
+    def priority(self) -> int:
+        return 5  # Runs before API-based qualifiers (GLEIF=10, CH=20, SEC=30)
+
+    @property
+    def capabilities(self) -> PluginCapability:
+        caps = PluginCapability.CACHING | PluginCapability.BATCH_PROCESSING
+        if self._use_llm_confirmation:
+            caps |= PluginCapability.LLM_REQUIRED
+        return caps
+
+    @property
+    def description(self) -> str:
+        return "Qualifies ORG entities using embedding similarity search with optional LLM confirmation"
+
+    @property
+    def supported_entity_types(self) -> set[EntityType]:
+        return {EntityType.ORG}
+
+    @property
+    def supported_identifier_types(self) -> list[str]:
+        return ["lei", "sec_cik", "ch_number"]
+
+    @property
+    def provided_identifier_types(self) -> list[str]:
+        return ["lei", "sec_cik", "ch_number", "canonical_id"]
+
+    def _get_database(self):
+        """Get or initialize the company database."""
+        if self._database is not None:
+            return self._database
+
+        from ...database.store import get_database
+        from ...database.hub import get_database_path
+
+        # Find database path
+        db_path = self._db_path
+        if db_path is None:
+            db_path = get_database_path(auto_download=self._auto_download_db)
+
+        if db_path is None:
+            logger.warning("Company database not available. Skipping embedding qualification.")
+            return None
+
+        # Use singleton to ensure index is only loaded once
+        self._database = get_database(db_path=db_path)
+        logger.info(f"Loaded company database from {db_path}")
+        return self._database
+
+    def _get_embedder(self):
+        """Get or initialize the embedder."""
+        if self._embedder is not None:
+            return self._embedder
+
+        from ...database import CompanyEmbedder
+        self._embedder = CompanyEmbedder()
+        return self._embedder
+
+    def _get_llm(self):
+        """Get or initialize the LLM for confirmation."""
+        if self._llm is not None:
+            return self._llm
+
+        if not self._use_llm_confirmation:
+            return None
+
+        try:
+            from ...llm import get_llm
+            self._llm = get_llm()
+            return self._llm
+        except Exception as e:
+            logger.warning(f"LLM not available for confirmation: {e}")
+            return None
+
+    def qualify(
+        self,
+        entity: ExtractedEntity,
+        context: PipelineContext,
+    ) -> Optional[CanonicalEntity]:
+        """
+        Qualify an ORG entity using embedding similarity.
+
+        Args:
+            entity: The ORG entity to qualify
+            context: Pipeline context
+
+        Returns:
+            CanonicalEntity with qualifiers, FQN, and canonical match, or None if no match
+        """
+        if entity.type != EntityType.ORG:
+            return None
+
+        # Check cache
+        cache_key = entity.text.lower().strip()
+        if cache_key in self._cache:
+            return self._cache[cache_key]
+
+        # Get database
+        database = self._get_database()
+        if database is None:
+            return None
+
+        # Get embedder
+        embedder = self._get_embedder()
+
+        # Embed query name
+        logger.debug(f" Embedding query: '{entity.text}'")
+        query_embedding = embedder.embed(entity.text)
+
+        # Search for similar companies using hybrid text + vector search
+        logger.debug(f" Searching database for similar companies...")
+        results = database.search(
+            query_embedding,
+            top_k=self._top_k,
+            query_text=entity.text,  # Enable text-based pre-filtering
+        )
+
+        # Filter by minimum similarity
+        results = [(r, s) for r, s in results if s >= self._min_similarity]
+
+        if not results:
+            logger.debug(f" No matches found above threshold {self._min_similarity}")
+            self._cache[cache_key] = None
+            return None
+
+        # Log all candidates
+        logger.info(f" Found {len(results)} candidates for '{entity.text}':")
+        for i, (record, sim) in enumerate(results[:10], 1):
+            region_str = f" [{record.region}]" if record.region else ""
+            logger.info(f" {i}. {record.name}{region_str} (sim={sim:.3f}, source={record.source})")
+
+        # Get best match (optionally with LLM confirmation)
+        logger.info(f" Selecting best match (LLM={self._use_llm_confirmation})...")
+        best_match = self._select_best_match(entity.text, results, context)
+
+        if best_match is None:
+            logger.info(f" No confident match for '{entity.text}'")
+            self._cache[cache_key] = None
+            return None
+
+        record, similarity = best_match
+        logger.info(f" Matched: '{record.name}' (source={record.source}, similarity={similarity:.3f})")
+
+        # Build CanonicalEntity from matched record
+        canonical = self._build_canonical_entity(entity, record, similarity)
+
+        self._cache[cache_key] = canonical
+        return canonical
+
+    def _select_best_match(
+        self,
+        query_name: str,
+        candidates: list[tuple],
+        context: "PipelineContext",
+    ) -> Optional[tuple]:
+        """
+        Select the best match from candidates.
+
+        Uses LLM if available and configured, otherwise returns top match.
+        """
+        if not candidates:
+            return None
+
+        # If only one strong match, use it directly
+        if len(candidates) == 1 and candidates[0][1] >= 0.9:
+            logger.info(f" Single strong match: '{candidates[0][0].name}' (sim={candidates[0][1]:.3f})")
+            return candidates[0]
+
+        # Try LLM confirmation
+        llm = self._get_llm()
+        if llm is not None:
+            try:
+                return self._llm_select_match(query_name, candidates, context)
+            except Exception as e:
+                logger.warning(f" LLM confirmation failed: {e}")
+
+        # Fallback: use top match if similarity is high enough
+        top_record, top_similarity = candidates[0]
+        if top_similarity >= 0.85:
+            logger.info(f" No LLM, using top match: '{top_record.name}' (sim={top_similarity:.3f})")
+            return candidates[0]
+
+        logger.info(f" No confident match for '{query_name}' (top sim={top_similarity:.3f} < 0.85)")
+        return None
+
+    def _llm_select_match(
+        self,
+        query_name: str,
+        candidates: list[tuple],
+        context: "PipelineContext",
+    ) -> Optional[tuple]:
+        """Use LLM to select the best match."""
+        # Format candidates for prompt with region info
+        candidate_lines = []
+        for i, (record, similarity) in enumerate(candidates[:10], 1):  # Limit to top 10
+            region_str = f", region: {record.region}" if record.region else ""
+            candidate_lines.append(
+                f"{i}. {record.name} (source: {record.source}{region_str}, similarity: {similarity:.3f})"
+            )
+
+        # Build context line from source text if available
+        context_line = ""
+        if context.source_text:
+            # Truncate source text for prompt
+            source_preview = context.source_text[:500] + "..." if len(context.source_text) > 500 else context.source_text
+            context_line = f"Source text context: \"{source_preview}\"\n"
+
+        prompt = COMPANY_MATCH_PROMPT.format(
+            query_name=query_name,
+            context_line=context_line,
+            candidates="\n".join(candidate_lines),
+        )
+
+        # Get LLM response
+        response = self._llm.generate(prompt, max_tokens=10, stop=["\n"])
+        response = response.strip()
+
+        logger.info(f" LLM response for '{query_name}': {response}")
+
+        # Parse response
+        if response.upper() == "NONE":
+            logger.info(f" LLM chose: NONE (no match)")
+            return None
+
+        try:
+            idx = int(response) - 1
+            if 0 <= idx < len(candidates):
+                chosen = candidates[idx]
+                logger.info(f" LLM chose: #{idx + 1} '{chosen[0].name}' (sim={chosen[1]:.3f})")
+                return chosen
+        except ValueError:
+            logger.warning(f" LLM response '{response}' could not be parsed as number")

+        # Fallback to top match if LLM response is unclear
+        if candidates[0][1] >= 0.8:
+            logger.info(f" Fallback to top match: '{candidates[0][0].name}' (sim={candidates[0][1]:.3f})")
+            return candidates[0]
+
+        logger.info(f" No confident match (top sim={candidates[0][1]:.3f} < 0.8)")
+        return None
+
+    def _build_canonical_entity(
+        self,
+        entity: ExtractedEntity,
+        record,
+        similarity: float,
+    ) -> CanonicalEntity:
+        """Build CanonicalEntity from a matched company record."""
+        # Map source names to identifier prefixes
+        source = record.source
+        source_id = record.source_id
+        source_prefix_map = {
+            "gleif": "LEI",
+            "sec_edgar": "SEC-CIK",
+            "companies_house": "UK-CH",
+            "wikidata": "WIKIDATA",
+        }
+        source_prefix = source_prefix_map.get(source, source.upper())
+
+        # Build identifiers dict
+        identifiers = {
+            "source": source_prefix,
+            "source_id": source_id,
+            "canonical_id": f"{source_prefix}:{source_id}",
+        }
+
+        # Add source-specific identifiers for compatibility
+        if source == "gleif":
+            identifiers["lei"] = source_id
+        elif source == "sec_edgar":
+            identifiers["sec_cik"] = source_id
+            if record.record.get("ticker"):
+                identifiers["ticker"] = record.record["ticker"]
+        elif source == "companies_house":
+            identifiers["ch_number"] = source_id
+
+        # Extract location info from record
+        record_data = record.record
+        jurisdiction = record_data.get("jurisdiction")
+        country = record_data.get("country")
+        city = record_data.get("city")
+        region = record.region  # From CompanyRecord
+
+        # Build qualifiers
+        qualifiers = EntityQualifiers(
+            legal_name=record.name,
+            region=region,
+            jurisdiction=jurisdiction,
+            country=country,
+            city=city,
+            identifiers=identifiers,
+        )
+
+        # Create QualifiedEntity
+        qualified = QualifiedEntity(
+            entity_ref=entity.entity_ref,
+            original_text=entity.text,
+            entity_type=entity.type,
+            qualifiers=qualifiers,
+            qualification_sources=[self.name],
+        )
+
+        # Build FQN: "LEGAL_NAME (SOURCE,REGION)"
+        fqn_parts = [source_prefix]
+        if region:
+            fqn_parts.append(region)
+        fqn = f"{record.name} ({','.join(fqn_parts)})"
+
+        # Create canonical match (clamp confidence to [0, 1] for float precision)
+        clamped_confidence = min(max(similarity, 0.0), 1.0)
+        canonical_match = CanonicalMatch(
+            canonical_id=f"{source_prefix}:{source_id}",
+            canonical_name=record.name,
+            match_method="embedding",
+            match_confidence=clamped_confidence,
+            match_details={"source": source, "similarity": similarity},
+        )
+
+        return CanonicalEntity(
+            entity_ref=entity.entity_ref,
+            qualified_entity=qualified,
+            canonical_match=canonical_match,
+            fqn=fqn,
+        )
+
+
+# For testing without decorator
+EmbeddingCompanyQualifierClass = EmbeddingCompanyQualifier
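
The snippet below is a minimal usage sketch of the new qualifier, based only on the constructor and qualify() signatures visible in this hunk; the keyword arguments passed to ExtractedEntity and PipelineContext are assumptions inferred from attribute access in the code above, not confirmed constructor parameters.

# Hypothetical usage sketch; ExtractedEntity/PipelineContext field names are assumed.
from statement_extractor.models import ExtractedEntity, EntityType
from statement_extractor.pipeline.context import PipelineContext
from statement_extractor.plugins.qualifiers.embedding_company import EmbeddingCompanyQualifier

qualifier = EmbeddingCompanyQualifier(
    top_k=20,                   # number of embedding candidates to retrieve
    min_similarity=0.5,         # candidates below this similarity are discarded
    use_llm_confirmation=True,  # ask the local LLM to confirm the best candidate
    auto_download_db=True,      # pull the company database from HuggingFace if missing
)

entity = ExtractedEntity(entity_ref="e1", text="Apple", type=EntityType.ORG)  # assumed fields
context = PipelineContext(source_text="Apple reported record quarterly revenue.")  # assumed field

canonical = qualifier.qualify(entity, context)
if canonical is not None:
    # FQN is built as "<legal name> (<source prefix>,<region>)" by _build_canonical_entity
    print(canonical.fqn, canonical.canonical_match.match_confidence)
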
statement_extractor/plugins/qualifiers/gleif.py
@@ -0,0 +1,197 @@
+"""
+GLEIFQualifierPlugin - Qualifies ORG entities with LEI and related data.
+
+DEPRECATED: Use EmbeddingCompanyQualifier instead, which uses a local
+embedding database with pre-loaded GLEIF data for faster, offline matching.
+
+Uses the GLEIF (Global Legal Entity Identifier Foundation) API to:
+- Look up LEI by organization name
+- Retrieve legal name, jurisdiction, parent company info
+"""
+
+import logging
+import warnings
+from typing import Optional
+from urllib.parse import quote
+
+from ..base import BaseQualifierPlugin, PluginCapability
+from ...pipeline.context import PipelineContext
+from ...models import ExtractedEntity, EntityQualifiers, EntityType
+
+logger = logging.getLogger(__name__)
+
+# GLEIF API base URL
+GLEIF_API_BASE = "https://api.gleif.org/api/v1"
+
+
+# DEPRECATED: Not auto-registered. Use EmbeddingCompanyQualifier instead.
+class GLEIFQualifierPlugin(BaseQualifierPlugin):
+    """
+    DEPRECATED: Use EmbeddingCompanyQualifier instead.
+
+    Qualifier plugin for ORG entities using GLEIF API.
+    Looks up Legal Entity Identifiers (LEI) and related corporate data.
+    """
+
+    def __init__(
+        self,
+        timeout: int = 10,
+        cache_results: bool = True,
+    ):
+        """
+        Initialize the GLEIF qualifier.
+
+        DEPRECATED: Use EmbeddingCompanyQualifier instead.
+
+        Args:
+            timeout: API request timeout in seconds
+            cache_results: Whether to cache API results
+        """
+        warnings.warn(
+            "GLEIFQualifierPlugin is deprecated. Use EmbeddingCompanyQualifier instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        self._timeout = timeout
+        self._cache_results = cache_results
+        self._cache: dict[str, Optional[dict]] = {}
+
+    @property
+    def name(self) -> str:
+        return "gleif_qualifier"
+
+    @property
+    def priority(self) -> int:
+        return 10  # High priority for ORG entities
+
+    @property
+    def capabilities(self) -> PluginCapability:
+        return PluginCapability.EXTERNAL_API | PluginCapability.CACHING
+
+    @property
+    def description(self) -> str:
+        return "Looks up LEI and corporate data from GLEIF API"
+
+    @property
+    def supported_entity_types(self) -> set[EntityType]:
+        return {EntityType.ORG}
+
+    @property
+    def supported_identifier_types(self) -> list[str]:
+        return ["lei"]  # Can lookup by existing LEI
+
+    @property
+    def provided_identifier_types(self) -> list[str]:
+        return ["lei"]  # Provides LEI
+
+    def qualify(
+        self,
+        entity: ExtractedEntity,
+        context: PipelineContext,
+    ) -> Optional[EntityQualifiers]:
+        """
+        Qualify an ORG entity with GLEIF data.
+
+        Args:
+            entity: The ORG entity to qualify
+            context: Pipeline context
+
+        Returns:
+            EntityQualifiers with LEI and jurisdiction, or None if not found
+        """
+        if entity.type != EntityType.ORG:
+            return None
+
+        # Check cache first
+        cache_key = entity.text.lower().strip()
+        if self._cache_results and cache_key in self._cache:
+            cached = self._cache[cache_key]
+            if cached is None:
+                return None
+            return self._data_to_qualifiers(cached)
+
+        # Search GLEIF API
+        result = self._search_gleif(entity.text)
+
+        # Cache result
+        if self._cache_results:
+            self._cache[cache_key] = result
+
+        if result:
+            return self._data_to_qualifiers(result)
+
+        return None
+
+    def _search_gleif(self, org_name: str) -> Optional[dict]:
+        """Search GLEIF API for organization."""
+        try:
+            import requests
+
+            # Fuzzy name search
+            url = f"{GLEIF_API_BASE}/lei-records"
+            params = {
+                "filter[entity.legalName]": org_name,
+                "page[size]": 5,
+            }
+
+            response = requests.get(url, params=params, timeout=self._timeout)
+            response.raise_for_status()
+            data = response.json()
+
+            records = data.get("data", [])
+            if not records:
+                # Try fulltext search as fallback
+                params = {
+                    "filter[fulltext]": org_name,
+                    "page[size]": 5,
+                }
+                response = requests.get(url, params=params, timeout=self._timeout)
+                response.raise_for_status()
+                data = response.json()
+                records = data.get("data", [])
+
+            if records:
+                # Return first match
+                record = records[0]
+                return self._parse_lei_record(record)
+
+        except ImportError:
+            logger.warning("requests library not available for GLEIF API")
+        except Exception as e:
+            logger.debug(f"GLEIF API error: {e}")
+
+        return None
+
+    def _parse_lei_record(self, record: dict) -> dict:
+        """Parse a GLEIF LEI record into a simplified dict."""
+        attrs = record.get("attributes", {})
+        entity = attrs.get("entity", {})
+        legal_name = entity.get("legalName", {}).get("name", "")
+        legal_address = entity.get("legalAddress", {})
+        jurisdiction = entity.get("jurisdiction", "")
+
+        return {
+            "lei": record.get("id", ""),
+            "legal_name": legal_name,
+            "jurisdiction": jurisdiction,
+            "country": legal_address.get("country", ""),
+            "city": legal_address.get("city", ""),
+            "status": attrs.get("registration", {}).get("status", ""),
+        }
+
+    def _data_to_qualifiers(self, data: dict) -> EntityQualifiers:
+        """Convert GLEIF data to EntityQualifiers."""
+        identifiers = {}
+        if data.get("lei"):
+            identifiers["lei"] = data["lei"]
+
+        return EntityQualifiers(
+            jurisdiction=data.get("jurisdiction"),
+            country=data.get("country"),
+            city=data.get("city"),
+            identifiers=identifiers,
+        )
+
+
+# Allow importing without decorator for testing
+GLEIFQualifierPluginClass = GLEIFQualifierPlugin
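
And a short sketch of the deprecation path introduced above: constructing GLEIFQualifierPlugin now emits a DeprecationWarning pointing at EmbeddingCompanyQualifier (the warnings filter call is standard library usage, not part of this package).

import warnings

from statement_extractor.plugins.qualifiers.gleif import GLEIFQualifierPlugin

# DeprecationWarning is often suppressed by default; opt in to see the notice.
warnings.simplefilter("always", DeprecationWarning)

plugin = GLEIFQualifierPlugin(timeout=10, cache_results=True)
# -> DeprecationWarning: GLEIFQualifierPlugin is deprecated. Use EmbeddingCompanyQualifier instead.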