corp-extractor 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. {corp_extractor-0.3.0.dist-info → corp_extractor-0.5.0.dist-info}/METADATA +235 -96
  2. corp_extractor-0.5.0.dist-info/RECORD +55 -0
  3. statement_extractor/__init__.py +9 -0
  4. statement_extractor/cli.py +460 -21
  5. statement_extractor/data/default_predicates.json +368 -0
  6. statement_extractor/data/statement_taxonomy.json +1182 -0
  7. statement_extractor/extractor.py +32 -47
  8. statement_extractor/gliner_extraction.py +218 -0
  9. statement_extractor/llm.py +255 -0
  10. statement_extractor/models/__init__.py +74 -0
  11. statement_extractor/models/canonical.py +139 -0
  12. statement_extractor/models/entity.py +102 -0
  13. statement_extractor/models/labels.py +191 -0
  14. statement_extractor/models/qualifiers.py +91 -0
  15. statement_extractor/models/statement.py +75 -0
  16. statement_extractor/models.py +15 -6
  17. statement_extractor/pipeline/__init__.py +39 -0
  18. statement_extractor/pipeline/config.py +134 -0
  19. statement_extractor/pipeline/context.py +177 -0
  20. statement_extractor/pipeline/orchestrator.py +447 -0
  21. statement_extractor/pipeline/registry.py +297 -0
  22. statement_extractor/plugins/__init__.py +43 -0
  23. statement_extractor/plugins/base.py +446 -0
  24. statement_extractor/plugins/canonicalizers/__init__.py +17 -0
  25. statement_extractor/plugins/canonicalizers/base.py +9 -0
  26. statement_extractor/plugins/canonicalizers/location.py +219 -0
  27. statement_extractor/plugins/canonicalizers/organization.py +230 -0
  28. statement_extractor/plugins/canonicalizers/person.py +242 -0
  29. statement_extractor/plugins/extractors/__init__.py +13 -0
  30. statement_extractor/plugins/extractors/base.py +9 -0
  31. statement_extractor/plugins/extractors/gliner2.py +536 -0
  32. statement_extractor/plugins/labelers/__init__.py +29 -0
  33. statement_extractor/plugins/labelers/base.py +9 -0
  34. statement_extractor/plugins/labelers/confidence.py +138 -0
  35. statement_extractor/plugins/labelers/relation_type.py +87 -0
  36. statement_extractor/plugins/labelers/sentiment.py +159 -0
  37. statement_extractor/plugins/labelers/taxonomy.py +373 -0
  38. statement_extractor/plugins/labelers/taxonomy_embedding.py +466 -0
  39. statement_extractor/plugins/qualifiers/__init__.py +19 -0
  40. statement_extractor/plugins/qualifiers/base.py +9 -0
  41. statement_extractor/plugins/qualifiers/companies_house.py +174 -0
  42. statement_extractor/plugins/qualifiers/gleif.py +186 -0
  43. statement_extractor/plugins/qualifiers/person.py +221 -0
  44. statement_extractor/plugins/qualifiers/sec_edgar.py +198 -0
  45. statement_extractor/plugins/splitters/__init__.py +13 -0
  46. statement_extractor/plugins/splitters/base.py +9 -0
  47. statement_extractor/plugins/splitters/t5_gemma.py +188 -0
  48. statement_extractor/plugins/taxonomy/__init__.py +13 -0
  49. statement_extractor/plugins/taxonomy/embedding.py +337 -0
  50. statement_extractor/plugins/taxonomy/mnli.py +279 -0
  51. statement_extractor/scoring.py +17 -69
  52. corp_extractor-0.3.0.dist-info/RECORD +0 -12
  53. statement_extractor/spacy_extraction.py +0 -386
  54. {corp_extractor-0.3.0.dist-info → corp_extractor-0.5.0.dist-info}/WHEEL +0 -0
  55. {corp_extractor-0.3.0.dist-info → corp_extractor-0.5.0.dist-info}/entry_points.txt +0 -0
statement_extractor/plugins/labelers/taxonomy_embedding.py
@@ -0,0 +1,466 @@
+ """
+ EmbeddingTaxonomyLabeler - Classifies statements using embedding similarity.
+
+ Uses sentence-transformers to embed text and compare to pre-computed label
+ embeddings using cosine similarity with sigmoid calibration.
+
+ This is faster than MNLI but may be less accurate for nuanced classification.
+ """
+
+ import json
+ import logging
+ import time
+ from pathlib import Path
+ from typing import Optional
+
+ import numpy as np
+
+ from ..base import BaseLabelerPlugin, TaxonomySchema, PluginCapability
+ from ...pipeline.context import PipelineContext
+ from ...models import (
+     PipelineStatement,
+     CanonicalEntity,
+     StatementLabel,
+ )
+
+ logger = logging.getLogger(__name__)
+
+ # Default taxonomy file location (relative to this module)
+ DEFAULT_TAXONOMY_PATH = Path(__file__).parent.parent.parent / "data" / "statement_taxonomy.json"
+
+ # Default categories to use
+ DEFAULT_CATEGORIES = [
+     "environment",
+     "society",
+     "governance",
+     "animals",
+     "industry",
+     "human_harm",
+     "human_benefit",
+     "animal_harm",
+     "animal_benefit",
+     "environment_harm",
+     "environment_benefit",
+ ]
+
+
+ class EmbeddingClassifier:
+     """
+     Embedding-based classifier using cosine similarity.
+
+     Pre-computes embeddings for all labels and uses dot product
+     (on normalized vectors) for fast classification.
+     """
+
+     # Calibration parameters to spread out cosine similarity scores
+     SIMILARITY_THRESHOLD = 0.65
+     CALIBRATION_STEEPNESS = 25.0
+
+     def __init__(
+         self,
+         model_name: str = "all-MiniLM-L6-v2",
+         device: Optional[str] = None,
+     ):
+         """
+         Initialize the classifier.
+
+         Args:
+             model_name: sentence-transformers model ID
+             device: Device to use ('cuda', 'mps', 'cpu', or None for auto)
+         """
+         self._model_name = model_name
+         self._device = device
+         self._model = None
+
+         # Pre-computed label embeddings: {category: {label: embedding}}
+         self._label_embeddings: dict[str, dict[str, np.ndarray]] = {}
+
+     def _load_model(self):
+         """Lazy-load the embedding model."""
+         if self._model is not None:
+             return
+
+         try:
+             from sentence_transformers import SentenceTransformer
+             import torch
+
+             # Auto-detect device
+             device = self._device
+             if device is None:
+                 if torch.cuda.is_available():
+                     device = "cuda"
+                 elif torch.backends.mps.is_available():
+                     device = "mps"
+                 else:
+                     device = "cpu"
+
+             logger.info(f"Loading embedding model '{self._model_name}' on {device}...")
+             self._model = SentenceTransformer(self._model_name, device=device)
+             logger.debug("Embedding model loaded")
+
+         except ImportError as e:
+             raise ImportError(
+                 "sentence-transformers is required for embedding classification. "
+                 "Install with: pip install sentence-transformers"
+             ) from e
+
+     def precompute_label_embeddings(
+         self,
+         taxonomy: dict[str, dict[str, int]],
+         categories: Optional[list[str]] = None,
+     ) -> None:
+         """
+         Pre-compute embeddings for all label names.
+
+         Args:
+             taxonomy: Taxonomy dict {category: {label: id, ...}, ...}
+             categories: Categories to include (default: all)
+         """
+         self._load_model()
+
+         start_time = time.perf_counter()
+         total_labels = 0
+
+         categories_to_process = categories or list(taxonomy.keys())
+
+         for category in categories_to_process:
+             if category not in taxonomy:
+                 continue
+
+             labels = taxonomy[category]
+             label_names = list(labels.keys())
+
+             if not label_names:
+                 continue
+
+             # Batch embed all labels in this category
+             embeddings = self._model.encode(label_names, convert_to_numpy=True)
+
+             # Normalize and store
+             self._label_embeddings[category] = {}
+             for label_name, embedding in zip(label_names, embeddings):
+                 norm = np.linalg.norm(embedding)
+                 normalized = embedding / (norm + 1e-8)
+                 self._label_embeddings[category][label_name] = normalized.astype(np.float32)
+                 total_labels += 1
+
+         elapsed = time.perf_counter() - start_time
+         logger.info(
+             f"Pre-computed embeddings for {total_labels} labels "
+             f"across {len(self._label_embeddings)} categories in {elapsed:.2f}s"
+         )
+
+     def _calibrate_score(self, raw_similarity: float) -> float:
+         """
+         Apply sigmoid calibration to amplify score differences.
+
+         Cosine similarities cluster in a narrow range (0.5-0.9).
+         This transformation spreads them out for better discrimination.
+         """
+         # Normalize from [-1, 1] to [0, 1]
+         normalized = (raw_similarity + 1) / 2
+
+         # Apply sigmoid transformation
+         exponent = -self.CALIBRATION_STEEPNESS * (normalized - self.SIMILARITY_THRESHOLD)
+         return 1.0 / (1.0 + np.exp(exponent))
+
+     def classify(
+         self,
+         text: str,
+         categories: Optional[list[str]] = None,
+         top_k: int = 5,
+         min_score: float = 0.3,
+     ) -> list[tuple[str, str, float]]:
+         """
+         Classify text against pre-computed label embeddings.
+
+         Args:
+             text: Text to classify
+             categories: Categories to search (default: all pre-computed)
+             top_k: Number of top results to return
+             min_score: Minimum calibrated score threshold
+
+         Returns:
+             List of (category, label, score) tuples, sorted by score descending
+         """
+         self._load_model()
+
+         if not self._label_embeddings:
+             raise RuntimeError("Label embeddings not pre-computed. Call precompute_label_embeddings first.")
+
+         # Embed input text
+         input_embedding = self._model.encode(text, convert_to_numpy=True)
+         input_norm = np.linalg.norm(input_embedding)
+         input_normalized = input_embedding / (input_norm + 1e-8)
+
+         # Classify against each category
+         categories_to_process = categories or list(self._label_embeddings.keys())
+         all_results: list[tuple[str, str, float]] = []
+
+         for category in categories_to_process:
+             if category not in self._label_embeddings:
+                 continue
+
+             for label, label_embedding in self._label_embeddings[category].items():
+                 # Cosine similarity (both vectors are normalized)
+                 raw_sim = float(np.dot(input_normalized, label_embedding))
+                 calibrated_score = self._calibrate_score(raw_sim)
+
+                 if calibrated_score >= min_score:
+                     all_results.append((category, label, calibrated_score))
+
+         # Sort by score descending and return top-k
+         all_results.sort(key=lambda x: x[2], reverse=True)
+         return all_results[:top_k]
+
+     def classify_hierarchical(
+         self,
+         text: str,
+         top_k_categories: int = 3,
+         top_k_labels: int = 3,
+         min_score: float = 0.3,
+     ) -> tuple[Optional[str], Optional[str], float]:
+         """
+         Hierarchical classification: find best category, then best label.
+
+         More efficient for very large taxonomies.
+
+         Args:
+             text: Text to classify
+             top_k_categories: Number of top categories to consider
+             top_k_labels: Number of labels per category to consider
+             min_score: Minimum score threshold
+
+         Returns:
+             Tuple of (category, label, score) for best match
+         """
+         self._load_model()
+
+         if not self._label_embeddings:
+             raise RuntimeError("Label embeddings not pre-computed.")
+
+         # Embed input text
+         input_embedding = self._model.encode(text, convert_to_numpy=True)
+         input_norm = np.linalg.norm(input_embedding)
+         input_normalized = input_embedding / (input_norm + 1e-8)
+
+         # First, compute average similarity to each category
+         category_scores: list[tuple[str, float]] = []
+         for category, labels in self._label_embeddings.items():
+             if not labels:
+                 continue
+
+             # Average similarity to all labels in category
+             sims = []
+             for label_embedding in labels.values():
+                 sim = float(np.dot(input_normalized, label_embedding))
+                 sims.append(sim)
+
+             avg_sim = np.mean(sims)
+             category_scores.append((category, avg_sim))
+
+         # Sort categories by average similarity
+         category_scores.sort(key=lambda x: x[1], reverse=True)
+
+         # Find best label within top categories
+         best_result = (None, None, 0.0)
+
+         for category, _ in category_scores[:top_k_categories]:
+             for label, label_embedding in self._label_embeddings[category].items():
+                 raw_sim = float(np.dot(input_normalized, label_embedding))
+                 calibrated_score = self._calibrate_score(raw_sim)
+
+                 if calibrated_score > best_result[2]:
+                     best_result = (category, label, calibrated_score)
+
+         if best_result[0] and best_result[2] >= min_score:
+             return best_result
+
+         return (None, None, 0.0)
+
+
+ class EmbeddingTaxonomyLabeler(BaseLabelerPlugin):
+     """
+     Labeler that classifies statements using embedding similarity.
+
+     Faster than MNLI but may be less accurate for nuanced classification.
+     Good for high-throughput scenarios.
+     """
+
+     def __init__(
+         self,
+         taxonomy_path: Optional[str | Path] = None,
+         categories: Optional[list[str]] = None,
+         model_name: str = "all-MiniLM-L6-v2",
+         use_hierarchical: bool = True,
+         top_k_categories: int = 3,
+         min_confidence: float = 0.3,
+     ):
+         """
+         Initialize the embedding taxonomy labeler.
+
+         Args:
+             taxonomy_path: Path to taxonomy JSON file (default: built-in taxonomy)
+             categories: List of categories to use (default: all categories)
+             model_name: sentence-transformers model ID
+             use_hierarchical: Use hierarchical classification for efficiency
+             top_k_categories: Number of top categories to consider in hierarchical mode
+             min_confidence: Minimum confidence threshold for returning a label
+         """
+         self._taxonomy_path = Path(taxonomy_path) if taxonomy_path else DEFAULT_TAXONOMY_PATH
+         self._categories = categories or DEFAULT_CATEGORIES
+         self._model_name = model_name
+         self._use_hierarchical = use_hierarchical
+         self._top_k_categories = top_k_categories
+         self._min_confidence = min_confidence
+
+         self._taxonomy: Optional[dict[str, dict[str, int]]] = None
+         self._classifier: Optional[EmbeddingClassifier] = None
+         self._embeddings_computed = False
+
+     @property
+     def name(self) -> str:
+         return "embedding_taxonomy_labeler"
+
+     @property
+     def priority(self) -> int:
+         return 45  # Higher priority than MNLI - default taxonomy labeler (faster)
+
+     @property
+     def capabilities(self) -> PluginCapability:
+         return PluginCapability.LLM_REQUIRED | PluginCapability.BATCH_PROCESSING
+
+     @property
+     def description(self) -> str:
+         return "Classifies statements using embedding similarity (faster than MNLI)"
+
+     @property
+     def label_type(self) -> str:
+         return "taxonomy_embedding"
+
+     @property
+     def taxonomy_schema(self) -> TaxonomySchema:
+         """Provide taxonomy schema (for documentation/introspection)."""
+         taxonomy = self._load_taxonomy()
+         filtered = {cat: list(labels.keys()) for cat, labels in taxonomy.items() if cat in self._categories}
+         return TaxonomySchema(
+             label_type=self.label_type,
+             values=filtered,
+             description="Statement topic classification using embedding similarity",
+             scope="statement",
+         )
+
+     def _load_taxonomy(self) -> dict[str, dict[str, int]]:
+         """Load taxonomy from JSON file."""
+         if self._taxonomy is not None:
+             return self._taxonomy
+
+         if not self._taxonomy_path.exists():
+             raise FileNotFoundError(f"Taxonomy file not found: {self._taxonomy_path}")
+
+         with open(self._taxonomy_path) as f:
+             self._taxonomy = json.load(f)
+
+         logger.debug(f"Loaded taxonomy with {len(self._taxonomy)} categories")
+         return self._taxonomy
+
+     def _get_classifier(self) -> EmbeddingClassifier:
+         """Get or create the embedding classifier."""
+         if self._classifier is None:
+             self._classifier = EmbeddingClassifier(model_name=self._model_name)
+
+         if not self._embeddings_computed:
+             taxonomy = self._load_taxonomy()
+             self._classifier.precompute_label_embeddings(taxonomy, self._categories)
+             self._embeddings_computed = True
+
+         return self._classifier
+
+     def label(
+         self,
+         statement: PipelineStatement,
+         subject_canonical: CanonicalEntity,
+         object_canonical: CanonicalEntity,
+         context: PipelineContext,
+     ) -> Optional[StatementLabel]:
+         """
+         Classify statement using embedding similarity.
+
+         Args:
+             statement: The statement to label
+             subject_canonical: Canonicalized subject
+             object_canonical: Canonicalized object
+             context: Pipeline context
+
+         Returns:
+             StatementLabel with taxonomy classification, or None if below threshold
+         """
+         # Check for pre-computed classification
+         result = context.get_classification(statement.source_text, self.label_type)
+         if result:
+             label_value, confidence = result
+             if confidence >= self._min_confidence:
+                 return StatementLabel(
+                     label_type=self.label_type,
+                     label_value=label_value,
+                     confidence=confidence,
+                     labeler=self.name,
+                 )
+             return None
+
+         # Run embedding classification
+         try:
+             classifier = self._get_classifier()
+             text = statement.source_text
+
+             if self._use_hierarchical:
+                 category, label, confidence = classifier.classify_hierarchical(
+                     text,
+                     top_k_categories=self._top_k_categories,
+                     min_score=self._min_confidence,
+                 )
+                 if category and label:
+                     full_label = f"{category}:{label}"
+                 else:
+                     return None
+             else:
+                 results = classifier.classify(
+                     text,
+                     top_k=1,
+                     min_score=self._min_confidence,
+                 )
+                 if results:
+                     category, label, confidence = results[0]
+                     full_label = f"{category}:{label}"
+                 else:
+                     return None
+
+             # Get the numeric ID for reproducibility
+             label_id = self._get_label_id(category, label)
+
+             return StatementLabel(
+                 label_type=self.label_type,
+                 label_value=full_label,
+                 confidence=round(confidence, 4),
+                 labeler=self.name,
+                 metadata={"label_id": label_id, "category": category},
+             )
+
+         except Exception as e:
+             logger.warning(f"Embedding taxonomy classification failed: {e}")
+
+         return None
+
+     def _get_label_id(self, category: str, label: str) -> Optional[int]:
+         """Get the numeric ID for a label."""
+         taxonomy = self._load_taxonomy()
+
+         if category in taxonomy:
+             return taxonomy[category].get(label)
+
+         return None
+
+
+ # Allow importing without decorator for testing
+ EmbeddingTaxonomyLabelerClass = EmbeddingTaxonomyLabeler
+ EmbeddingClassifierClass = EmbeddingClassifier
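
For intuition on the calibration in _calibrate_score above: a raw cosine similarity s is mapped to (s + 1) / 2, then pushed through a sigmoid centered on SIMILARITY_THRESHOLD = 0.65 with steepness 25, so scores a little above the center saturate towards 1 and scores a little below collapse towards 0. A minimal sketch of the same arithmetic, using only numpy and the constants copied from the class:

    import numpy as np

    def calibrate(raw_similarity: float, threshold: float = 0.65, steepness: float = 25.0) -> float:
        # Normalize from [-1, 1] to [0, 1], then apply the sigmoid
        normalized = (raw_similarity + 1) / 2
        return float(1.0 / (1.0 + np.exp(-steepness * (normalized - threshold))))

    print(calibrate(0.5))  # normalized 0.75 -> ~0.92
    print(calibrate(0.2))  # normalized 0.60 -> ~0.22

Two raw similarities 0.3 apart end up roughly 0.7 apart after calibration, which is the spreading the docstring describes.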
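
A usage sketch for the classifier itself, assuming sentence-transformers is installed; the toy taxonomy and example sentences are illustrative, not taken from the bundled statement_taxonomy.json:

    from statement_extractor.plugins.labelers.taxonomy_embedding import EmbeddingClassifier

    # Toy taxonomy in the {category: {label: id}} shape the class expects;
    # the labels and ids here are made up for illustration.
    toy_taxonomy = {
        "environment": {"deforestation": 101, "carbon emissions": 102},
        "governance": {"board independence": 201, "executive pay": 202},
    }

    clf = EmbeddingClassifier()  # all-MiniLM-L6-v2 is fetched on first use
    clf.precompute_label_embeddings(toy_taxonomy)

    # Flat mode: (category, label, calibrated_score) tuples, best first
    print(clf.classify("The firm cleared 400 hectares of rainforest", top_k=2))

    # Hierarchical mode: single best (category, label, score),
    # or (None, None, 0.0) if nothing clears min_score
    print(clf.classify_hierarchical("Directors awarded themselves a 40% raise"))

Exact scores depend on the embedding model, so treat the calibrated values as rankings rather than probabilities.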
statement_extractor/plugins/qualifiers/__init__.py
@@ -0,0 +1,19 @@
+ """
+ Qualifier plugins for Stage 3 (Qualification).
+
+ Adds qualifiers and identifiers to entities.
+ """
+
+ from .base import BaseQualifierPlugin
+ from .person import PersonQualifierPlugin
+ from .gleif import GLEIFQualifierPlugin
+ from .companies_house import CompaniesHouseQualifierPlugin
+ from .sec_edgar import SECEdgarQualifierPlugin
+
+ __all__ = [
+     "BaseQualifierPlugin",
+     "PersonQualifierPlugin",
+     "GLEIFQualifierPlugin",
+     "CompaniesHouseQualifierPlugin",
+     "SECEdgarQualifierPlugin",
+ ]
statement_extractor/plugins/qualifiers/base.py
@@ -0,0 +1,9 @@
+ """
+ Base class for qualifier plugins.
+
+ Re-exports BaseQualifierPlugin from the main plugins module.
+ """
+
+ from ..base import BaseQualifierPlugin
+
+ __all__ = ["BaseQualifierPlugin"]
statement_extractor/plugins/qualifiers/companies_house.py
@@ -0,0 +1,174 @@
+ """
+ CompaniesHouseQualifierPlugin - Qualifies UK ORG entities.
+
+ Uses the UK Companies House API to:
+ - Look up company number by name
+ - Retrieve company details, jurisdiction, officers
+ """
+
+ import logging
+ import os
+ from typing import Optional
+
+ from ..base import BaseQualifierPlugin, PluginCapability
+ from ...pipeline.context import PipelineContext
+ from ...pipeline.registry import PluginRegistry
+ from ...models import ExtractedEntity, EntityQualifiers, EntityType
+
+ logger = logging.getLogger(__name__)
+
+ # Companies House API base URL
+ CH_API_BASE = "https://api.company-information.service.gov.uk"
+
+
+ @PluginRegistry.qualifier
+ class CompaniesHouseQualifierPlugin(BaseQualifierPlugin):
+     """
+     Qualifier plugin for UK ORG entities using Companies House API.
+
+     Requires COMPANIES_HOUSE_API_KEY environment variable.
+     """
+
+     def __init__(
+         self,
+         api_key: Optional[str] = None,
+         timeout: int = 10,
+         cache_results: bool = True,
+     ):
+         """
+         Initialize the Companies House qualifier.
+
+         Args:
+             api_key: Companies House API key (or use COMPANIES_HOUSE_API_KEY env var)
+             timeout: API request timeout in seconds
+             cache_results: Whether to cache API results
+         """
+         self._api_key = api_key or os.environ.get("COMPANIES_HOUSE_API_KEY")
+         self._timeout = timeout
+         self._cache_results = cache_results
+         self._cache: dict[str, Optional[dict]] = {}
+
+     @property
+     def name(self) -> str:
+         return "companies_house_qualifier"
+
+     @property
+     def priority(self) -> int:
+         return 20  # Run after GLEIF
+
+     @property
+     def capabilities(self) -> PluginCapability:
+         return PluginCapability.EXTERNAL_API | PluginCapability.CACHING
+
+     @property
+     def description(self) -> str:
+         return "Looks up UK company data from Companies House API"
+
+     @property
+     def supported_entity_types(self) -> set[EntityType]:
+         return {EntityType.ORG}
+
+     @property
+     def supported_identifier_types(self) -> list[str]:
+         return ["ch_number"]  # Can look up by company number
+
+     @property
+     def provided_identifier_types(self) -> list[str]:
+         return ["ch_number"]  # Provides company number
+
+     def qualify(
+         self,
+         entity: ExtractedEntity,
+         context: PipelineContext,
+     ) -> Optional[EntityQualifiers]:
+         """
+         Qualify an ORG entity with Companies House data.
+
+         Args:
+             entity: The ORG entity to qualify
+             context: Pipeline context
+
+         Returns:
+             EntityQualifiers with company number, or None if not found
+         """
+         if entity.type != EntityType.ORG:
+             return None
+
+         if not self._api_key:
+             logger.debug("Companies House API key not configured")
+             return None
+
+         # Check cache first
+         cache_key = entity.text.lower().strip()
+         if self._cache_results and cache_key in self._cache:
+             cached = self._cache[cache_key]
+             if cached is None:
+                 return None
+             return self._data_to_qualifiers(cached)
+
+         # Search Companies House API
+         result = self._search_companies_house(entity.text)
+
+         # Cache result
+         if self._cache_results:
+             self._cache[cache_key] = result
+
+         if result:
+             return self._data_to_qualifiers(result)
+
+         return None
+
+     def _search_companies_house(self, org_name: str) -> Optional[dict]:
+         """Search Companies House API for organization."""
+         try:
+             import requests
+             from requests.auth import HTTPBasicAuth
+
+             url = f"{CH_API_BASE}/search/companies"
+             params = {"q": org_name, "items_per_page": 5}
+
+             response = requests.get(
+                 url,
+                 params=params,
+                 auth=HTTPBasicAuth(self._api_key, ""),
+                 timeout=self._timeout,
+             )
+             response.raise_for_status()
+             data = response.json()
+
+             items = data.get("items", [])
+             if items:
+                 # Return first match
+                 company = items[0]
+                 return {
+                     "ch_number": company.get("company_number", ""),
+                     "title": company.get("title", ""),
+                     "company_status": company.get("company_status", ""),
+                     "company_type": company.get("company_type", ""),
+                     "jurisdiction": "UK",
+                     "country": "GB",
+                     "address": company.get("address_snippet", ""),
+                 }
+
+         except ImportError:
+             logger.warning("requests library not available for Companies House API")
+         except Exception as e:
+             logger.debug(f"Companies House API error: {e}")
+
+         return None
+
+     def _data_to_qualifiers(self, data: dict) -> EntityQualifiers:
+         """Convert Companies House data to EntityQualifiers."""
+         identifiers = {}
+         if data.get("ch_number"):
+             identifiers["ch_number"] = data["ch_number"]
+
+         return EntityQualifiers(
+             jurisdiction=data.get("jurisdiction"),
+             country=data.get("country"),
+             identifiers=identifiers,
+         )
+
+
+ # Allow importing without decorator for testing
+ CompaniesHouseQualifierPluginClass = CompaniesHouseQualifierPlugin
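
The search request above is plain HTTP basic auth with the API key as the username and an empty password. A standalone sketch of the same call, handy for verifying a key before wiring up the plugin (assumes requests is installed and COMPANIES_HOUSE_API_KEY is set; the query string "Unilever" is just an example):

    import os

    import requests
    from requests.auth import HTTPBasicAuth

    api_key = os.environ["COMPANIES_HOUSE_API_KEY"]
    response = requests.get(
        "https://api.company-information.service.gov.uk/search/companies",
        params={"q": "Unilever", "items_per_page": 5},
        auth=HTTPBasicAuth(api_key, ""),  # key as username, blank password
        timeout=10,
    )
    response.raise_for_status()
    for company in response.json().get("items", []):
        print(company.get("company_number"), company.get("title"))

Note that the plugin takes the first search hit as the match and caches results keyed on the lowercased entity text, so repeated mentions of one organization cost a single API call per run.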