corp-extractor 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.4.0.dist-info → corp_extractor-0.5.0.dist-info}/METADATA +181 -64
- corp_extractor-0.5.0.dist-info/RECORD +55 -0
- statement_extractor/__init__.py +9 -0
- statement_extractor/cli.py +446 -17
- statement_extractor/data/default_predicates.json +368 -0
- statement_extractor/data/statement_taxonomy.json +1182 -0
- statement_extractor/extractor.py +1 -23
- statement_extractor/gliner_extraction.py +4 -74
- statement_extractor/llm.py +255 -0
- statement_extractor/models/__init__.py +74 -0
- statement_extractor/models/canonical.py +139 -0
- statement_extractor/models/entity.py +102 -0
- statement_extractor/models/labels.py +191 -0
- statement_extractor/models/qualifiers.py +91 -0
- statement_extractor/models/statement.py +75 -0
- statement_extractor/models.py +4 -1
- statement_extractor/pipeline/__init__.py +39 -0
- statement_extractor/pipeline/config.py +134 -0
- statement_extractor/pipeline/context.py +177 -0
- statement_extractor/pipeline/orchestrator.py +447 -0
- statement_extractor/pipeline/registry.py +297 -0
- statement_extractor/plugins/__init__.py +43 -0
- statement_extractor/plugins/base.py +446 -0
- statement_extractor/plugins/canonicalizers/__init__.py +17 -0
- statement_extractor/plugins/canonicalizers/base.py +9 -0
- statement_extractor/plugins/canonicalizers/location.py +219 -0
- statement_extractor/plugins/canonicalizers/organization.py +230 -0
- statement_extractor/plugins/canonicalizers/person.py +242 -0
- statement_extractor/plugins/extractors/__init__.py +13 -0
- statement_extractor/plugins/extractors/base.py +9 -0
- statement_extractor/plugins/extractors/gliner2.py +536 -0
- statement_extractor/plugins/labelers/__init__.py +29 -0
- statement_extractor/plugins/labelers/base.py +9 -0
- statement_extractor/plugins/labelers/confidence.py +138 -0
- statement_extractor/plugins/labelers/relation_type.py +87 -0
- statement_extractor/plugins/labelers/sentiment.py +159 -0
- statement_extractor/plugins/labelers/taxonomy.py +373 -0
- statement_extractor/plugins/labelers/taxonomy_embedding.py +466 -0
- statement_extractor/plugins/qualifiers/__init__.py +19 -0
- statement_extractor/plugins/qualifiers/base.py +9 -0
- statement_extractor/plugins/qualifiers/companies_house.py +174 -0
- statement_extractor/plugins/qualifiers/gleif.py +186 -0
- statement_extractor/plugins/qualifiers/person.py +221 -0
- statement_extractor/plugins/qualifiers/sec_edgar.py +198 -0
- statement_extractor/plugins/splitters/__init__.py +13 -0
- statement_extractor/plugins/splitters/base.py +9 -0
- statement_extractor/plugins/splitters/t5_gemma.py +188 -0
- statement_extractor/plugins/taxonomy/__init__.py +13 -0
- statement_extractor/plugins/taxonomy/embedding.py +337 -0
- statement_extractor/plugins/taxonomy/mnli.py +279 -0
- corp_extractor-0.4.0.dist-info/RECORD +0 -12
- {corp_extractor-0.4.0.dist-info → corp_extractor-0.5.0.dist-info}/WHEEL +0 -0
- {corp_extractor-0.4.0.dist-info → corp_extractor-0.5.0.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,446 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Base plugin classes for the extraction pipeline.
|
|
3
|
+
|
|
4
|
+
Defines the abstract interfaces for each pipeline stage:
|
|
5
|
+
- BaseSplitterPlugin: Stage 1 - Text → RawTriple
|
|
6
|
+
- BaseExtractorPlugin: Stage 2 - RawTriple → PipelineStatement
|
|
7
|
+
- BaseQualifierPlugin: Stage 3 - Entity → EntityQualifiers
|
|
8
|
+
- BaseCanonicalizerPlugin: Stage 4 - QualifiedEntity → CanonicalMatch
|
|
9
|
+
- BaseLabelerPlugin: Stage 5 - Statement → StatementLabel
|
|
10
|
+
- BaseTaxonomyPlugin: Stage 6 - Statement → TaxonomyResult
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from abc import ABC, abstractmethod
|
|
14
|
+
from enum import Flag, auto
|
|
15
|
+
from typing import TYPE_CHECKING
|
|
16
|
+
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
from ..pipeline.context import PipelineContext
|
|
19
|
+
from ..models import (
|
|
20
|
+
RawTriple,
|
|
21
|
+
PipelineStatement,
|
|
22
|
+
ExtractedEntity,
|
|
23
|
+
EntityQualifiers,
|
|
24
|
+
QualifiedEntity,
|
|
25
|
+
CanonicalMatch,
|
|
26
|
+
CanonicalEntity,
|
|
27
|
+
StatementLabel,
|
|
28
|
+
TaxonomyResult,
|
|
29
|
+
EntityType,
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class PluginCapability(Flag):
    """
    Bit flags a plugin uses to advertise optional capabilities.

    Members are distinct single bits, so they can be combined with ``|``
    and tested with ``in``. ``NONE`` is the empty combination.
    """

    NONE = 0
    BATCH_PROCESSING = 1 << 0  # Can process multiple items at once
    ASYNC_PROCESSING = 1 << 1  # Supports async execution
    EXTERNAL_API = 1 << 2  # Uses external API (may have rate limits)
    LLM_REQUIRED = 1 << 3  # Requires an LLM model
    CACHING = 1 << 4  # Supports result caching
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class BasePlugin(ABC):
    """
    Common ancestor of every pipeline plugin.

    Concrete plugins must provide ``name``. The ``priority``,
    ``capabilities`` and ``description`` properties ship with defaults
    and may be overridden.
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """Unique plugin identifier (used for registration and on the CLI)."""
        ...

    @property
    def description(self) -> str:
        """Human-readable summary of the plugin; empty unless overridden."""
        return ""

    @property
    def capabilities(self) -> PluginCapability:
        """Capability flags of the plugin; ``NONE`` unless overridden."""
        return PluginCapability.NONE

    @property
    def priority(self) -> int:
        """
        Ordering key: a lower number means higher priority (runs first).

        The default is 100. Critical plugins that should run before
        others should return a smaller value (e.g., 10, 20).
        """
        return 100
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class BaseSplitterPlugin(BasePlugin):
    """
    Interface for Stage 1 plugins, which break text into atomic triples.

    An implementation receives raw text and returns RawTriple objects
    holding subject/predicate/object text and the source sentence.
    """

    @abstractmethod
    def split(self, text: str, context: "PipelineContext") -> list["RawTriple"]:
        """
        Break ``text`` into atomic triples.

        Args:
            text: Input text to split
            context: Pipeline context for accessing metadata and config

        Returns:
            List of RawTriple objects
        """
        ...
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
class BaseExtractorPlugin(BasePlugin):
    """
    Interface for Stage 2 plugins, which refine triples into statements.

    An implementation consumes RawTriple objects and emits
    PipelineStatement objects whose subjects/objects are ExtractedEntity
    instances carrying types, spans, and confidence scores.
    """

    @abstractmethod
    def extract(
        self, raw_triples: list["RawTriple"], context: "PipelineContext"
    ) -> list["PipelineStatement"]:
        """
        Build statements with typed entities from raw triples.

        Args:
            raw_triples: Raw triples from Stage 1
            context: Pipeline context

        Returns:
            List of PipelineStatement objects with typed entities
        """
        ...
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
class BaseQualifierPlugin(BasePlugin):
    """
    Interface for Stage 3 plugins, which enrich entities.

    A qualifier handles entities of specific types and attaches semantic
    qualifiers (role, org) or external identifiers (LEI, company number).
    """

    @property
    @abstractmethod
    def supported_entity_types(self) -> set["EntityType"]:
        """Entity types this plugin can qualify (e.g., {ORG, PERSON})."""
        ...

    @property
    def provided_identifier_types(self) -> list[str]:
        """
        Identifier types this plugin can provide (none by default).

        For example, GLEIFQualifier provides 'lei', 'jurisdiction'.
        """
        return []

    @property
    def supported_identifier_types(self) -> list[str]:
        """
        Identifier types this plugin can use for lookup (none by default).

        For example, GLEIFQualifier can lookup by 'lei'.
        """
        return []

    @abstractmethod
    def qualify(
        self, entity: "ExtractedEntity", context: "PipelineContext"
    ) -> "EntityQualifiers | None":
        """
        Produce qualifiers for a single entity.

        Args:
            entity: The entity to qualify
            context: Pipeline context (for accessing source text, other entities)

        Returns:
            EntityQualifiers with added information, or None if nothing to add
        """
        ...
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
class BaseCanonicalizerPlugin(BasePlugin):
    """
    Interface for Stage 4 plugins, which resolve entities to canonical forms.

    A canonicalizer receives qualified entities and looks up their
    canonical representation using various matching strategies
    (identifier, name, fuzzy, LLM).
    """

    @property
    @abstractmethod
    def supported_entity_types(self) -> set["EntityType"]:
        """Entity types this plugin can canonicalize."""
        ...

    @abstractmethod
    def find_canonical(
        self, entity: "QualifiedEntity", context: "PipelineContext"
    ) -> "CanonicalMatch | None":
        """
        Look up the canonical form of an entity.

        Args:
            entity: Qualified entity to canonicalize
            context: Pipeline context

        Returns:
            CanonicalMatch if found, None otherwise
        """
        ...

    def format_fqn(
        self, entity: "QualifiedEntity", match: "CanonicalMatch | None"
    ) -> str:
        """
        Build the fully qualified name shown for an entity.

        The default delegates to CanonicalEntity._generate_fqn; subclasses
        may override this for custom formatting.
        """
        # Imported at call time: at module level CanonicalEntity is only
        # imported under TYPE_CHECKING.
        from ..models import CanonicalEntity

        fqn = CanonicalEntity._generate_fqn(entity, match)
        return fqn
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
class ClassificationSchema:
    """
    Describes a small multi-choice classification task (2-20 choices).

    Intended to be handled by GLiNER2 `.classification()` in a single pass.

    Examples:
    - sentiment: ["positive", "negative", "neutral"]
    - certainty: ["certain", "uncertain", "speculative"]
    - temporality: ["past", "present", "future"]

    Attributes are stored exactly as passed; no validation is performed.
    """

    def __init__(
        self,
        label_type: str,
        choices: list[str],
        description: str = "",
        scope: str = "statement",  # "statement", "subject", "object", "predicate"
    ):
        self.label_type, self.choices = label_type, choices
        self.description, self.scope = description, scope

    def __repr__(self) -> str:
        label, options = self.label_type, self.choices
        return f"ClassificationSchema({label!r}, choices={options!r})"
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
class TaxonomySchema:
|
|
260
|
+
"""
|
|
261
|
+
Schema for large taxonomy labeling (100s of values).
|
|
262
|
+
|
|
263
|
+
Too many choices for GLiNER2 classification. Requires MNLI or similar:
|
|
264
|
+
- MNLI zero-shot with label descriptions
|
|
265
|
+
- Embedding-based nearest neighbor search
|
|
266
|
+
- Hierarchical classification (category → subcategory)
|
|
267
|
+
|
|
268
|
+
Examples:
|
|
269
|
+
- industry_code: NAICS/SIC codes (1000+ values)
|
|
270
|
+
- relation_type: detailed relation ontology (100+ types)
|
|
271
|
+
- job_title: standardized job taxonomy
|
|
272
|
+
"""
|
|
273
|
+
|
|
274
|
+
def __init__(
|
|
275
|
+
self,
|
|
276
|
+
label_type: str,
|
|
277
|
+
values: list[str] | dict[str, list[str]], # flat list or hierarchical dict
|
|
278
|
+
description: str = "",
|
|
279
|
+
scope: str = "statement", # "statement", "subject", "object", "predicate"
|
|
280
|
+
label_descriptions: dict[str, str] | None = None, # descriptions for MNLI
|
|
281
|
+
):
|
|
282
|
+
self.label_type = label_type
|
|
283
|
+
self.values = values
|
|
284
|
+
self.description = description
|
|
285
|
+
self.scope = scope
|
|
286
|
+
self.label_descriptions = label_descriptions # e.g., {"NAICS:5112": "Software Publishers"}
|
|
287
|
+
|
|
288
|
+
@property
|
|
289
|
+
def is_hierarchical(self) -> bool:
|
|
290
|
+
"""Check if taxonomy is hierarchical (dict) vs flat (list)."""
|
|
291
|
+
return isinstance(self.values, dict)
|
|
292
|
+
|
|
293
|
+
@property
|
|
294
|
+
def all_values(self) -> list[str]:
|
|
295
|
+
"""Get all taxonomy values (flattened if hierarchical)."""
|
|
296
|
+
if isinstance(self.values, list):
|
|
297
|
+
return self.values
|
|
298
|
+
# Flatten hierarchical dict
|
|
299
|
+
result = []
|
|
300
|
+
for category, subcategories in self.values.items():
|
|
301
|
+
result.append(category)
|
|
302
|
+
result.extend(subcategories)
|
|
303
|
+
return result
|
|
304
|
+
|
|
305
|
+
def __repr__(self) -> str:
|
|
306
|
+
count = len(self.all_values)
|
|
307
|
+
return f"TaxonomySchema({self.label_type!r}, {count} values)"
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
class BaseLabelerPlugin(BasePlugin):
    """
    Interface for Stage 5 plugins, which attach labels to statements.

    Labelers add classification labels (sentiment, relation type,
    confidence) to the final labeled statements.

    A labeler may expose a classification_schema; extractors use it to run
    classification in a single model pass, and the results are stored in
    the pipeline context for the labeler to retrieve.
    """

    @property
    @abstractmethod
    def label_type(self) -> str:
        """
        Kind of label this plugin emits.

        Examples: 'sentiment', 'relation_type', 'confidence'
        """
        ...

    @property
    def taxonomy_schema(self) -> TaxonomySchema | None:
        """
        Large taxonomy schema (100s of values); None by default.

        If provided, requires MNLI or embedding-based classification.
        Results stored in context for this labeler to retrieve.

        Returns:
            TaxonomySchema or None
        """
        return None

    @property
    def classification_schema(self) -> ClassificationSchema | None:
        """
        Simple multi-choice classification schema (2-20 choices); None by default.

        If provided, GLiNER2 extractor will run `.classification()` and store
        results in context for this labeler to retrieve.

        Returns:
            ClassificationSchema or None
        """
        return None

    @abstractmethod
    def label(
        self,
        statement: "PipelineStatement",
        subject_canonical: "CanonicalEntity",
        object_canonical: "CanonicalEntity",
        context: "PipelineContext",
    ) -> "StatementLabel | None":
        """
        Produce a label for one statement.

        Args:
            statement: The statement to label
            subject_canonical: Canonicalized subject entity
            object_canonical: Canonicalized object entity
            context: Pipeline context (check context.classification_results for pre-computed labels)

        Returns:
            StatementLabel if applicable, None otherwise
        """
        ...
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
class BaseTaxonomyPlugin(BasePlugin):
    """
    Interface for Stage 6 plugins, which classify statements against a taxonomy.

    Taxonomy classification is kept apart from labeling because:
    - It operates on large taxonomies (100s-1000s of labels)
    - It requires specialized models (MNLI, embeddings)
    - It's computationally heavier than simple labeling

    Taxonomy plugins produce TaxonomyResult objects that are stored
    in the pipeline context.
    """

    @property
    @abstractmethod
    def taxonomy_name(self) -> str:
        """
        Name of the taxonomy this plugin classifies against.

        Examples: 'esg_topics', 'industry_codes', 'relation_types'
        """
        ...

    @property
    def supported_categories(self) -> list[str]:
        """
        Taxonomy categories handled by this plugin.

        An empty list (the default) means all categories are supported.
        """
        return []

    @property
    def taxonomy_schema(self) -> TaxonomySchema | None:
        """
        The taxonomy schema this plugin uses.

        Returns:
            TaxonomySchema describing the taxonomy structure, or None
            (the default) when the plugin does not expose one
        """
        return None

    @abstractmethod
    def classify(
        self,
        statement: "PipelineStatement",
        subject_canonical: "CanonicalEntity",
        object_canonical: "CanonicalEntity",
        context: "PipelineContext",
    ) -> list["TaxonomyResult"]:
        """
        Classify one statement against the taxonomy.

        Every label above the confidence threshold is returned, so a single
        statement may receive multiple taxonomy labels.

        Args:
            statement: The statement to classify
            subject_canonical: Canonicalized subject entity
            object_canonical: Canonicalized object entity
            context: Pipeline context

        Returns:
            List of TaxonomyResult objects (empty if none above threshold)
        """
        ...
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Canonicalizer plugins for Stage 4 (Canonicalization).
|
|
3
|
+
|
|
4
|
+
Resolves entities to their canonical forms.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from .base import BaseCanonicalizerPlugin
|
|
8
|
+
from .organization import OrganizationCanonicalizer
|
|
9
|
+
from .person import PersonCanonicalizer
|
|
10
|
+
from .location import LocationCanonicalizer
|
|
11
|
+
|
|
12
|
+
__all__ = [
|
|
13
|
+
"BaseCanonicalizerPlugin",
|
|
14
|
+
"OrganizationCanonicalizer",
|
|
15
|
+
"PersonCanonicalizer",
|
|
16
|
+
"LocationCanonicalizer",
|
|
17
|
+
]
|
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
"""
|
|
2
|
+
LocationCanonicalizer - Resolves location entities to canonical forms.
|
|
3
|
+
|
|
4
|
+
Uses:
|
|
5
|
+
1. Country name / alias exact match (resolved to an ISO 3166-1 alpha-2 code)
|
|
6
|
+
2. Known city/country mappings
|
|
7
|
+
3. Country qualifier fallback (geohash matching for coordinates is not implemented)
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import logging
|
|
11
|
+
from typing import Optional
|
|
12
|
+
|
|
13
|
+
from ..base import BaseCanonicalizerPlugin, PluginCapability
|
|
14
|
+
from ...pipeline.context import PipelineContext
|
|
15
|
+
from ...pipeline.registry import PluginRegistry
|
|
16
|
+
from ...models import QualifiedEntity, CanonicalMatch, EntityType
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
# Common country name variations, keyed by normalized (lower-case) text and
# mapping to the canonical country name.
# NOTE: normalize_location() removes periods before lookup, so the dotted
# keys ("u.s.", "u.s.a.", "u.k.") are only hit by callers that look up
# un-normalized text; their dot-free forms ("us", "usa", "uk") are what
# normalized input matches.
COUNTRY_ALIASES = {
    "usa": "United States",
    "us": "United States",
    "united states of america": "United States",
    "u.s.": "United States",
    "u.s.a.": "United States",
    "america": "United States",
    "uk": "United Kingdom",
    "u.k.": "United Kingdom",
    "great britain": "United Kingdom",
    "britain": "United Kingdom",
    "england": "United Kingdom",
    "uae": "United Arab Emirates",
    "prc": "China",
    "peoples republic of china": "China",
    "people's republic of china": "China",
}

# ISO 3166-1 alpha-2 codes for common countries, keyed by lower-case
# canonical country name. Every country named as a value in
# COUNTRY_ALIASES or CITY_COUNTRY_MAP needs an entry here, otherwise the
# corresponding match is produced without an ISO code.
ISO_CODES = {
    "united states": "US",
    "united kingdom": "GB",
    "china": "CN",
    "germany": "DE",
    "france": "FR",
    "japan": "JP",
    "canada": "CA",
    "australia": "AU",
    "india": "IN",
    "brazil": "BR",
    "russia": "RU",
    "italy": "IT",
    "spain": "ES",
    "mexico": "MX",
    "south korea": "KR",
    "netherlands": "NL",
    "switzerland": "CH",
    "singapore": "SG",
    "hong kong": "HK",
    "ireland": "IE",
    # Target of the "uae" alias; previously missing, which left that alias
    # without an ISO code.
    "united arab emirates": "AE",
}

# Well-known cities to countries: normalized key -> (city name, country name).
# NOTE: "singapore" and "hong kong" also appear in ISO_CODES, and
# LocationCanonicalizer.find_canonical checks ISO_CODES first, so those two
# city entries are shadowed there.
CITY_COUNTRY_MAP = {
    "new york": ("New York", "United States"),
    "nyc": ("New York", "United States"),
    "london": ("London", "United Kingdom"),
    "paris": ("Paris", "France"),
    "tokyo": ("Tokyo", "Japan"),
    "beijing": ("Beijing", "China"),
    "shanghai": ("Shanghai", "China"),
    "san francisco": ("San Francisco", "United States"),
    "sf": ("San Francisco", "United States"),
    "los angeles": ("Los Angeles", "United States"),
    "la": ("Los Angeles", "United States"),
    "chicago": ("Chicago", "United States"),
    "berlin": ("Berlin", "Germany"),
    "sydney": ("Sydney", "Australia"),
    "toronto": ("Toronto", "Canada"),
    "singapore": ("Singapore", "Singapore"),
    "hong kong": ("Hong Kong", "China"),
    "mumbai": ("Mumbai", "India"),
    "bangalore": ("Bangalore", "India"),
    "dublin": ("Dublin", "Ireland"),
    "amsterdam": ("Amsterdam", "Netherlands"),
    "zurich": ("Zurich", "Switzerland"),
}
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def normalize_location(name: str) -> str:
    """
    Normalize a location name for matching.

    Removes periods, lower-cases the text, trims surrounding whitespace and
    collapses every internal run of whitespace (including newlines and tabs)
    to a single space, so that "U.S.A.", " usa " and "New \\n York" produce
    stable lookup keys.

    Args:
        name: Raw location text.

    Returns:
        The normalized lookup key (may be empty).
    """
    # Drop periods first: stripping before removal would leave stray
    # whitespace behind for input such as "London .".
    without_periods = name.replace(".", "").lower()
    # str.split() with no argument splits on any whitespace run and discards
    # leading/trailing whitespace.
    return " ".join(without_periods.split())
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
@PluginRegistry.canonicalizer
class LocationCanonicalizer(BaseCanonicalizerPlugin):
    """
    Canonicalizer for location entities (GPE, LOC).

    Uses standardized country codes and known city mappings. Lookups are
    tried in this order: country alias, country name, known city, and
    finally the country qualifier attached to the entity.
    """

    @property
    def name(self) -> str:
        return "location_canonicalizer"

    @property
    def priority(self) -> int:
        return 10

    @property
    def capabilities(self) -> PluginCapability:
        return PluginCapability.CACHING

    @property
    def description(self) -> str:
        return "Resolves location entities using ISO codes and known mappings"

    @property
    def supported_entity_types(self) -> set[EntityType]:
        return {EntityType.GPE, EntityType.LOC}

    def find_canonical(
        self,
        entity: QualifiedEntity,
        context: PipelineContext,
    ) -> Optional[CanonicalMatch]:
        """
        Find canonical form for a location entity.

        Args:
            entity: Qualified entity to canonicalize
            context: Pipeline context (not used by this implementation)

        Returns:
            CanonicalMatch if found, None otherwise. For city and
            qualifier matches the canonical_id is the ISO code of the
            *country*, not an identifier of the location itself.
        """
        normalized = normalize_location(entity.original_text)

        # 1. Country aliases ("usa", "uk", ...)
        if normalized in COUNTRY_ALIASES:
            canonical_name = COUNTRY_ALIASES[normalized]
            return CanonicalMatch(
                # .get(): stays None if the alias target has no ISO entry.
                canonical_id=ISO_CODES.get(canonical_name.lower()),
                canonical_name=canonical_name,
                match_method="name_exact",
                match_confidence=1.0,
                match_details={"match_type": "country_alias"},
            )

        # 2. Canonical country names (ISO_CODES is keyed by country name)
        if normalized in ISO_CODES:
            return CanonicalMatch(
                canonical_id=ISO_CODES[normalized],
                canonical_name=normalized.title(),
                match_method="name_exact",
                match_confidence=1.0,
                match_details={"match_type": "country_name"},
            )

        # 3. Well-known cities
        if normalized in CITY_COUNTRY_MAP:
            city_name, country_name = CITY_COUNTRY_MAP[normalized]
            return CanonicalMatch(
                canonical_id=ISO_CODES.get(country_name.lower()),
                canonical_name=city_name,
                match_method="name_exact",
                match_confidence=0.95,
                match_details={"match_type": "city_mapping", "country": country_name},
            )

        # 4. Fall back to the country qualifier attached to the entity
        qualifier_country = entity.qualifiers.country
        if qualifier_country:
            country_key = normalize_location(qualifier_country)
            # The qualifier may hold an alias ("USA", "UK") rather than the
            # canonical country name; resolve it before the ISO lookup so
            # those values are not silently dropped.
            country_key = COUNTRY_ALIASES.get(country_key, country_key).lower()
            iso_code = ISO_CODES.get(country_key)
            if iso_code is not None:
                return CanonicalMatch(
                    canonical_id=iso_code,
                    canonical_name=entity.original_text,
                    match_method="identifier",
                    match_confidence=0.9,
                    match_details={"match_type": "qualifier_country"},
                )

        return None

    def format_fqn(
        self,
        entity: QualifiedEntity,
        match: Optional[CanonicalMatch],
    ) -> str:
        """
        Format FQN for a location.

        Produces "<name> (<country>, <ISO code>)", omitting whichever
        parts are unavailable; without a match the entity's original text
        is returned unchanged.
        """
        if not match:
            return entity.original_text

        parts = []

        # Add country if it's a city (only city matches carry "country")
        if match.match_details:
            country = match.match_details.get("country")
            if country:
                parts.append(country)

        # Add ISO code
        if match.canonical_id:
            parts.append(match.canonical_id)

        if parts:
            return f"{match.canonical_name} ({', '.join(parts)})"
        return match.canonical_name


# Alias for importing the class under an explicit name (e.g. in tests).
# NOTE(review): this name is bound after decoration, so it refers to
# whatever @PluginRegistry.canonicalizer returned — it does not bypass
# registration.
LocationCanonicalizerClass = LocationCanonicalizer
|