corp-extractor 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. {corp_extractor-0.3.0.dist-info → corp_extractor-0.5.0.dist-info}/METADATA +235 -96
  2. corp_extractor-0.5.0.dist-info/RECORD +55 -0
  3. statement_extractor/__init__.py +9 -0
  4. statement_extractor/cli.py +460 -21
  5. statement_extractor/data/default_predicates.json +368 -0
  6. statement_extractor/data/statement_taxonomy.json +1182 -0
  7. statement_extractor/extractor.py +32 -47
  8. statement_extractor/gliner_extraction.py +218 -0
  9. statement_extractor/llm.py +255 -0
  10. statement_extractor/models/__init__.py +74 -0
  11. statement_extractor/models/canonical.py +139 -0
  12. statement_extractor/models/entity.py +102 -0
  13. statement_extractor/models/labels.py +191 -0
  14. statement_extractor/models/qualifiers.py +91 -0
  15. statement_extractor/models/statement.py +75 -0
  16. statement_extractor/models.py +15 -6
  17. statement_extractor/pipeline/__init__.py +39 -0
  18. statement_extractor/pipeline/config.py +134 -0
  19. statement_extractor/pipeline/context.py +177 -0
  20. statement_extractor/pipeline/orchestrator.py +447 -0
  21. statement_extractor/pipeline/registry.py +297 -0
  22. statement_extractor/plugins/__init__.py +43 -0
  23. statement_extractor/plugins/base.py +446 -0
  24. statement_extractor/plugins/canonicalizers/__init__.py +17 -0
  25. statement_extractor/plugins/canonicalizers/base.py +9 -0
  26. statement_extractor/plugins/canonicalizers/location.py +219 -0
  27. statement_extractor/plugins/canonicalizers/organization.py +230 -0
  28. statement_extractor/plugins/canonicalizers/person.py +242 -0
  29. statement_extractor/plugins/extractors/__init__.py +13 -0
  30. statement_extractor/plugins/extractors/base.py +9 -0
  31. statement_extractor/plugins/extractors/gliner2.py +536 -0
  32. statement_extractor/plugins/labelers/__init__.py +29 -0
  33. statement_extractor/plugins/labelers/base.py +9 -0
  34. statement_extractor/plugins/labelers/confidence.py +138 -0
  35. statement_extractor/plugins/labelers/relation_type.py +87 -0
  36. statement_extractor/plugins/labelers/sentiment.py +159 -0
  37. statement_extractor/plugins/labelers/taxonomy.py +373 -0
  38. statement_extractor/plugins/labelers/taxonomy_embedding.py +466 -0
  39. statement_extractor/plugins/qualifiers/__init__.py +19 -0
  40. statement_extractor/plugins/qualifiers/base.py +9 -0
  41. statement_extractor/plugins/qualifiers/companies_house.py +174 -0
  42. statement_extractor/plugins/qualifiers/gleif.py +186 -0
  43. statement_extractor/plugins/qualifiers/person.py +221 -0
  44. statement_extractor/plugins/qualifiers/sec_edgar.py +198 -0
  45. statement_extractor/plugins/splitters/__init__.py +13 -0
  46. statement_extractor/plugins/splitters/base.py +9 -0
  47. statement_extractor/plugins/splitters/t5_gemma.py +188 -0
  48. statement_extractor/plugins/taxonomy/__init__.py +13 -0
  49. statement_extractor/plugins/taxonomy/embedding.py +337 -0
  50. statement_extractor/plugins/taxonomy/mnli.py +279 -0
  51. statement_extractor/scoring.py +17 -69
  52. corp_extractor-0.3.0.dist-info/RECORD +0 -12
  53. statement_extractor/spacy_extraction.py +0 -386
  54. {corp_extractor-0.3.0.dist-info → corp_extractor-0.5.0.dist-info}/WHEEL +0 -0
  55. {corp_extractor-0.3.0.dist-info → corp_extractor-0.5.0.dist-info}/entry_points.txt +0 -0
statement_extractor/plugins/base.py
@@ -0,0 +1,446 @@
+ """
+ Base plugin classes for the extraction pipeline.
+
+ Defines the abstract interfaces for each pipeline stage:
+ - BaseSplitterPlugin: Stage 1 - Text → RawTriple
+ - BaseExtractorPlugin: Stage 2 - RawTriple → PipelineStatement
+ - BaseQualifierPlugin: Stage 3 - Entity → EntityQualifiers
+ - BaseCanonicalizerPlugin: Stage 4 - QualifiedEntity → CanonicalMatch
+ - BaseLabelerPlugin: Stage 5 - Statement → StatementLabel
+ - BaseTaxonomyPlugin: Stage 6 - Statement → TaxonomyResult
+ """
+
+ from abc import ABC, abstractmethod
+ from enum import Flag, auto
+ from typing import TYPE_CHECKING
+
+ if TYPE_CHECKING:
+     from ..pipeline.context import PipelineContext
+     from ..models import (
+         RawTriple,
+         PipelineStatement,
+         ExtractedEntity,
+         EntityQualifiers,
+         QualifiedEntity,
+         CanonicalMatch,
+         CanonicalEntity,
+         StatementLabel,
+         TaxonomyResult,
+         EntityType,
+     )
+
+
+ class PluginCapability(Flag):
+     """Flags indicating plugin capabilities."""
+     NONE = 0
+     BATCH_PROCESSING = auto()  # Can process multiple items at once
+     ASYNC_PROCESSING = auto()  # Supports async execution
+     EXTERNAL_API = auto()  # Uses external API (may have rate limits)
+     LLM_REQUIRED = auto()  # Requires an LLM model
+     CACHING = auto()  # Supports result caching
+
+
+ class BasePlugin(ABC):
+     """
+     Base class for all pipeline plugins.
+
+     All plugins must implement the name property and can optionally
+     override priority and capabilities.
+     """
+
+     @property
+     @abstractmethod
+     def name(self) -> str:
+         """Unique name for this plugin (used for registration and CLI)."""
+         ...
+
+     @property
+     def priority(self) -> int:
+         """
+         Plugin priority (lower = higher priority, runs first).
+
+         Default is 100. Use lower values (e.g., 10, 20) for critical plugins
+         that should run before others.
+         """
+         return 100
+
+     @property
+     def capabilities(self) -> PluginCapability:
+         """Plugin capabilities (flags)."""
+         return PluginCapability.NONE
+
+     @property
+     def description(self) -> str:
+         """Human-readable description of this plugin."""
+         return ""
+
+
+ class BaseSplitterPlugin(BasePlugin):
+     """
+     Stage 1 plugin: Split text into atomic triples.
+
+     Takes raw text and produces RawTriple objects containing
+     subject/predicate/object text and source sentence.
+     """
+
+     @abstractmethod
+     def split(
+         self,
+         text: str,
+         context: "PipelineContext",
+     ) -> list["RawTriple"]:
+         """
+         Split text into atomic triples.
+
+         Args:
+             text: Input text to split
+             context: Pipeline context for accessing metadata and config
+
+         Returns:
+             List of RawTriple objects
+         """
+         ...
+
+
+ class BaseExtractorPlugin(BasePlugin):
+     """
+     Stage 2 plugin: Refine triples into statements with typed entities.
+
+     Takes RawTriple objects and produces PipelineStatement objects
+     with ExtractedEntity subjects/objects that have types, spans,
+     and confidence scores.
+     """
+
+     @abstractmethod
+     def extract(
+         self,
+         raw_triples: list["RawTriple"],
+         context: "PipelineContext",
+     ) -> list["PipelineStatement"]:
+         """
+         Extract statements from raw triples.
+
+         Args:
+             raw_triples: Raw triples from Stage 1
+             context: Pipeline context
+
+         Returns:
+             List of PipelineStatement objects with typed entities
+         """
+         ...
+
+
+ class BaseQualifierPlugin(BasePlugin):
+     """
+     Stage 3 plugin: Add qualifiers and identifiers to entities.
+
+     Processes entities of specific types and adds semantic qualifiers
+     (role, org) or external identifiers (LEI, company number).
+     """
+
+     @property
+     @abstractmethod
+     def supported_entity_types(self) -> set["EntityType"]:
+         """Entity types this plugin can qualify (e.g., {ORG, PERSON})."""
+         ...
+
+     @property
+     def supported_identifier_types(self) -> list[str]:
+         """
+         Identifier types this plugin can use for lookup.
+
+         For example, GLEIFQualifier can lookup by 'lei'.
+         """
+         return []
+
+     @property
+     def provided_identifier_types(self) -> list[str]:
+         """
+         Identifier types this plugin can provide.
+
+         For example, GLEIFQualifier provides 'lei', 'jurisdiction'.
+         """
+         return []
+
+     @abstractmethod
+     def qualify(
+         self,
+         entity: "ExtractedEntity",
+         context: "PipelineContext",
+     ) -> "EntityQualifiers | None":
+         """
+         Add qualifiers to an entity.
+
+         Args:
+             entity: The entity to qualify
+             context: Pipeline context (for accessing source text, other entities)
+
+         Returns:
+             EntityQualifiers with added information, or None if nothing to add
+         """
+         ...
+
+
+ class BaseCanonicalizerPlugin(BasePlugin):
+     """
+     Stage 4 plugin: Resolve entities to canonical forms.
+
+     Takes qualified entities and finds their canonical representations
+     using various matching strategies (identifier, name, fuzzy, LLM).
+     """
+
+     @property
+     @abstractmethod
+     def supported_entity_types(self) -> set["EntityType"]:
+         """Entity types this plugin can canonicalize."""
+         ...
+
+     @abstractmethod
+     def find_canonical(
+         self,
+         entity: "QualifiedEntity",
+         context: "PipelineContext",
+     ) -> "CanonicalMatch | None":
+         """
+         Find canonical form for an entity.
+
+         Args:
+             entity: Qualified entity to canonicalize
+             context: Pipeline context
+
+         Returns:
+             CanonicalMatch if found, None otherwise
+         """
+         ...
+
+     def format_fqn(
+         self,
+         entity: "QualifiedEntity",
+         match: "CanonicalMatch | None",
+     ) -> str:
+         """
+         Format the fully qualified name for display.
+
+         Can be overridden by subclasses for custom formatting.
+         Default implementation uses CanonicalEntity._generate_fqn.
+         """
+         from ..models import CanonicalEntity
+         return CanonicalEntity._generate_fqn(entity, match)
+
+
+ class ClassificationSchema:
+     """
+     Schema for simple multi-choice classification (2-20 choices).
+
+     Handled by GLiNER2 `.classification()` in a single pass.
+
+     Examples:
+         - sentiment: ["positive", "negative", "neutral"]
+         - certainty: ["certain", "uncertain", "speculative"]
+         - temporality: ["past", "present", "future"]
+     """
+
+     def __init__(
+         self,
+         label_type: str,
+         choices: list[str],
+         description: str = "",
+         scope: str = "statement",  # "statement", "subject", "object", "predicate"
+     ):
+         self.label_type = label_type
+         self.choices = choices
+         self.description = description
+         self.scope = scope
+
+     def __repr__(self) -> str:
+         return f"ClassificationSchema({self.label_type!r}, choices={self.choices!r})"
+
+
+ class TaxonomySchema:
+     """
+     Schema for large taxonomy labeling (100s of values).
+
+     Too many choices for GLiNER2 classification. Requires MNLI or similar:
+     - MNLI zero-shot with label descriptions
+     - Embedding-based nearest neighbor search
+     - Hierarchical classification (category → subcategory)
+
+     Examples:
+         - industry_code: NAICS/SIC codes (1000+ values)
+         - relation_type: detailed relation ontology (100+ types)
+         - job_title: standardized job taxonomy
+     """
+
+     def __init__(
+         self,
+         label_type: str,
+         values: list[str] | dict[str, list[str]],  # flat list or hierarchical dict
+         description: str = "",
+         scope: str = "statement",  # "statement", "subject", "object", "predicate"
+         label_descriptions: dict[str, str] | None = None,  # descriptions for MNLI
+     ):
+         self.label_type = label_type
+         self.values = values
+         self.description = description
+         self.scope = scope
+         self.label_descriptions = label_descriptions  # e.g., {"NAICS:5112": "Software Publishers"}
+
+     @property
+     def is_hierarchical(self) -> bool:
+         """Check if taxonomy is hierarchical (dict) vs flat (list)."""
+         return isinstance(self.values, dict)
+
+     @property
+     def all_values(self) -> list[str]:
+         """Get all taxonomy values (flattened if hierarchical)."""
+         if isinstance(self.values, list):
+             return self.values
+         # Flatten hierarchical dict
+         result = []
+         for category, subcategories in self.values.items():
+             result.append(category)
+             result.extend(subcategories)
+         return result
+
+     def __repr__(self) -> str:
+         count = len(self.all_values)
+         return f"TaxonomySchema({self.label_type!r}, {count} values)"
+
+
+ class BaseLabelerPlugin(BasePlugin):
+     """
+     Stage 5 plugin: Apply labels to statements.
+
+     Adds classification labels (sentiment, relation type, confidence)
+     to the final labeled statements.
+
+     Labelers can provide a classification_schema that extractors will use
+     to run classification in a single model pass. The results are stored
+     in the pipeline context for the labeler to retrieve.
+     """
+
+     @property
+     @abstractmethod
+     def label_type(self) -> str:
+         """
+         The type of label this plugin produces.
+
+         Examples: 'sentiment', 'relation_type', 'confidence'
+         """
+         ...
+
+     @property
+     def classification_schema(self) -> ClassificationSchema | None:
+         """
+         Simple multi-choice classification schema (2-20 choices).
+
+         If provided, GLiNER2 extractor will run `.classification()` and store
+         results in context for this labeler to retrieve.
+
+         Returns:
+             ClassificationSchema or None
+         """
+         return None
+
+     @property
+     def taxonomy_schema(self) -> TaxonomySchema | None:
+         """
+         Large taxonomy schema (100s of values).
+
+         If provided, requires MNLI or embedding-based classification.
+         Results stored in context for this labeler to retrieve.
+
+         Returns:
+             TaxonomySchema or None
+         """
+         return None
+
+     @abstractmethod
+     def label(
+         self,
+         statement: "PipelineStatement",
+         subject_canonical: "CanonicalEntity",
+         object_canonical: "CanonicalEntity",
+         context: "PipelineContext",
+     ) -> "StatementLabel | None":
+         """
+         Apply a label to a statement.
+
+         Args:
+             statement: The statement to label
+             subject_canonical: Canonicalized subject entity
+             object_canonical: Canonicalized object entity
+             context: Pipeline context (check context.classification_results for pre-computed labels)
+
+         Returns:
+             StatementLabel if applicable, None otherwise
+         """
+         ...
+
+
+ class BaseTaxonomyPlugin(BasePlugin):
+     """
+     Stage 6 plugin: Classify statements against a taxonomy.
+
+     Taxonomy classification is separate from labeling because:
+     - It operates on large taxonomies (100s-1000s of labels)
+     - It requires specialized models (MNLI, embeddings)
+     - It's computationally heavier than simple labeling
+
+     Taxonomy plugins produce TaxonomyResult objects that are stored
+     in the pipeline context.
+     """
+
+     @property
+     @abstractmethod
+     def taxonomy_name(self) -> str:
+         """
+         Name of the taxonomy this plugin classifies against.
+
+         Examples: 'esg_topics', 'industry_codes', 'relation_types'
+         """
+         ...
+
+     @property
+     def taxonomy_schema(self) -> TaxonomySchema | None:
+         """
+         The taxonomy schema this plugin uses.
+
+         Returns:
+             TaxonomySchema describing the taxonomy structure
+         """
+         return None
+
+     @property
+     def supported_categories(self) -> list[str]:
+         """
+         List of taxonomy categories this plugin supports.
+
+         Returns empty list if all categories are supported.
+         """
+         return []
+
+     @abstractmethod
+     def classify(
+         self,
+         statement: "PipelineStatement",
+         subject_canonical: "CanonicalEntity",
+         object_canonical: "CanonicalEntity",
+         context: "PipelineContext",
+     ) -> list["TaxonomyResult"]:
+         """
+         Classify a statement against the taxonomy.
+
+         Returns all labels above the confidence threshold. A single statement
+         may have multiple applicable taxonomy labels.
+
+         Args:
+             statement: The statement to classify
+             subject_canonical: Canonicalized subject entity
+             object_canonical: Canonicalized object entity
+             context: Pipeline context
+
+         Returns:
+             List of TaxonomyResult objects (empty if none above threshold)
+         """
+         ...
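
Note (not part of the diff): a minimal sketch of how a Stage 5 labeler might be written against the interfaces in this hunk. Only BaseLabelerPlugin, ClassificationSchema, PluginCapability, and the context.classification_results hint come from the code above; the plugin name, the "certainty" label type, the shape of classification_results, and the StatementLabel field names are assumptions, and registry registration is omitted because only the canonicalizer decorator appears in this diff.

```python
# Illustrative sketch only - not part of the package.
from statement_extractor.plugins.base import (
    BaseLabelerPlugin,
    ClassificationSchema,
    PluginCapability,
)


class CertaintyLabeler(BaseLabelerPlugin):
    """Hypothetical Stage 5 labeler backed by a small multi-choice schema."""

    @property
    def name(self) -> str:
        return "certainty_labeler"  # hypothetical plugin name

    @property
    def label_type(self) -> str:
        return "certainty"

    @property
    def capabilities(self) -> PluginCapability:
        return PluginCapability.CACHING

    @property
    def classification_schema(self) -> ClassificationSchema | None:
        # 2-20 choices, so the GLiNER2 extractor can classify in a single pass
        # and store the result in the pipeline context.
        return ClassificationSchema(
            label_type="certainty",
            choices=["certain", "uncertain", "speculative"],
            description="How hedged the statement is",
            scope="statement",
        )

    def label(self, statement, subject_canonical, object_canonical, context):
        # Per the docstring above, pre-computed results live in
        # context.classification_results; the dict-keyed-by-label_type shape
        # assumed here is a guess.
        results = getattr(context, "classification_results", None) or {}
        choice = results.get("certainty")
        if choice is None:
            return None
        from statement_extractor.models import StatementLabel
        # Field names below are assumed for illustration; the real model lives
        # in statement_extractor/models/labels.py (see file list above).
        return StatementLabel(label_type="certainty", value=choice, confidence=1.0)
```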
statement_extractor/plugins/canonicalizers/__init__.py
@@ -0,0 +1,17 @@
+ """
+ Canonicalizer plugins for Stage 4 (Canonicalization).
+
+ Resolves entities to their canonical forms.
+ """
+
+ from .base import BaseCanonicalizerPlugin
+ from .organization import OrganizationCanonicalizer
+ from .person import PersonCanonicalizer
+ from .location import LocationCanonicalizer
+
+ __all__ = [
+     "BaseCanonicalizerPlugin",
+     "OrganizationCanonicalizer",
+     "PersonCanonicalizer",
+     "LocationCanonicalizer",
+ ]
statement_extractor/plugins/canonicalizers/base.py
@@ -0,0 +1,9 @@
+ """
+ Base class for canonicalizer plugins.
+
+ Re-exports BaseCanonicalizerPlugin from the main plugins module.
+ """
+
+ from ..base import BaseCanonicalizerPlugin
+
+ __all__ = ["BaseCanonicalizerPlugin"]
statement_extractor/plugins/canonicalizers/location.py
@@ -0,0 +1,219 @@
+ """
+ LocationCanonicalizer - Resolves location entities to canonical forms.
+
+ Uses:
+ 1. ISO country code exact match
+ 2. Known city/country mappings
+ 3. Geohash matching for coordinates (if available)
+ """
+
+ import logging
+ from typing import Optional
+
+ from ..base import BaseCanonicalizerPlugin, PluginCapability
+ from ...pipeline.context import PipelineContext
+ from ...pipeline.registry import PluginRegistry
+ from ...models import QualifiedEntity, CanonicalMatch, EntityType
+
+ logger = logging.getLogger(__name__)
+
+ # Common country name variations
+ COUNTRY_ALIASES = {
+     "usa": "United States",
+     "us": "United States",
+     "united states of america": "United States",
+     "u.s.": "United States",
+     "u.s.a.": "United States",
+     "america": "United States",
+     "uk": "United Kingdom",
+     "u.k.": "United Kingdom",
+     "great britain": "United Kingdom",
+     "britain": "United Kingdom",
+     "england": "United Kingdom",
+     "uae": "United Arab Emirates",
+     "prc": "China",
+     "peoples republic of china": "China",
+     "people's republic of china": "China",
+ }
+
+ # ISO 3166-1 alpha-2 codes for common countries
+ ISO_CODES = {
+     "united states": "US",
+     "united kingdom": "GB",
+     "china": "CN",
+     "germany": "DE",
+     "france": "FR",
+     "japan": "JP",
+     "canada": "CA",
+     "australia": "AU",
+     "india": "IN",
+     "brazil": "BR",
+     "russia": "RU",
+     "italy": "IT",
+     "spain": "ES",
+     "mexico": "MX",
+     "south korea": "KR",
+     "netherlands": "NL",
+     "switzerland": "CH",
+     "singapore": "SG",
+     "hong kong": "HK",
+     "ireland": "IE",
+ }
+
+ # Well-known cities to countries
+ CITY_COUNTRY_MAP = {
+     "new york": ("New York", "United States"),
+     "nyc": ("New York", "United States"),
+     "london": ("London", "United Kingdom"),
+     "paris": ("Paris", "France"),
+     "tokyo": ("Tokyo", "Japan"),
+     "beijing": ("Beijing", "China"),
+     "shanghai": ("Shanghai", "China"),
+     "san francisco": ("San Francisco", "United States"),
+     "sf": ("San Francisco", "United States"),
+     "los angeles": ("Los Angeles", "United States"),
+     "la": ("Los Angeles", "United States"),
+     "chicago": ("Chicago", "United States"),
+     "berlin": ("Berlin", "Germany"),
+     "sydney": ("Sydney", "Australia"),
+     "toronto": ("Toronto", "Canada"),
+     "singapore": ("Singapore", "Singapore"),
+     "hong kong": ("Hong Kong", "China"),
+     "mumbai": ("Mumbai", "India"),
+     "bangalore": ("Bangalore", "India"),
+     "dublin": ("Dublin", "Ireland"),
+     "amsterdam": ("Amsterdam", "Netherlands"),
+     "zurich": ("Zurich", "Switzerland"),
+ }
+
+
+ def normalize_location(name: str) -> str:
+     """Normalize a location name for matching."""
+     return name.strip().lower().replace(".", "")
+
+
+ @PluginRegistry.canonicalizer
+ class LocationCanonicalizer(BaseCanonicalizerPlugin):
+     """
+     Canonicalizer for location entities (GPE, LOC).
+
+     Uses standardized country codes and known city mappings.
+     """
+
+     @property
+     def name(self) -> str:
+         return "location_canonicalizer"
+
+     @property
+     def priority(self) -> int:
+         return 10
+
+     @property
+     def capabilities(self) -> PluginCapability:
+         return PluginCapability.CACHING
+
+     @property
+     def description(self) -> str:
+         return "Resolves location entities using ISO codes and known mappings"
+
+     @property
+     def supported_entity_types(self) -> set[EntityType]:
+         return {EntityType.GPE, EntityType.LOC}
+
+     def find_canonical(
+         self,
+         entity: QualifiedEntity,
+         context: PipelineContext,
+     ) -> Optional[CanonicalMatch]:
+         """
+         Find canonical form for a location entity.
+
+         Args:
+             entity: Qualified entity to canonicalize
+             context: Pipeline context
+
+         Returns:
+             CanonicalMatch if found
+         """
+         normalized = normalize_location(entity.original_text)
+
+         # Check country aliases
+         if normalized in COUNTRY_ALIASES:
+             canonical_name = COUNTRY_ALIASES[normalized]
+             iso_code = ISO_CODES.get(canonical_name.lower())
+
+             return CanonicalMatch(
+                 canonical_id=iso_code,
+                 canonical_name=canonical_name,
+                 match_method="name_exact",
+                 match_confidence=1.0,
+                 match_details={"match_type": "country_alias"},
+             )
+
+         # Check ISO codes directly
+         if normalized in ISO_CODES:
+             canonical_name = normalized.title()
+             iso_code = ISO_CODES[normalized]
+
+             return CanonicalMatch(
+                 canonical_id=iso_code,
+                 canonical_name=canonical_name,
+                 match_method="name_exact",
+                 match_confidence=1.0,
+                 match_details={"match_type": "country_name"},
+             )
+
+         # Check city mappings
+         if normalized in CITY_COUNTRY_MAP:
+             city_name, country_name = CITY_COUNTRY_MAP[normalized]
+             iso_code = ISO_CODES.get(country_name.lower())
+
+             return CanonicalMatch(
+                 canonical_id=iso_code,
+                 canonical_name=city_name,
+                 match_method="name_exact",
+                 match_confidence=0.95,
+                 match_details={"match_type": "city_mapping", "country": country_name},
+             )
+
+         # Check qualifiers for country info
+         if entity.qualifiers.country:
+             country_normalized = normalize_location(entity.qualifiers.country)
+             if country_normalized in ISO_CODES:
+                 return CanonicalMatch(
+                     canonical_id=ISO_CODES[country_normalized],
+                     canonical_name=entity.original_text,
+                     match_method="identifier",
+                     match_confidence=0.9,
+                     match_details={"match_type": "qualifier_country"},
+                 )
+
+         return None
+
+     def format_fqn(
+         self,
+         entity: QualifiedEntity,
+         match: Optional[CanonicalMatch],
+     ) -> str:
+         """Format FQN for a location."""
+         base_name = match.canonical_name if match else entity.original_text
+
+         parts = []
+
+         # Add country if it's a city
+         if match and match.match_details:
+             country = match.match_details.get("country")
+             if country:
+                 parts.append(country)
+
+         # Add ISO code
+         if match and match.canonical_id:
+             parts.append(match.canonical_id)
+
+         if parts:
+             return f"{base_name} ({', '.join(parts)})"
+         return base_name
+
+
+ # Allow importing without decorator for testing
+ LocationCanonicalizerClass = LocationCanonicalizer
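
Note (not part of the diff): the lookup order in find_canonical can be exercised through the module-level tables alone, without building QualifiedEntity or PipelineContext objects. The snippet below is an illustrative walk-through of the alias → country-name → city fallthrough defined above.

```python
# Illustrative only: mirrors the fallthrough in LocationCanonicalizer.find_canonical().
from statement_extractor.plugins.canonicalizers.location import (
    CITY_COUNTRY_MAP,
    COUNTRY_ALIASES,
    ISO_CODES,
    normalize_location,
)

for raw in ["U.S.A.", "Britain", "NYC", "Zurich"]:
    key = normalize_location(raw)  # trimmed, lowercased, dots stripped
    if key in COUNTRY_ALIASES:
        # e.g. "usa" or "britain" -> canonical country name, then ISO code
        country = COUNTRY_ALIASES[key]
        print(f"{raw} -> {country} ({ISO_CODES.get(country.lower())})")
    elif key in ISO_CODES:
        # full country name given directly, e.g. "germany"
        print(f"{raw} -> {key.title()} ({ISO_CODES[key]})")
    elif key in CITY_COUNTRY_MAP:
        # known city, e.g. "nyc" -> ("New York", "United States")
        city, country = CITY_COUNTRY_MAP[key]
        print(f"{raw} -> {city}, {country} ({ISO_CODES.get(country.lower())})")
    else:
        print(f"{raw} -> no canonical match")
```

Given the tables above, this prints United States (US), United Kingdom (GB), New York, United States (US), and Zurich, Switzerland (CH); the canonicalizer itself would wrap the same results in CanonicalMatch objects.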