corp-extractor 0.5.0__py3-none-any.whl → 0.9.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/METADATA +228 -30
  2. corp_extractor-0.9.3.dist-info/RECORD +79 -0
  3. statement_extractor/__init__.py +1 -1
  4. statement_extractor/cli.py +2030 -24
  5. statement_extractor/data/statement_taxonomy.json +6949 -1159
  6. statement_extractor/database/__init__.py +52 -0
  7. statement_extractor/database/embeddings.py +186 -0
  8. statement_extractor/database/hub.py +428 -0
  9. statement_extractor/database/importers/__init__.py +32 -0
  10. statement_extractor/database/importers/companies_house.py +559 -0
  11. statement_extractor/database/importers/companies_house_officers.py +431 -0
  12. statement_extractor/database/importers/gleif.py +561 -0
  13. statement_extractor/database/importers/sec_edgar.py +392 -0
  14. statement_extractor/database/importers/sec_form4.py +512 -0
  15. statement_extractor/database/importers/wikidata.py +1120 -0
  16. statement_extractor/database/importers/wikidata_dump.py +1951 -0
  17. statement_extractor/database/importers/wikidata_people.py +1130 -0
  18. statement_extractor/database/models.py +254 -0
  19. statement_extractor/database/resolver.py +245 -0
  20. statement_extractor/database/store.py +3034 -0
  21. statement_extractor/document/__init__.py +62 -0
  22. statement_extractor/document/chunker.py +410 -0
  23. statement_extractor/document/context.py +171 -0
  24. statement_extractor/document/deduplicator.py +171 -0
  25. statement_extractor/document/html_extractor.py +246 -0
  26. statement_extractor/document/loader.py +303 -0
  27. statement_extractor/document/pipeline.py +388 -0
  28. statement_extractor/document/summarizer.py +195 -0
  29. statement_extractor/extractor.py +1 -1
  30. statement_extractor/models/__init__.py +19 -3
  31. statement_extractor/models/canonical.py +44 -1
  32. statement_extractor/models/document.py +308 -0
  33. statement_extractor/models/labels.py +47 -18
  34. statement_extractor/models/qualifiers.py +51 -3
  35. statement_extractor/models/statement.py +39 -15
  36. statement_extractor/models.py +1 -1
  37. statement_extractor/pipeline/config.py +6 -11
  38. statement_extractor/pipeline/context.py +5 -5
  39. statement_extractor/pipeline/orchestrator.py +90 -121
  40. statement_extractor/pipeline/registry.py +52 -46
  41. statement_extractor/plugins/__init__.py +20 -8
  42. statement_extractor/plugins/base.py +348 -78
  43. statement_extractor/plugins/extractors/gliner2.py +38 -28
  44. statement_extractor/plugins/labelers/taxonomy.py +18 -5
  45. statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
  46. statement_extractor/plugins/pdf/__init__.py +10 -0
  47. statement_extractor/plugins/pdf/pypdf.py +291 -0
  48. statement_extractor/plugins/qualifiers/__init__.py +11 -0
  49. statement_extractor/plugins/qualifiers/companies_house.py +14 -3
  50. statement_extractor/plugins/qualifiers/embedding_company.py +422 -0
  51. statement_extractor/plugins/qualifiers/gleif.py +14 -3
  52. statement_extractor/plugins/qualifiers/person.py +588 -14
  53. statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
  54. statement_extractor/plugins/scrapers/__init__.py +10 -0
  55. statement_extractor/plugins/scrapers/http.py +236 -0
  56. statement_extractor/plugins/splitters/t5_gemma.py +176 -75
  57. statement_extractor/plugins/taxonomy/embedding.py +193 -46
  58. statement_extractor/plugins/taxonomy/mnli.py +16 -4
  59. statement_extractor/scoring.py +8 -8
  60. corp_extractor-0.5.0.dist-info/RECORD +0 -55
  61. statement_extractor/plugins/canonicalizers/__init__.py +0 -17
  62. statement_extractor/plugins/canonicalizers/base.py +0 -9
  63. statement_extractor/plugins/canonicalizers/location.py +0 -219
  64. statement_extractor/plugins/canonicalizers/organization.py +0 -230
  65. statement_extractor/plugins/canonicalizers/person.py +0 -242
  66. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/WHEEL +0 -0
  67. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/entry_points.txt +0 -0
@@ -1,219 +0,0 @@
1
- """
2
- LocationCanonicalizer - Resolves location entities to canonical forms.
3
-
4
- Uses:
5
- 1. ISO country code exact match
6
- 2. Known city/country mappings
7
- 3. Country taken from the entity's qualifiers (geohash matching for coordinates is not implemented in this module)
8
- """
9
-
10
- import logging
11
- from typing import Optional
12
-
13
- from ..base import BaseCanonicalizerPlugin, PluginCapability
14
- from ...pipeline.context import PipelineContext
15
- from ...pipeline.registry import PluginRegistry
16
- from ...models import QualifiedEntity, CanonicalMatch, EntityType
17
-
18
- logger = logging.getLogger(__name__)
19
-
20
# Lower-cased alias -> canonical country name, declared once per country.
# Insertion order matches the flat literal this replaces.
COUNTRY_ALIASES = {
    alias: country
    for country, aliases in (
        (
            "United States",
            ("usa", "us", "united states of america", "u.s.", "u.s.a.", "america"),
        ),
        ("United Kingdom", ("uk", "u.k.", "great britain", "britain", "england")),
        ("United Arab Emirates", ("uae",)),
        ("China", ("prc", "peoples republic of china", "people's republic of china")),
    )
    for alias in aliases
}
38
-
39
# Lower-cased country name -> ISO 3166-1 alpha-2 code (common countries only).
# Written as (code, name) pairs so the codes line up for review.
ISO_CODES = {
    country: code
    for code, country in (
        ("US", "united states"),
        ("GB", "united kingdom"),
        ("CN", "china"),
        ("DE", "germany"),
        ("FR", "france"),
        ("JP", "japan"),
        ("CA", "canada"),
        ("AU", "australia"),
        ("IN", "india"),
        ("BR", "brazil"),
        ("RU", "russia"),
        ("IT", "italy"),
        ("ES", "spain"),
        ("MX", "mexico"),
        ("KR", "south korea"),
        ("NL", "netherlands"),
        ("CH", "switzerland"),
        ("SG", "singapore"),
        ("HK", "hong kong"),
        ("IE", "ireland"),
    )
}
62
-
63
# Lower-cased city key -> (display city name, country name).
# Each city is declared once together with every key that maps to it;
# insertion order matches the flat literal this replaces.
CITY_COUNTRY_MAP = {
    key: (city, country)
    for city, country, keys in (
        ("New York", "United States", ("new york", "nyc")),
        ("London", "United Kingdom", ("london",)),
        ("Paris", "France", ("paris",)),
        ("Tokyo", "Japan", ("tokyo",)),
        ("Beijing", "China", ("beijing",)),
        ("Shanghai", "China", ("shanghai",)),
        ("San Francisco", "United States", ("san francisco", "sf")),
        ("Los Angeles", "United States", ("los angeles", "la")),
        ("Chicago", "United States", ("chicago",)),
        ("Berlin", "Germany", ("berlin",)),
        ("Sydney", "Australia", ("sydney",)),
        ("Toronto", "Canada", ("toronto",)),
        ("Singapore", "Singapore", ("singapore",)),
        ("Hong Kong", "China", ("hong kong",)),
        ("Mumbai", "India", ("mumbai",)),
        ("Bangalore", "India", ("bangalore",)),
        ("Dublin", "Ireland", ("dublin",)),
        ("Amsterdam", "Netherlands", ("amsterdam",)),
        ("Zurich", "Switzerland", ("zurich",)),
    )
    for key in keys
}
88
-
89
-
90
def normalize_location(name: str) -> str:
    """Normalize a location name for table lookups.

    Periods are removed, every run of whitespace (including newlines) is
    collapsed to a single space, surrounding whitespace is dropped and the
    result is lower-cased, e.g. ``"U.S.A."`` -> ``"usa"`` and
    ``"New   York"`` -> ``"new york"``.

    Args:
        name: Raw location text.

    Returns:
        The normalized lookup key; ``""`` for empty or whitespace-only input.
    """
    # Remove periods first so whitespace they were hiding (". london") is
    # cleaned up by the split/join below as well.
    without_periods = name.replace(".", "")
    return " ".join(without_periods.split()).lower()
93
-
94
-
95
@PluginRegistry.canonicalizer
class LocationCanonicalizer(BaseCanonicalizerPlugin):
    """
    Canonicalizer for location entities (GPE, LOC).

    ``find_canonical`` tries, in order: country alias, country name, known
    city, and finally the country recorded in the entity's qualifiers.
    """

    @property
    def name(self) -> str:
        return "location_canonicalizer"

    @property
    def priority(self) -> int:
        return 10

    @property
    def capabilities(self) -> PluginCapability:
        return PluginCapability.CACHING

    @property
    def description(self) -> str:
        return "Resolves location entities using ISO codes and known mappings"

    @property
    def supported_entity_types(self) -> set[EntityType]:
        return {EntityType.GPE, EntityType.LOC}

    @staticmethod
    def _country_code(country: str) -> Optional[str]:
        """Resolve a country given as a name, alias or ISO code to its ISO code.

        Returns None when the country is not covered by COUNTRY_ALIASES or
        ISO_CODES.
        """
        normalized = normalize_location(country)
        # "usa" / "uk" style aliases are mapped to the full country name first.
        country_name = COUNTRY_ALIASES.get(normalized, normalized).lower()
        if country_name in ISO_CODES:
            return ISO_CODES[country_name]
        # The qualifier may already hold an alpha-2 code such as "DE".
        code = normalized.upper()
        return code if code in ISO_CODES.values() else None

    def find_canonical(
        self,
        entity: QualifiedEntity,
        context: PipelineContext,
    ) -> Optional[CanonicalMatch]:
        """
        Find canonical form for a location entity.

        Args:
            entity: Qualified entity to canonicalize
            context: Pipeline context (not used by this canonicalizer)

        Returns:
            CanonicalMatch if found, otherwise None. ``canonical_id`` is the
            ISO code when one is known; it is None for an alias whose country
            has no ISO_CODES entry (e.g. "uae").
        """
        normalized = normalize_location(entity.original_text)

        # Check country aliases
        if normalized in COUNTRY_ALIASES:
            canonical_name = COUNTRY_ALIASES[normalized]
            iso_code = ISO_CODES.get(canonical_name.lower())

            return CanonicalMatch(
                canonical_id=iso_code,
                canonical_name=canonical_name,
                match_method="name_exact",
                match_confidence=1.0,
                match_details={"match_type": "country_alias"},
            )

        # Check ISO codes directly (the text is a full country name).
        # "hong kong" and "singapore" are both country names and city keys;
        # they are resolved here, before the city table is consulted.
        if normalized in ISO_CODES:
            canonical_name = normalized.title()
            iso_code = ISO_CODES[normalized]

            return CanonicalMatch(
                canonical_id=iso_code,
                canonical_name=canonical_name,
                match_method="name_exact",
                match_confidence=1.0,
                match_details={"match_type": "country_name"},
            )

        # Check city mappings
        if normalized in CITY_COUNTRY_MAP:
            city_name, country_name = CITY_COUNTRY_MAP[normalized]
            iso_code = ISO_CODES.get(country_name.lower())

            return CanonicalMatch(
                canonical_id=iso_code,
                canonical_name=city_name,
                match_method="name_exact",
                match_confidence=0.95,
                match_details={"match_type": "city_mapping", "country": country_name},
            )

        # Check qualifiers for country info. The qualifier is accepted as a
        # full name, a known alias ("USA", "UK") or an ISO code ("DE").
        country = entity.qualifiers.country
        if country:
            iso_code = self._country_code(country)
            if iso_code:
                return CanonicalMatch(
                    canonical_id=iso_code,
                    canonical_name=entity.original_text,
                    match_method="identifier",
                    match_confidence=0.9,
                    match_details={"match_type": "qualifier_country"},
                )

        return None

    def format_fqn(
        self,
        entity: QualifiedEntity,
        match: Optional[CanonicalMatch],
    ) -> str:
        """Format FQN for a location.

        Produces ``"<name> (<country>, <ISO code>)"``, omitting whichever
        parts are unavailable; without a match the entity's original text is
        returned unchanged.
        """
        base_name = match.canonical_name if match else entity.original_text

        parts = []

        # Add country if it's a city
        if match and match.match_details:
            country = match.match_details.get("country")
            if country:
                parts.append(country)

        # Add ISO code
        if match and match.canonical_id:
            parts.append(match.canonical_id)

        if parts:
            return f"{base_name} ({', '.join(parts)})"
        return base_name


# Allow importing without decorator for testing
LocationCanonicalizerClass = LocationCanonicalizer
@@ -1,230 +0,0 @@
1
- """
2
- OrganizationCanonicalizer - Resolves ORG entities to canonical forms.
3
-
4
- Uses a tiered matching approach:
5
- 1. LEI exact match (confidence 1.0)
6
- 2. Company number + jurisdiction (confidence 0.95)
7
- 3. Trigram fuzzy name match (confidence 0.85+)
8
- 4. LLM verification for uncertain matches (confidence 0.6-0.85) - planned; not implemented in this module
9
- """
10
-
11
- import logging
12
- import re
13
- from typing import Optional
14
-
15
- from ..base import BaseCanonicalizerPlugin, PluginCapability
16
- from ...pipeline.context import PipelineContext
17
- from ...pipeline.registry import PluginRegistry
18
- from ...models import QualifiedEntity, CanonicalMatch, EntityType
19
-
20
- logger = logging.getLogger(__name__)
21
-
22
# Common organization suffixes to normalize
ORG_SUFFIXES = [
    r'\s+Inc\.?$', r'\s+Corp\.?$', r'\s+Corporation$',
    r'\s+Ltd\.?$', r'\s+Limited$', r'\s+LLC$',
    r'\s+LLP$', r'\s+PLC$', r'\s+Co\.?$',
    r'\s+Company$', r'\s+Group$', r'\s+Holdings$',
    r'\s+&\s+Co\.?$', r',\s+Inc\.?$',
]


def normalize_org_name(name: str) -> str:
    """Normalize an organization name for matching.

    Legal-form suffixes listed in ORG_SUFFIXES and trailing punctuation are
    removed repeatedly until nothing changes, then the result is lower-cased.
    Stacked suffixes are therefore fully stripped regardless of their order
    in ORG_SUFFIXES: ``"Acme Co., Ltd."`` -> ``"acme"``.

    Every suffix pattern requires leading whitespace or a comma, so the first
    word of the name is never removed (``"Group"`` stays ``"group"``).

    Args:
        name: Raw organization name.

    Returns:
        The normalized, lower-cased name.
    """
    normalized = name.strip()
    # Try all suffixes as one alternation so the leftmost (i.e. longest)
    # suffix wins: " & Co." is removed as a unit instead of " Co." leaving a
    # dangling "&". Built per call so edits to ORG_SUFFIXES are honoured;
    # `re` caches the compiled pattern.
    suffix_re = re.compile(
        "|".join(f"(?:{pattern})" for pattern in ORG_SUFFIXES),
        re.IGNORECASE,
    )
    while True:
        stripped = suffix_re.sub('', normalized)
        # Strip trailing punctuation (commas, periods) that may be left after suffix removal
        stripped = re.sub(r'[,.\s]+$', '', stripped)
        if stripped == normalized:
            break
        normalized = stripped
    return normalized.lower()
40
-
41
-
42
def trigram_similarity(a: str, b: str) -> float:
    """Calculate trigram similarity between two strings.

    Both strings are lower-cased and split into sets of 3-character
    substrings (a string shorter than three characters contributes itself as
    its only element). The score is the size of the intersection of the two
    sets divided by the size of their union.

    Args:
        a: First string.
        b: Second string.

    Returns:
        A value in [0.0, 1.0]; 0.0 if either string is empty.
    """
    if not a or not b:
        return 0.0

    def get_trigrams(s: str) -> set[str]:
        s = s.lower()
        if len(s) < 3:
            return {s}
        return {s[i:i + 3] for i in range(len(s) - 2)}

    trigrams_a = get_trigrams(a)
    trigrams_b = get_trigrams(b)

    # Both sets are non-empty for non-empty input, so the union cannot be
    # empty and no zero-division guard is needed.
    return len(trigrams_a & trigrams_b) / len(trigrams_a | trigrams_b)
61
-
62
-
63
@PluginRegistry.canonicalizer
class OrganizationCanonicalizer(BaseCanonicalizerPlugin):
    """
    Canonicalizer for ORG entities.

    Matching is tiered: identifiers first (LEI, company number with
    jurisdiction, SEC CIK), then a trigram fuzzy name match against the other
    ORG entities in the pipeline context.
    """

    def __init__(
        self,
        fuzzy_threshold: float = 0.75,
        use_llm_verification: bool = False,
    ):
        """
        Initialize the organization canonicalizer.

        Args:
            fuzzy_threshold: Minimum trigram similarity for fuzzy matches
            use_llm_verification: Whether to use LLM for uncertain matches.
                In this class the flag only changes the reported
                capabilities; no matching step reads it.
        """
        self._fuzzy_threshold = fuzzy_threshold
        self._use_llm_verification = use_llm_verification

    @property
    def name(self) -> str:
        return "organization_canonicalizer"

    @property
    def priority(self) -> int:
        return 10

    @property
    def capabilities(self) -> PluginCapability:
        caps = PluginCapability.CACHING
        if self._use_llm_verification:
            caps |= PluginCapability.LLM_REQUIRED
        return caps

    @property
    def description(self) -> str:
        return "Resolves ORG entities to canonical forms using identifier and name matching"

    @property
    def supported_entity_types(self) -> set[EntityType]:
        return {EntityType.ORG}

    def find_canonical(
        self,
        entity: QualifiedEntity,
        context: PipelineContext,
    ) -> Optional[CanonicalMatch]:
        """
        Find canonical form for an ORG entity.

        Args:
            entity: Qualified entity to canonicalize
            context: Pipeline context

        Returns:
            CanonicalMatch if found, otherwise None
        """
        qualifiers = entity.qualifiers
        identifiers = qualifiers.identifiers

        # Tier 1: LEI exact match
        if "lei" in identifiers:
            return CanonicalMatch(
                canonical_id=identifiers["lei"],
                canonical_name=entity.original_text,
                match_method="identifier",
                match_confidence=1.0,
                match_details={"identifier_type": "lei"},
            )

        # Tier 2: Company number + jurisdiction
        if "ch_number" in identifiers and qualifiers.jurisdiction:
            return CanonicalMatch(
                canonical_id=f"{qualifiers.jurisdiction}:{identifiers['ch_number']}",
                canonical_name=entity.original_text,
                match_method="identifier",
                match_confidence=0.95,
                match_details={"identifier_type": "ch_number", "jurisdiction": qualifiers.jurisdiction},
            )

        # Tier 2 (continued): SEC CIK. The ticker, when present, is appended
        # to the canonical name.
        if "sec_cik" in identifiers:
            ticker = identifiers.get("ticker", "")
            canonical_name = f"{entity.original_text} ({ticker})" if ticker else entity.original_text
            return CanonicalMatch(
                canonical_id=f"SEC:{identifiers['sec_cik']}",
                canonical_name=canonical_name,
                match_method="identifier",
                match_confidence=0.95,
                match_details={"identifier_type": "sec_cik", "ticker": ticker},
            )

        # Tier 3: Fuzzy name match against other ORG entities in context;
        # None when nothing reaches the threshold.
        return self._find_fuzzy_match(entity, context)

    def _find_fuzzy_match(
        self,
        entity: QualifiedEntity,
        context: PipelineContext,
    ) -> Optional[CanonicalMatch]:
        """Find the most similar other ORG entity in context.

        Names are compared after ``normalize_org_name`` using
        ``trigram_similarity``. Only candidates at or above the configured
        threshold qualify; on ties the first one encountered is kept.
        """
        normalized_name = normalize_org_name(entity.original_text)

        best_match = None
        best_similarity = 0.0

        for other_ref, other_entity in context.qualified_entities.items():
            if other_ref == entity.entity_ref:
                continue

            if other_entity.entity_type != EntityType.ORG:
                continue

            other_normalized = normalize_org_name(other_entity.original_text)
            similarity = trigram_similarity(normalized_name, other_normalized)

            if similarity > best_similarity and similarity >= self._fuzzy_threshold:
                best_similarity = similarity
                best_match = other_entity

        # best_match is only ever set from a candidate that met the
        # threshold, so no second threshold check is required.
        if best_match is None:
            return None

        # Confidence starts at 0.85 and grows by 0.1 per unit of similarity
        # above the threshold, capped at 0.95.
        confidence = 0.85 + (best_similarity - self._fuzzy_threshold) * 0.1
        confidence = min(confidence, 0.95)

        return CanonicalMatch(
            canonical_id=None,
            canonical_name=best_match.original_text,
            match_method="name_fuzzy",
            match_confidence=confidence,
            match_details={"similarity": best_similarity, "matched_entity": best_match.entity_ref},
        )

    def format_fqn(
        self,
        entity: QualifiedEntity,
        match: Optional[CanonicalMatch],
    ) -> str:
        """Format FQN for an organization.

        Produces ``"<name> (<ticker>, <jurisdiction>)"``, omitting whichever
        parts are unavailable.
        """
        base_name = match.canonical_name if match else entity.original_text

        parts = []
        identifiers = entity.qualifiers.identifiers

        # Add ticker if available. Skip it when it is empty or when the
        # canonical name already ends with "(<ticker>)" (the SEC CIK tier
        # builds the name that way), otherwise it would appear twice.
        ticker = identifiers.get("ticker")
        if ticker and not base_name.endswith(f"({ticker})"):
            parts.append(ticker)

        # Add jurisdiction
        if entity.qualifiers.jurisdiction:
            parts.append(entity.qualifiers.jurisdiction)

        if parts:
            return f"{base_name} ({', '.join(parts)})"
        return base_name


# Allow importing without decorator for testing
OrganizationCanonicalizerClass = OrganizationCanonicalizer
@@ -1,242 +0,0 @@
1
- """
2
- PersonCanonicalizer - Resolves PERSON entities to canonical forms.
3
-
4
- Uses:
5
- 1. Name variants (Tim vs Timothy)
6
- 2. Role + org context matching
7
- 3. LLM identity verification for uncertain matches (planned; not implemented in this module)
8
- """
9
-
10
- import logging
11
- import re
12
- from typing import Optional
13
-
14
- from ..base import BaseCanonicalizerPlugin, PluginCapability
15
- from ...pipeline.context import PipelineContext
16
- from ...pipeline.registry import PluginRegistry
17
- from ...models import QualifiedEntity, CanonicalMatch, EntityType
18
-
19
- logger = logging.getLogger(__name__)
20
-
21
# First name (lower-case) -> list of names treated as the same first name.
# Written as "key", "space separated variants" rows; each row is split into
# the list value. Insertion order matches the flat literal this replaces.
NAME_VARIANTS = {
    first_name: variants.split()
    for first_name, variants in (
        ("tim", "timothy"),
        ("timothy", "tim"),
        ("mike", "michael"),
        ("michael", "mike"),
        ("bob", "robert"),
        ("robert", "bob rob"),
        ("rob", "robert"),
        ("bill", "william"),
        ("william", "bill will"),
        ("will", "william"),
        ("jim", "james"),
        ("james", "jim jimmy"),
        ("jimmy", "james"),
        ("tom", "thomas"),
        ("thomas", "tom tommy"),
        ("joe", "joseph"),
        ("joseph", "joe"),
        ("alex", "alexander alexandra"),
        ("alexander", "alex"),
        ("alexandra", "alex"),
        ("dan", "daniel"),
        ("daniel", "dan danny"),
        ("dave", "david"),
        ("david", "dave"),
        ("ed", "edward edwin"),
        ("edward", "ed eddie"),
        ("jen", "jennifer"),
        ("jennifer", "jen jenny"),
        ("kate", "katherine catherine"),
        ("katherine", "kate kathy"),
        ("catherine", "kate cathy"),
        ("chris", "christopher christine"),
        ("christopher", "chris"),
        ("christine", "chris"),
        ("matt", "matthew"),
        ("matthew", "matt"),
        ("nick", "nicholas"),
        ("nicholas", "nick"),
        ("sam", "samuel samantha"),
        ("samuel", "sam"),
        ("samantha", "sam"),
        ("steve", "steven stephen"),
        ("steven", "steve"),
        ("stephen", "steve"),
    )
}
68
-
69
-
70
def normalize_person_name(name: str) -> str:
    """Normalize a person name for matching.

    Strips surrounding whitespace, one leading title (Mr, Mrs, Ms, Dr, Prof,
    with or without a period) and one trailing generational suffix (Jr, Sr,
    II, III, IV, V), then lower-cases the result. A comma before the suffix
    is removed with it, so ``"John Smith, Jr."`` -> ``"john smith"``.

    Args:
        name: Raw person name.

    Returns:
        The normalized, lower-cased name.
    """
    # Strip first: the title pattern is anchored at the start and the suffix
    # pattern at the end, so stray whitespace would defeat both.
    cleaned = name.strip()
    # Remove titles
    cleaned = re.sub(r'^(Mr\.?|Mrs\.?|Ms\.?|Dr\.?|Prof\.?)\s+', '', cleaned, flags=re.IGNORECASE)
    # Remove suffixes, including an optional comma in front of them
    cleaned = re.sub(r',?\s+(Jr\.?|Sr\.?|III?|IV|V)$', '', cleaned, flags=re.IGNORECASE)
    return cleaned.strip().lower()
77
-
78
-
79
def get_name_parts(name: str) -> tuple[str, str]:
    """Return ``(first, last)`` for *name* after normalization.

    The name is normalized with ``normalize_person_name`` and split on
    whitespace. The first token is the first name and the final token is the
    last name; middle tokens are ignored. A single-token name yields an empty
    last name, and an empty name yields ``("", "")``.
    """
    tokens = normalize_person_name(name).split()
    if not tokens:
        return "", ""
    last = tokens[-1] if len(tokens) > 1 else ""
    return tokens[0], last
88
-
89
-
90
def names_match(name1: str, name2: str) -> tuple[bool, float, bool]:
    """
    Check if two names match, considering variants.

    Names are reduced to (first, last) with ``get_name_parts``. When both
    last names are present they must be equal. First names match either
    exactly (confidence 1.0) or through NAME_VARIANTS in either direction
    (confidence 0.9). A name with no first-name token never matches.

    Returns (matches, confidence, is_variant).
    """
    first1, last1 = get_name_parts(name1)
    first2, last2 = get_name_parts(name2)

    # An empty name carries no evidence; without this guard two empty names
    # would compare equal and be reported as an exact match.
    if not first1 or not first2:
        return False, 0.0, False

    # Last names must match (if both present)
    if last1 and last2 and last1 != last2:
        return False, 0.0, False

    # Check first name match
    if first1 == first2:
        return True, 1.0, False

    # Check variants in both directions, since the table is not symmetric
    # (e.g. "danny" appears only as a variant of "daniel").
    if first2 in NAME_VARIANTS.get(first1, ()) or first1 in NAME_VARIANTS.get(first2, ()):
        return True, 0.9, True

    return False, 0.0, False
115
-
116
-
117
@PluginRegistry.canonicalizer
class PersonCanonicalizer(BaseCanonicalizerPlugin):
    """
    Canonicalizer for PERSON entities.

    Links a person to another PERSON entity in the pipeline context whose
    name matches, optionally raising confidence when role and organization
    agree.
    """

    def __init__(
        self,
        use_context_matching: bool = True,
    ):
        """
        Initialize the person canonicalizer.

        Args:
            use_context_matching: Whether to use role+org for disambiguation
        """
        self._use_context_matching = use_context_matching

    @property
    def name(self) -> str:
        return "person_canonicalizer"

    @property
    def priority(self) -> int:
        return 10

    @property
    def capabilities(self) -> PluginCapability:
        return PluginCapability.CACHING

    @property
    def description(self) -> str:
        return "Resolves PERSON entities using name variants and context"

    @property
    def supported_entity_types(self) -> set[EntityType]:
        return {EntityType.PERSON}

    def find_canonical(
        self,
        entity: QualifiedEntity,
        context: PipelineContext,
    ) -> Optional[CanonicalMatch]:
        """
        Find canonical form for a PERSON entity.

        Every other PERSON entity in the context is scored with
        ``names_match``; the highest-scoring candidate wins (the first one on
        ties) and is returned if its confidence is at least 0.8.

        Args:
            entity: Qualified entity to canonicalize
            context: Pipeline context

        Returns:
            CanonicalMatch if found, otherwise None
        """
        winner = None
        winner_score = 0.0
        winner_via_variant = False

        for ref, candidate in context.qualified_entities.items():
            # Ignore the entity itself and anything that is not a person.
            if ref == entity.entity_ref or candidate.entity_type != EntityType.PERSON:
                continue

            matched, score, via_variant = names_match(
                entity.original_text, candidate.original_text
            )
            if not matched:
                continue

            # A shared role and a shared organization each add 0.05,
            # never taking the score above 1.0.
            if self._use_context_matching and score > 0:
                own = entity.qualifiers
                theirs = candidate.qualifiers

                if own.role and theirs.role and own.role.lower() == theirs.role.lower():
                    score = min(score + 0.05, 1.0)

                if own.org and theirs.org and own.org.lower() == theirs.org.lower():
                    score = min(score + 0.05, 1.0)

            if score > winner_score:
                winner, winner_score, winner_via_variant = candidate, score, via_variant

        if not (winner and winner_score >= 0.8):
            return None

        method = "name_variant" if winner_via_variant else "name_exact"
        return CanonicalMatch(
            canonical_id=None,
            canonical_name=winner.original_text,
            match_method=method,
            match_confidence=winner_score,
            match_details={"matched_entity": winner.entity_ref},
        )

    def format_fqn(
        self,
        entity: QualifiedEntity,
        match: Optional[CanonicalMatch],
    ) -> str:
        """Format FQN for a person.

        Produces ``"<name> (<role>, <org>)"``, omitting whichever qualifiers
        are missing; with neither present the bare name is returned.
        """
        label = entity.original_text if not match else match.canonical_name

        quals = entity.qualifiers
        extras = [value for value in (quals.role, quals.org) if value]

        return f"{label} ({', '.join(extras)})" if extras else label


# Allow importing without decorator for testing
PersonCanonicalizerClass = PersonCanonicalizer