corp-extractor 0.5.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +191 -24
- corp_extractor-0.9.0.dist-info/RECORD +76 -0
- statement_extractor/__init__.py +1 -1
- statement_extractor/cli.py +1227 -10
- statement_extractor/data/statement_taxonomy.json +6949 -1159
- statement_extractor/database/__init__.py +52 -0
- statement_extractor/database/embeddings.py +186 -0
- statement_extractor/database/hub.py +520 -0
- statement_extractor/database/importers/__init__.py +24 -0
- statement_extractor/database/importers/companies_house.py +545 -0
- statement_extractor/database/importers/gleif.py +538 -0
- statement_extractor/database/importers/sec_edgar.py +375 -0
- statement_extractor/database/importers/wikidata.py +1012 -0
- statement_extractor/database/importers/wikidata_people.py +632 -0
- statement_extractor/database/models.py +230 -0
- statement_extractor/database/resolver.py +245 -0
- statement_extractor/database/store.py +1609 -0
- statement_extractor/document/__init__.py +62 -0
- statement_extractor/document/chunker.py +410 -0
- statement_extractor/document/context.py +171 -0
- statement_extractor/document/deduplicator.py +173 -0
- statement_extractor/document/html_extractor.py +246 -0
- statement_extractor/document/loader.py +303 -0
- statement_extractor/document/pipeline.py +388 -0
- statement_extractor/document/summarizer.py +195 -0
- statement_extractor/models/__init__.py +16 -1
- statement_extractor/models/canonical.py +44 -1
- statement_extractor/models/document.py +308 -0
- statement_extractor/models/labels.py +47 -18
- statement_extractor/models/qualifiers.py +51 -3
- statement_extractor/models/statement.py +26 -0
- statement_extractor/pipeline/config.py +6 -11
- statement_extractor/pipeline/orchestrator.py +80 -111
- statement_extractor/pipeline/registry.py +52 -46
- statement_extractor/plugins/__init__.py +20 -8
- statement_extractor/plugins/base.py +334 -64
- statement_extractor/plugins/extractors/gliner2.py +10 -0
- statement_extractor/plugins/labelers/taxonomy.py +18 -5
- statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
- statement_extractor/plugins/pdf/__init__.py +10 -0
- statement_extractor/plugins/pdf/pypdf.py +291 -0
- statement_extractor/plugins/qualifiers/__init__.py +11 -0
- statement_extractor/plugins/qualifiers/companies_house.py +14 -3
- statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
- statement_extractor/plugins/qualifiers/gleif.py +14 -3
- statement_extractor/plugins/qualifiers/person.py +578 -14
- statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
- statement_extractor/plugins/scrapers/__init__.py +10 -0
- statement_extractor/plugins/scrapers/http.py +236 -0
- statement_extractor/plugins/splitters/t5_gemma.py +158 -53
- statement_extractor/plugins/taxonomy/embedding.py +193 -46
- statement_extractor/plugins/taxonomy/mnli.py +16 -4
- statement_extractor/scoring.py +8 -8
- corp_extractor-0.5.0.dist-info/RECORD +0 -55
- statement_extractor/plugins/canonicalizers/__init__.py +0 -17
- statement_extractor/plugins/canonicalizers/base.py +0 -9
- statement_extractor/plugins/canonicalizers/location.py +0 -219
- statement_extractor/plugins/canonicalizers/organization.py +0 -230
- statement_extractor/plugins/canonicalizers/person.py +0 -242
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
|
@@ -1,219 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
LocationCanonicalizer - Resolves location entities to canonical forms.
|
|
3
|
-
|
|
4
|
-
Uses:
|
|
5
|
-
1. ISO country code exact match
|
|
6
|
-
2. Known city/country mappings
|
|
7
|
-
3. Geohash matching for coordinates (if available)
|
|
8
|
-
"""
|
|
9
|
-
|
|
10
|
-
import logging
|
|
11
|
-
from typing import Optional
|
|
12
|
-
|
|
13
|
-
from ..base import BaseCanonicalizerPlugin, PluginCapability
|
|
14
|
-
from ...pipeline.context import PipelineContext
|
|
15
|
-
from ...pipeline.registry import PluginRegistry
|
|
16
|
-
from ...models import QualifiedEntity, CanonicalMatch, EntityType
|
|
17
|
-
|
|
18
|
-
logger = logging.getLogger(__name__)
|
|
19
|
-
|
|
20
|
-
# Lower-cased alias -> canonical country name.
# Lookups routed through normalize_location() have their periods removed
# first, so they land on the undotted spellings ("us", "usa", "uk").
COUNTRY_ALIASES = {
    alias: country
    for country, aliases in (
        (
            "United States",
            (
                "usa",
                "us",
                "united states of america",
                "u.s.",
                "u.s.a.",
                "america",
            ),
        ),
        (
            "United Kingdom",
            ("uk", "u.k.", "great britain", "britain", "england"),
        ),
        ("United Arab Emirates", ("uae",)),
        (
            "China",
            ("prc", "peoples republic of china", "people's republic of china"),
        ),
    )
    for alias in aliases
}
|
|
38
|
-
|
|
39
|
-
# Lower-cased country name -> ISO 3166-1 alpha-2 code (common countries only).
ISO_CODES = dict(
    (
        ("united states", "US"),
        ("united kingdom", "GB"),
        ("china", "CN"),
        ("germany", "DE"),
        ("france", "FR"),
        ("japan", "JP"),
        ("canada", "CA"),
        ("australia", "AU"),
        ("india", "IN"),
        ("brazil", "BR"),
        ("russia", "RU"),
        ("italy", "IT"),
        ("spain", "ES"),
        ("mexico", "MX"),
        ("south korea", "KR"),
        ("netherlands", "NL"),
        ("switzerland", "CH"),
        ("singapore", "SG"),
        ("hong kong", "HK"),
        ("ireland", "IE"),
    )
)
|
|
62
|
-
|
|
63
|
-
# Lower-cased city name or nickname -> (display city name, country name).
CITY_COUNTRY_MAP = {
    key: (city, country)
    for key, city, country in (
        ("new york", "New York", "United States"),
        ("nyc", "New York", "United States"),
        ("london", "London", "United Kingdom"),
        ("paris", "Paris", "France"),
        ("tokyo", "Tokyo", "Japan"),
        ("beijing", "Beijing", "China"),
        ("shanghai", "Shanghai", "China"),
        ("san francisco", "San Francisco", "United States"),
        ("sf", "San Francisco", "United States"),
        ("los angeles", "Los Angeles", "United States"),
        ("la", "Los Angeles", "United States"),
        ("chicago", "Chicago", "United States"),
        ("berlin", "Berlin", "Germany"),
        ("sydney", "Sydney", "Australia"),
        ("toronto", "Toronto", "Canada"),
        ("singapore", "Singapore", "Singapore"),
        ("hong kong", "Hong Kong", "China"),
        ("mumbai", "Mumbai", "India"),
        ("bangalore", "Bangalore", "India"),
        ("dublin", "Dublin", "Ireland"),
        ("amsterdam", "Amsterdam", "Netherlands"),
        ("zurich", "Zurich", "Switzerland"),
    )
}
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
def normalize_location(name: str) -> str:
    """Return *name* trimmed, lower-cased and with every period removed."""
    trimmed = name.strip()
    lowered = trimmed.lower()
    return lowered.replace(".", "")
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
@PluginRegistry.canonicalizer
class LocationCanonicalizer(BaseCanonicalizerPlugin):
    """
    Canonicalizer for location entities (GPE, LOC).

    Uses standardized country codes and known city mappings.
    """

    @property
    def name(self) -> str:
        return "location_canonicalizer"

    @property
    def priority(self) -> int:
        return 10

    @property
    def capabilities(self) -> PluginCapability:
        return PluginCapability.CACHING

    @property
    def description(self) -> str:
        return "Resolves location entities using ISO codes and known mappings"

    @property
    def supported_entity_types(self) -> set[EntityType]:
        return {EntityType.GPE, EntityType.LOC}

    def find_canonical(
        self,
        entity: QualifiedEntity,
        context: PipelineContext,
    ) -> Optional[CanonicalMatch]:
        """
        Find canonical form for a location entity.

        Lookups are tried in this order, and the first hit is returned:
        country alias, country name, known city, then the country carried
        in the entity's qualifiers.

        Args:
            entity: Qualified entity to canonicalize
            context: Pipeline context (not consulted by this plugin)

        Returns:
            CanonicalMatch if found, otherwise None
        """
        normalized = normalize_location(entity.original_text)

        # Check country aliases
        if normalized in COUNTRY_ALIASES:
            canonical_name = COUNTRY_ALIASES[normalized]
            # May be None: not every aliased country has an ISO_CODES entry.
            iso_code = ISO_CODES.get(canonical_name.lower())

            return CanonicalMatch(
                canonical_id=iso_code,
                canonical_name=canonical_name,
                match_method="name_exact",
                match_confidence=1.0,
                match_details={"match_type": "country_alias"},
            )

        # Check ISO codes directly
        if normalized in ISO_CODES:
            canonical_name = normalized.title()
            iso_code = ISO_CODES[normalized]

            return CanonicalMatch(
                canonical_id=iso_code,
                canonical_name=canonical_name,
                match_method="name_exact",
                match_confidence=1.0,
                match_details={"match_type": "country_name"},
            )

        # Check city mappings
        if normalized in CITY_COUNTRY_MAP:
            city_name, country_name = CITY_COUNTRY_MAP[normalized]
            iso_code = ISO_CODES.get(country_name.lower())

            return CanonicalMatch(
                canonical_id=iso_code,
                canonical_name=city_name,
                match_method="name_exact",
                match_confidence=0.95,
                match_details={"match_type": "city_mapping", "country": country_name},
            )

        # Check qualifiers for country info
        if entity.qualifiers.country:
            country_normalized = normalize_location(entity.qualifiers.country)
            # The qualifier may hold an alias ("USA", "U.K.") rather than the
            # full country name; map it to the canonical name before the ISO
            # lookup, exactly as is done for the entity text above.
            country_normalized = COUNTRY_ALIASES.get(
                country_normalized, country_normalized
            ).lower()
            if country_normalized in ISO_CODES:
                return CanonicalMatch(
                    canonical_id=ISO_CODES[country_normalized],
                    canonical_name=entity.original_text,
                    match_method="identifier",
                    match_confidence=0.9,
                    match_details={"match_type": "qualifier_country"},
                )

        return None

    def format_fqn(
        self,
        entity: QualifiedEntity,
        match: Optional[CanonicalMatch],
    ) -> str:
        """
        Format FQN for a location.

        Produces "<name> (<country>, <code>)", where the parenthesised parts
        are included only when the match provides them; with no parts the
        bare name is returned.
        """
        base_name = match.canonical_name if match else entity.original_text

        parts = []

        # Add country if it's a city (only city matches store a "country" detail)
        if match and match.match_details:
            country = match.match_details.get("country")
            if country:
                parts.append(country)

        # Add ISO code
        if match and match.canonical_id:
            parts.append(match.canonical_id)

        if parts:
            return f"{base_name} ({', '.join(parts)})"
        return base_name


# Second public name for the class defined above.
LocationCanonicalizerClass = LocationCanonicalizer
|
|
@@ -1,230 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
OrganizationCanonicalizer - Resolves ORG entities to canonical forms.
|
|
3
|
-
|
|
4
|
-
Uses a tiered matching approach:
|
|
5
|
-
1. LEI exact match (confidence 1.0)
|
|
6
|
-
2. Company number + jurisdiction (confidence 0.95)
|
|
7
|
-
3. Trigram fuzzy name match (confidence 0.85+)
|
|
8
|
-
4. LLM verification for uncertain matches (confidence 0.6-0.85)
|
|
9
|
-
"""
|
|
10
|
-
|
|
11
|
-
import logging
|
|
12
|
-
import re
|
|
13
|
-
from typing import Optional
|
|
14
|
-
|
|
15
|
-
from ..base import BaseCanonicalizerPlugin, PluginCapability
|
|
16
|
-
from ...pipeline.context import PipelineContext
|
|
17
|
-
from ...pipeline.registry import PluginRegistry
|
|
18
|
-
from ...models import QualifiedEntity, CanonicalMatch, EntityType
|
|
19
|
-
|
|
20
|
-
logger = logging.getLogger(__name__)
|
|
21
|
-
|
|
22
|
-
# Common organization suffixes to normalize.
# Order matters: normalize_org_name applies each pattern once, in sequence.
# The compound forms ("& Co", ", Inc") must come before the bare "Co" / "Inc"
# patterns; otherwise the bare pattern eats the tail first and leaves a
# dangling "&" or "," that blocks every later pattern.
ORG_SUFFIXES = [
    r'\s+&\s+Co\.?$', r',\s+Inc\.?$',
    r'\s+Inc\.?$', r'\s+Corp\.?$', r'\s+Corporation$',
    r'\s+Ltd\.?$', r'\s+Limited$', r'\s+LLC$',
    r'\s+LLP$', r'\s+PLC$', r'\s+Co\.?$',
    r'\s+Company$', r'\s+Group$', r'\s+Holdings$',
]


def normalize_org_name(name: str) -> str:
    """
    Normalize organization name for matching.

    Strips surrounding whitespace, removes trailing legal-form suffixes
    (case-insensitively, one pass over ORG_SUFFIXES), drops any trailing
    commas/periods/whitespace left behind, and lower-cases the result.

    Args:
        name: Raw organization name

    Returns:
        Lower-cased name without recognised suffixes
    """
    normalized = name.strip()
    for pattern in ORG_SUFFIXES:
        normalized = re.sub(pattern, '', normalized, flags=re.IGNORECASE)
    # Strip trailing punctuation (commas, periods) that may be left after suffix removal
    normalized = re.sub(r'[,.\s]+$', '', normalized)
    return normalized.strip().lower()
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
def trigram_similarity(a: str, b: str) -> float:
    """
    Score how alike two strings are by their shared character trigrams.

    Both strings are lower-cased and cut into overlapping 3-character
    windows; a string shorter than three characters contributes itself as
    its only window. The score is |shared windows| / |all windows|, and is
    0.0 when either input is empty.
    """
    if not (a and b):
        return 0.0

    def _windows(text: str) -> set:
        lowered = text.lower()
        if len(lowered) < 3:
            return {lowered}
        return {lowered[pos:pos + 3] for pos in range(len(lowered) - 2)}

    left = _windows(a)
    right = _windows(b)

    if not left or not right:
        return 0.0

    shared = len(left & right)
    total = len(left | right)

    if total > 0:
        return shared / total
    return 0.0
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
@PluginRegistry.canonicalizer
class OrganizationCanonicalizer(BaseCanonicalizerPlugin):
    """
    Canonicalizer for ORG entities.

    Uses tiered matching approach with identifier, name, and fuzzy matching.
    """

    def __init__(
        self,
        fuzzy_threshold: float = 0.75,
        use_llm_verification: bool = False,
    ):
        """
        Initialize the organization canonicalizer.

        Args:
            fuzzy_threshold: Minimum trigram similarity for fuzzy matches
            use_llm_verification: Whether to use LLM for uncertain matches
                (in this class it only affects the reported capabilities)
        """
        self._fuzzy_threshold = fuzzy_threshold
        self._use_llm_verification = use_llm_verification

    @property
    def name(self) -> str:
        return "organization_canonicalizer"

    @property
    def priority(self) -> int:
        return 10

    @property
    def capabilities(self) -> PluginCapability:
        caps = PluginCapability.CACHING
        if self._use_llm_verification:
            caps |= PluginCapability.LLM_REQUIRED
        return caps

    @property
    def description(self) -> str:
        return "Resolves ORG entities to canonical forms using identifier and name matching"

    @property
    def supported_entity_types(self) -> set[EntityType]:
        return {EntityType.ORG}

    def find_canonical(
        self,
        entity: QualifiedEntity,
        context: PipelineContext,
    ) -> Optional[CanonicalMatch]:
        """
        Find canonical form for an ORG entity.

        Identifier tiers are tried first (LEI, then company number with
        jurisdiction, then SEC CIK); if none applies, the entity name is
        fuzzy-matched against the other ORG entities in the context.

        Args:
            entity: Qualified entity to canonicalize
            context: Pipeline context

        Returns:
            CanonicalMatch if found, otherwise None
        """
        qualifiers = entity.qualifiers
        identifiers = qualifiers.identifiers

        # Tier 1: LEI exact match
        if "lei" in identifiers:
            return CanonicalMatch(
                canonical_id=identifiers["lei"],
                canonical_name=entity.original_text,
                match_method="identifier",
                match_confidence=1.0,
                match_details={"identifier_type": "lei"},
            )

        # Tier 2: Company number + jurisdiction
        if "ch_number" in identifiers and qualifiers.jurisdiction:
            return CanonicalMatch(
                canonical_id=f"{qualifiers.jurisdiction}:{identifiers['ch_number']}",
                canonical_name=entity.original_text,
                match_method="identifier",
                match_confidence=0.95,
                match_details={"identifier_type": "ch_number", "jurisdiction": qualifiers.jurisdiction},
            )

        if "sec_cik" in identifiers:
            ticker = identifiers.get("ticker", "")
            # The ticker, when present, becomes part of the canonical name.
            canonical_name = f"{entity.original_text} ({ticker})" if ticker else entity.original_text
            return CanonicalMatch(
                canonical_id=f"SEC:{identifiers['sec_cik']}",
                canonical_name=canonical_name,
                match_method="identifier",
                match_confidence=0.95,
                match_details={"identifier_type": "sec_cik", "ticker": ticker},
            )

        # Tier 3: Fuzzy name match against other ORG entities in context
        best_match = self._find_fuzzy_match(entity, context)
        if best_match:
            return best_match

        # No canonical match found
        return None

    def _find_fuzzy_match(
        self,
        entity: QualifiedEntity,
        context: PipelineContext,
    ) -> Optional[CanonicalMatch]:
        """
        Find fuzzy matches against other ORG entities in context.

        Returns a "name_fuzzy" match for the most similar other ORG entity
        whose trigram similarity reaches the configured threshold, or None.
        """
        normalized_name = normalize_org_name(entity.original_text)

        best_match = None
        best_similarity = 0.0

        for other_ref, other_entity in context.qualified_entities.items():
            # Never match an entity against itself.
            if other_ref == entity.entity_ref:
                continue

            if other_entity.entity_type != EntityType.ORG:
                continue

            other_normalized = normalize_org_name(other_entity.original_text)
            similarity = trigram_similarity(normalized_name, other_normalized)

            # Strict ">" keeps the first candidate seen on ties.
            if similarity > best_similarity and similarity >= self._fuzzy_threshold:
                best_similarity = similarity
                best_match = other_entity

        if best_match and best_similarity >= self._fuzzy_threshold:
            # Confidence starts at 0.85 at the threshold and grows with the
            # margin above it, capped at 0.95.
            confidence = 0.85 + (best_similarity - self._fuzzy_threshold) * 0.1
            confidence = min(confidence, 0.95)

            return CanonicalMatch(
                canonical_id=None,
                canonical_name=best_match.original_text,
                match_method="name_fuzzy",
                match_confidence=confidence,
                match_details={"similarity": best_similarity, "matched_entity": best_match.entity_ref},
            )

        return None

    def format_fqn(
        self,
        entity: QualifiedEntity,
        match: Optional[CanonicalMatch],
    ) -> str:
        """
        Format FQN for an organization.

        Produces "<name> (<ticker>, <jurisdiction>)", including only the
        parts that are available; with no parts the bare name is returned.
        """
        base_name = match.canonical_name if match else entity.original_text

        parts = []
        identifiers = entity.qualifiers.identifiers

        # Add ticker if available
        if "ticker" in identifiers:
            ticker = identifiers["ticker"]
            # find_canonical's SEC branch already appends "(TICKER)" to the
            # canonical name; adding it again would yield "Name (T) (T)".
            if not base_name.endswith(f"({ticker})"):
                parts.append(ticker)

        # Add jurisdiction
        if entity.qualifiers.jurisdiction:
            parts.append(entity.qualifiers.jurisdiction)

        if parts:
            return f"{base_name} ({', '.join(parts)})"
        return base_name


# Second public name for the class defined above.
OrganizationCanonicalizerClass = OrganizationCanonicalizer
|
|
@@ -1,242 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
PersonCanonicalizer - Resolves PERSON entities to canonical forms.
|
|
3
|
-
|
|
4
|
-
Uses:
|
|
5
|
-
1. Name variants (Tim vs Timothy)
|
|
6
|
-
2. Role + org context matching
|
|
7
|
-
3. LLM identity verification for uncertain matches
|
|
8
|
-
"""
|
|
9
|
-
|
|
10
|
-
import logging
|
|
11
|
-
import re
|
|
12
|
-
from typing import Optional
|
|
13
|
-
|
|
14
|
-
from ..base import BaseCanonicalizerPlugin, PluginCapability
|
|
15
|
-
from ...pipeline.context import PipelineContext
|
|
16
|
-
from ...pipeline.registry import PluginRegistry
|
|
17
|
-
from ...models import QualifiedEntity, CanonicalMatch, EntityType
|
|
18
|
-
|
|
19
|
-
logger = logging.getLogger(__name__)
|
|
20
|
-
|
|
21
|
-
# Lower-cased first name -> list of first names treated as the same person.
# Each row is (name, space-separated variants); the variants string is split
# into the list stored in the mapping.
NAME_VARIANTS = {
    given: variants.split()
    for given, variants in (
        ("tim", "timothy"),
        ("timothy", "tim"),
        ("mike", "michael"),
        ("michael", "mike"),
        ("bob", "robert"),
        ("robert", "bob rob"),
        ("rob", "robert"),
        ("bill", "william"),
        ("william", "bill will"),
        ("will", "william"),
        ("jim", "james"),
        ("james", "jim jimmy"),
        ("jimmy", "james"),
        ("tom", "thomas"),
        ("thomas", "tom tommy"),
        ("joe", "joseph"),
        ("joseph", "joe"),
        ("alex", "alexander alexandra"),
        ("alexander", "alex"),
        ("alexandra", "alex"),
        ("dan", "daniel"),
        ("daniel", "dan danny"),
        ("dave", "david"),
        ("david", "dave"),
        ("ed", "edward edwin"),
        ("edward", "ed eddie"),
        ("jen", "jennifer"),
        ("jennifer", "jen jenny"),
        ("kate", "katherine catherine"),
        ("katherine", "kate kathy"),
        ("catherine", "kate cathy"),
        ("chris", "christopher christine"),
        ("christopher", "chris"),
        ("christine", "chris"),
        ("matt", "matthew"),
        ("matthew", "matt"),
        ("nick", "nicholas"),
        ("nicholas", "nick"),
        ("sam", "samuel samantha"),
        ("samuel", "sam"),
        ("samantha", "sam"),
        ("steve", "steven stephen"),
        ("steven", "steve"),
        ("stephen", "steve"),
    )
}
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
def normalize_person_name(name: str) -> str:
    """
    Normalize a person name for matching.

    Removes one leading honorific (Mr, Mrs, Ms, Dr, Prof) and one trailing
    generational suffix (Jr, Sr, II, III, IV, V), then lower-cases.

    Args:
        name: Raw person name

    Returns:
        Lower-cased name without title or suffix
    """
    # Trim first: both patterns below are anchored to the string ends, so
    # surrounding whitespace would otherwise stop them from matching.
    name = name.strip()
    # Remove titles
    name = re.sub(r'^(Mr\.?|Mrs\.?|Ms\.?|Dr\.?|Prof\.?)\s+', '', name, flags=re.IGNORECASE)
    # Remove suffixes; the optional comma covers the "Smith, Jr." spelling
    name = re.sub(r',?\s+(Jr\.?|Sr\.?|III?|IV|V)$', '', name, flags=re.IGNORECASE)
    return name.strip().lower()
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
def get_name_parts(name: str) -> tuple[str, str]:
    """
    Return (first, last) tokens of the normalized name.

    Middle tokens are ignored. A single-token name yields an empty last
    name; a name with no tokens yields two empty strings.
    """
    tokens = normalize_person_name(name).split()
    if not tokens:
        return "", ""
    if len(tokens) == 1:
        return tokens[0], ""
    return tokens[0], tokens[-1]
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
def names_match(name1: str, name2: str) -> tuple[bool, float, bool]:
    """
    Check if two names match, considering variants.

    Last names are compared only when both names have one. First names
    match either exactly (confidence 1.0) or through NAME_VARIANTS in
    either direction (confidence 0.9).

    Returns (matches, confidence, is_variant).
    """
    first1, last1 = get_name_parts(name1)
    first2, last2 = get_name_parts(name2)

    # A name that normalizes to nothing carries no evidence. Without this
    # guard two blank names compare equal below and report a 1.0 match.
    if not first1 or not first2:
        return False, 0.0, False

    # Last names must match (if both present)
    if last1 and last2 and last1 != last2:
        return False, 0.0, False

    # Check first name match
    if first1 == first2:
        return True, 1.0, False

    # Check variants (the table is not symmetric, so look both ways)
    variants1 = NAME_VARIANTS.get(first1, [])
    variants2 = NAME_VARIANTS.get(first2, [])

    if first2 in variants1 or first1 in variants2:
        return True, 0.9, True

    return False, 0.0, False
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
@PluginRegistry.canonicalizer
class PersonCanonicalizer(BaseCanonicalizerPlugin):
    """
    Canonicalizer for PERSON entities.

    Matches a person against the other PERSON entities in the pipeline
    context by name (including nickname variants), optionally raising the
    score when role and organization agree.
    """

    def __init__(
        self,
        use_context_matching: bool = True,
    ):
        """
        Initialize the person canonicalizer.

        Args:
            use_context_matching: Whether to use role+org for disambiguation
        """
        self._use_context_matching = use_context_matching

    @property
    def name(self) -> str:
        return "person_canonicalizer"

    @property
    def priority(self) -> int:
        return 10

    @property
    def capabilities(self) -> PluginCapability:
        return PluginCapability.CACHING

    @property
    def description(self) -> str:
        return "Resolves PERSON entities using name variants and context"

    @property
    def supported_entity_types(self) -> set[EntityType]:
        return {EntityType.PERSON}

    def find_canonical(
        self,
        entity: QualifiedEntity,
        context: PipelineContext,
    ) -> Optional[CanonicalMatch]:
        """
        Find canonical form for a PERSON entity.

        Args:
            entity: Qualified entity to canonicalize
            context: Pipeline context

        Returns:
            CanonicalMatch for the highest-scoring other PERSON entity when
            its score is at least 0.8, otherwise None
        """
        winner = None
        winner_score = 0.0
        winner_is_variant = False

        for ref, candidate in context.qualified_entities.items():
            # Skip the entity itself and anything that is not a person.
            if ref == entity.entity_ref or candidate.entity_type != EntityType.PERSON:
                continue

            matched, score, via_variant = names_match(
                entity.original_text, candidate.original_text
            )
            if not matched:
                continue

            # Each of role and org adds 0.05 when both sides have a value
            # and the values agree case-insensitively (capped at 1.0).
            if self._use_context_matching and score > 0:
                mine = entity.qualifiers
                theirs = candidate.qualifiers
                for field in ("role", "org"):
                    own_value = getattr(mine, field)
                    other_value = getattr(theirs, field)
                    if own_value and other_value and own_value.lower() == other_value.lower():
                        score = min(score + 0.05, 1.0)

            # Strict ">" keeps the earliest candidate on ties.
            if score > winner_score:
                winner = candidate
                winner_score = score
                winner_is_variant = via_variant

        if not winner or winner_score < 0.8:
            return None

        method = "name_variant" if winner_is_variant else "name_exact"
        return CanonicalMatch(
            canonical_id=None,
            canonical_name=winner.original_text,
            match_method=method,
            match_confidence=winner_score,
            match_details={"matched_entity": winner.entity_ref},
        )

    def format_fqn(
        self,
        entity: QualifiedEntity,
        match: Optional[CanonicalMatch],
    ) -> str:
        """Format FQN for a person as "<name> (<role>, <org>)", omitting absent parts."""
        label = entity.original_text if not match else match.canonical_name

        quals = entity.qualifiers
        extras = [value for value in (quals.role, quals.org) if value]

        if not extras:
            return label
        return f"{label} ({', '.join(extras)})"


# Second public name for the class defined above.
PersonCanonicalizerClass = PersonCanonicalizer
|
|
File without changes
|
|
File without changes
|