nomenklatura-mpt 4.1.12__py3-none-any.whl → 4.1.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nomenklatura/matching/logic_v2/names/match.py +63 -4
- nomenklatura/matching/logic_v4/model.py +7 -14
- {nomenklatura_mpt-4.1.12.dist-info → nomenklatura_mpt-4.1.14.dist-info}/METADATA +3 -23
- {nomenklatura_mpt-4.1.12.dist-info → nomenklatura_mpt-4.1.14.dist-info}/RECORD +7 -7
- {nomenklatura_mpt-4.1.12.dist-info → nomenklatura_mpt-4.1.14.dist-info}/WHEEL +0 -0
- {nomenklatura_mpt-4.1.12.dist-info → nomenklatura_mpt-4.1.14.dist-info}/entry_points.txt +0 -0
- {nomenklatura_mpt-4.1.12.dist-info → nomenklatura_mpt-4.1.14.dist-info}/licenses/LICENSE +0 -0
@@ -5,7 +5,7 @@ from rigour.names import remove_obj_prefixes
|
|
5
5
|
from followthemoney.proxy import E, EntityProxy
|
6
6
|
from followthemoney import model
|
7
7
|
from followthemoney.types import registry
|
8
|
-
from
|
8
|
+
from rigour.names import schema_type_tag
|
9
9
|
|
10
10
|
from nomenklatura.matching.logic_v2.names.analysis import entity_names
|
11
11
|
from nomenklatura.matching.logic_v2.names.magic import weight_extra_match
|
@@ -198,15 +198,74 @@ def name_match(query: E, result: E, config: ScoringConfig) -> FtResult:
|
|
198
198
|
|
199
199
|
|
200
200
|
def name_match_levenshtein(query: E, result: E, config: ScoringConfig) -> FtResult:
|
201
|
-
"""Match two entities
|
201
|
+
"""Match two entities using Levenshtein distance on their normalized names.
|
202
|
+
|
203
|
+
This function compares entities by computing the Levenshtein distance between
|
204
|
+
all pairs of their names and returns the best match score.
|
205
|
+
|
206
|
+
Args:
|
207
|
+
query: The query entity to match
|
208
|
+
result: The result entity to match against
|
209
|
+
config: Scoring configuration parameters
|
210
|
+
|
211
|
+
Returns:
|
212
|
+
FtResult with score (0.0-1.0) and detail string explaining the match
|
213
|
+
"""
|
202
214
|
schema = model.common_schema(query.schema, result.schema)
|
203
215
|
type_tag = schema_type_tag(schema)
|
204
216
|
best = FtResult(score=0.0, detail=None)
|
217
|
+
|
205
218
|
if type_tag == NameTypeTag.UNK:
|
206
219
|
# Name matching is not supported for entities that are not listed
|
207
220
|
# as a person, organization, or a thing.
|
208
221
|
best.detail = "Unsuited for name matching: %s" % schema.name
|
209
222
|
return best
|
210
|
-
|
211
|
-
|
223
|
+
|
224
|
+
# For object types (vessels, assets, etc.), use the existing object name matching
|
225
|
+
if type_tag == NameTypeTag.OBJ:
|
226
|
+
return match_object_names(query, result, config)
|
227
|
+
|
228
|
+
# Get all names for both entities
|
229
|
+
query_names = entity_names(type_tag, query, is_query=True)
|
230
|
+
result_names = entity_names(type_tag, result)
|
231
|
+
|
232
|
+
if not query_names or not result_names:
|
233
|
+
best.detail = "No names available for matching"
|
234
|
+
return best
|
235
|
+
|
236
|
+
# Check for literal matches first (early return for efficiency)
|
237
|
+
query_comparable = {name.comparable: name for name in query_names}
|
238
|
+
result_comparable = {name.comparable: name for name in result_names}
|
239
|
+
common = set(query_comparable).intersection(result_comparable)
|
240
|
+
if len(common) > 0:
|
241
|
+
longest = max(common, key=len)
|
242
|
+
best.detail = f"[{longest!r} literalMatch]"
|
243
|
+
return FtResult(score=1.0, detail=best.detail)
|
244
|
+
|
245
|
+
# Consolidate names (remove short names contained in longer names)
|
246
|
+
query_names = Name.consolidate_names(query_names)
|
247
|
+
result_names = Name.consolidate_names(result_names)
|
248
|
+
|
249
|
+
# Compare all pairs of names using Levenshtein distance
|
250
|
+
for query_name in query_names:
|
251
|
+
for result_name in result_names:
|
252
|
+
# Get the comparable forms (normalized strings)
|
253
|
+
query_str = query_name.comparable
|
254
|
+
result_str = result_name.comparable
|
255
|
+
|
256
|
+
# Use strict_levenshtein which already implements a good scoring mechanism
|
257
|
+
# max_rate=4 means we allow up to len/4 edits
|
258
|
+
score = strict_levenshtein(query_str, result_str, max_rate=4)
|
259
|
+
|
260
|
+
if score > best.score:
|
261
|
+
best.score = score
|
262
|
+
if score == 1.0:
|
263
|
+
best.detail = f"[{query_str!r}≡{result_str!r}, exactMatch]"
|
264
|
+
else:
|
265
|
+
best.detail = f"[{query_str!r}≈{result_str!r}, levenshteinScore: {score:.4f}]"
|
266
|
+
|
267
|
+
if best.detail is None:
|
268
|
+
best.detail = "No sufficient Levenshtein match found"
|
269
|
+
|
270
|
+
return best
|
212
271
|
|
@@ -1,7 +1,7 @@
|
|
1
1
|
from typing import Dict, List
|
2
2
|
|
3
|
-
from nomenklatura.matching.logic_v1.
|
4
|
-
from nomenklatura.matching.
|
3
|
+
from nomenklatura.matching.logic_v1.identifiers import orgid_disjoint
|
4
|
+
from nomenklatura.matching.logic_v3.multi import numbers_mismatch
|
5
5
|
from nomenklatura.matching.types import Feature, FtResult, HeuristicAlgorithm
|
6
6
|
from nomenklatura.matching.types import ConfigVar, ConfigVarType
|
7
7
|
from nomenklatura.matching.compare.countries import country_mismatch
|
@@ -9,17 +9,14 @@ from nomenklatura.matching.compare.gender import gender_mismatch
|
|
9
9
|
from nomenklatura.matching.compare.identifiers import crypto_wallet_address
|
10
10
|
from nomenklatura.matching.compare.identifiers import identifier_match
|
11
11
|
from nomenklatura.matching.compare.dates import dob_day_disjoint, dob_year_disjoint
|
12
|
-
from nomenklatura.matching.compare.names import weak_alias_match
|
12
|
+
from nomenklatura.matching.compare.names import last_name_mismatch, weak_alias_match
|
13
13
|
from nomenklatura.matching.compare.addresses import address_entity_match
|
14
|
-
from nomenklatura.matching.compare.addresses import address_prop_match
|
15
14
|
from nomenklatura.matching.logic_v2.names.match import name_match_levenshtein
|
16
15
|
from nomenklatura.matching.logic_v2.identifiers import bic_code_match
|
17
16
|
from nomenklatura.matching.logic_v2.identifiers import inn_code_match, ogrn_code_match
|
18
17
|
from nomenklatura.matching.logic_v2.identifiers import isin_security_match
|
19
18
|
from nomenklatura.matching.logic_v2.identifiers import lei_code_match
|
20
19
|
from nomenklatura.matching.logic_v2.identifiers import vessel_imo_mmsi_match
|
21
|
-
from nomenklatura.matching.logic_v2.identifiers import uei_code_match
|
22
|
-
from nomenklatura.matching.logic_v2.identifiers import npi_code_match
|
23
20
|
from nomenklatura.matching.util import FNUL
|
24
21
|
|
25
22
|
|
@@ -34,10 +31,6 @@ class LogicV4(HeuristicAlgorithm):
|
|
34
31
|
NAME = "logic-v4"
|
35
32
|
features = [
|
36
33
|
Feature(func=name_match_levenshtein, weight=1.0),
|
37
|
-
Feature(func=FtResult.wrap(person_name_phonetic_match), weight=0.9),
|
38
|
-
# These are there so they can be enabled using custom weights:
|
39
|
-
Feature(func=FtResult.wrap(name_metaphone_match), weight=FNUL),
|
40
|
-
Feature(func=FtResult.wrap(name_soundex_match), weight=FNUL),
|
41
34
|
Feature(func=address_entity_match, weight=0.98),
|
42
35
|
Feature(func=crypto_wallet_address, weight=0.98),
|
43
36
|
Feature(func=isin_security_match, weight=0.98),
|
@@ -46,15 +39,15 @@ class LogicV4(HeuristicAlgorithm):
|
|
46
39
|
Feature(func=vessel_imo_mmsi_match, weight=0.95),
|
47
40
|
Feature(func=inn_code_match, weight=0.95),
|
48
41
|
Feature(func=bic_code_match, weight=0.95),
|
49
|
-
Feature(func=uei_code_match, weight=0.95),
|
50
|
-
Feature(func=npi_code_match, weight=0.95),
|
51
42
|
Feature(func=identifier_match, weight=0.85),
|
52
43
|
Feature(func=weak_alias_match, weight=0.8),
|
53
|
-
Feature(func=address_prop_match, weight=0.2, qualifier=True),
|
54
44
|
Feature(func=country_mismatch, weight=-0.2, qualifier=True),
|
45
|
+
Feature(func=FtResult.wrap(last_name_mismatch), weight=-0.2, qualifier=True),
|
55
46
|
Feature(func=dob_year_disjoint, weight=-0.15, qualifier=True),
|
56
|
-
Feature(func=dob_day_disjoint, weight=-0.
|
47
|
+
Feature(func=dob_day_disjoint, weight=-0.2, qualifier=True),
|
57
48
|
Feature(func=gender_mismatch, weight=-0.2, qualifier=True),
|
49
|
+
Feature(func=orgid_disjoint, weight=-0.2, qualifier=True),
|
50
|
+
Feature(func=numbers_mismatch, weight=-0.1, qualifier=True),
|
58
51
|
]
|
59
52
|
CONFIG = {
|
60
53
|
"nm_number_mismatch": ConfigVar(
|
@@ -1,39 +1,19 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: nomenklatura_mpt
|
3
|
-
Version: 4.1.12
|
3
|
+
Version: 4.1.14
|
4
4
|
Summary: Make record linkages in followthemoney data.
|
5
5
|
Project-URL: Documentation, https://github.com/opensanctions/nomenklatura/
|
6
6
|
Project-URL: Repository, https://github.com/opensanctions/nomenklatura.git
|
7
7
|
Project-URL: Issues, https://github.com/opensanctions/nomenklatura/issues
|
8
8
|
Author-email: OpenSanctions <info@opensanctions.org>
|
9
|
-
License:
|
10
|
-
Copyright (c) 2023-2025, OpenSanctions Datenbanken GmbH
|
11
|
-
|
12
|
-
Permission is hereby granted, free of charge, to any person obtaining a
|
13
|
-
copy of this software and associated documentation files (the
|
14
|
-
"Software"), to deal in the Software without restriction, including
|
15
|
-
without limitation the rights to use, copy, modify, merge, publish,
|
16
|
-
distribute, sublicense, and/or sell copies of the Software, and to
|
17
|
-
permit persons to whom the Software is furnished to do so, subject to
|
18
|
-
the following conditions:
|
19
|
-
|
20
|
-
The above copyright notice and this permission notice shall be included
|
21
|
-
in all copies or substantial portions of the Software.
|
22
|
-
|
23
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
24
|
-
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
25
|
-
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
26
|
-
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
27
|
-
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
28
|
-
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
29
|
-
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
9
|
+
License: MIT
|
30
10
|
License-File: LICENSE
|
31
11
|
Classifier: Intended Audience :: Developers
|
32
12
|
Classifier: License :: OSI Approved :: MIT License
|
33
13
|
Classifier: Operating System :: OS Independent
|
34
14
|
Classifier: Programming Language :: Python :: 3.11
|
35
15
|
Classifier: Programming Language :: Python :: 3.12
|
36
|
-
Requires-Python:
|
16
|
+
Requires-Python: <3.14,>=3.11
|
37
17
|
Requires-Dist: click
|
38
18
|
Requires-Dist: fingerprints
|
39
19
|
Requires-Dist: followthemoney
|
@@ -63,7 +63,7 @@ nomenklatura/matching/logic_v2/names/__init__.py,sha256=PQ_meSq_MYSMZ_6NgLniKPDl
|
|
63
63
|
nomenklatura/matching/logic_v2/names/analysis.py,sha256=1EPNOSKU_lEOGalsRkTrpUu3DhX5NvKFH3qUaVFZeF8,2264
|
64
64
|
nomenklatura/matching/logic_v2/names/distance.py,sha256=G4q2AeBs009f9uoose06uYnhT_0vneelFusIzS-1NqQ,7005
|
65
65
|
nomenklatura/matching/logic_v2/names/magic.py,sha256=mWAqmMU-eB-Kp-XVNkRSxdECel3j25OiXsESDyqT5ws,2380
|
66
|
-
nomenklatura/matching/logic_v2/names/match.py,sha256=
|
66
|
+
nomenklatura/matching/logic_v2/names/match.py,sha256=7qt9f9kxpPRjW2YvetmidcDPQcv-a4TZT6zt9XCpsEw,12364
|
67
67
|
nomenklatura/matching/logic_v2/names/pairing.py,sha256=bmP3x21zvVTddUKG8HMfLxZ457ItbeO-3wNTrtQ5Ya8,2943
|
68
68
|
nomenklatura/matching/logic_v2/names/util.py,sha256=AHkg9k23O7HSv5uA1TCz-OYE9jlXuNDnCE_IXKltnFI,3061
|
69
69
|
nomenklatura/matching/logic_v3/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -73,7 +73,7 @@ nomenklatura/matching/logic_v3/multi.py,sha256=fdyBT89c0KsTtnobS-kPpeP7M15kaXoNw
|
|
73
73
|
nomenklatura/matching/logic_v3/phonetic.py,sha256=fAVvuLcNrnPc5llLiAiV1pdsIgdtdr3EIZCrozi4HZQ,4932
|
74
74
|
nomenklatura/matching/logic_v4/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
75
75
|
nomenklatura/matching/logic_v4/identifiers.py,sha256=soCJPu2EwaEbJvgHVXiUgfg8rGgoc_cSB-zH33fEFWM,4152
|
76
|
-
nomenklatura/matching/logic_v4/model.py,sha256=
|
76
|
+
nomenklatura/matching/logic_v4/model.py,sha256=AqR-sq-GqlU-em8DlHRQEyjadkidXvirbBDCQ3wLJ8E,4888
|
77
77
|
nomenklatura/matching/logic_v4/multi.py,sha256=fdyBT89c0KsTtnobS-kPpeP7M15kaXoNwMgXvRHdpwE,917
|
78
78
|
nomenklatura/matching/logic_v4/phonetic.py,sha256=fAVvuLcNrnPc5llLiAiV1pdsIgdtdr3EIZCrozi4HZQ,4932
|
79
79
|
nomenklatura/matching/name_based/__init__.py,sha256=1_CM8vvr1KpTZq-LfK9hU0NPLL4GfVo0KuwG5-Atc7g,186
|
@@ -122,8 +122,8 @@ nomenklatura/wikidata/props.py,sha256=7owcZFHK5Fa6kS8fiH-qJ5rorj2XibV62ayX7prZJy
|
|
122
122
|
nomenklatura/wikidata/qualified.py,sha256=glH7Oo_QgNG25VGdYqFykz7WIToDsinh7fJEdC4u8gU,1481
|
123
123
|
nomenklatura/wikidata/query.py,sha256=FR012bJPSJ9cdhGId3JoVL3g-VBSzgbvgCIl7Dh5xC8,2189
|
124
124
|
nomenklatura/wikidata/value.py,sha256=CNT5uB5nsHRO1w2gByesGoo6KTss3aild1U8TXLHTSI,2979
|
125
|
-
nomenklatura_mpt-4.1.
|
126
|
-
nomenklatura_mpt-4.1.12.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
127
|
-
nomenklatura_mpt-4.1.12.dist-info/entry_points.txt,sha256=jL6tKzNuFy4t00OWLf66NsLUi070GgfncBhiBsYX5fs,80
|
128
|
-
nomenklatura_mpt-4.1.12.dist-info/licenses/LICENSE,sha256=bAU8lurcfhKXu0FKlBx8gvWWPixioihFCEHRkuBpCQ8,1126
|
129
|
-
nomenklatura_mpt-4.1.12.dist-info/RECORD,,
|
125
|
+
nomenklatura_mpt-4.1.14.dist-info/METADATA,sha256=YC68zK6s0I43uKeDlY8khcuitBFe2BRDktbJcPrQudM,6775
|
126
|
+
nomenklatura_mpt-4.1.14.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
127
|
+
nomenklatura_mpt-4.1.14.dist-info/entry_points.txt,sha256=jL6tKzNuFy4t00OWLf66NsLUi070GgfncBhiBsYX5fs,80
|
128
|
+
nomenklatura_mpt-4.1.14.dist-info/licenses/LICENSE,sha256=bAU8lurcfhKXu0FKlBx8gvWWPixioihFCEHRkuBpCQ8,1126
|
129
|
+
nomenklatura_mpt-4.1.14.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|