nomenklatura-mpt 4.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nomenklatura/__init__.py +11 -0
- nomenklatura/cache.py +194 -0
- nomenklatura/cli.py +260 -0
- nomenklatura/conflicting_match.py +80 -0
- nomenklatura/data/er-unstable.pkl +0 -0
- nomenklatura/data/regression-v1.pkl +0 -0
- nomenklatura/db.py +139 -0
- nomenklatura/delta.py +4 -0
- nomenklatura/enrich/__init__.py +94 -0
- nomenklatura/enrich/aleph.py +141 -0
- nomenklatura/enrich/common.py +219 -0
- nomenklatura/enrich/nominatim.py +72 -0
- nomenklatura/enrich/opencorporates.py +233 -0
- nomenklatura/enrich/openfigi.py +124 -0
- nomenklatura/enrich/permid.py +201 -0
- nomenklatura/enrich/wikidata.py +268 -0
- nomenklatura/enrich/yente.py +116 -0
- nomenklatura/exceptions.py +9 -0
- nomenklatura/index/__init__.py +5 -0
- nomenklatura/index/common.py +24 -0
- nomenklatura/index/entry.py +89 -0
- nomenklatura/index/index.py +170 -0
- nomenklatura/index/tokenizer.py +92 -0
- nomenklatura/judgement.py +21 -0
- nomenklatura/kv.py +40 -0
- nomenklatura/matching/__init__.py +47 -0
- nomenklatura/matching/bench.py +32 -0
- nomenklatura/matching/compare/__init__.py +0 -0
- nomenklatura/matching/compare/addresses.py +71 -0
- nomenklatura/matching/compare/countries.py +15 -0
- nomenklatura/matching/compare/dates.py +83 -0
- nomenklatura/matching/compare/gender.py +15 -0
- nomenklatura/matching/compare/identifiers.py +30 -0
- nomenklatura/matching/compare/names.py +157 -0
- nomenklatura/matching/compare/util.py +51 -0
- nomenklatura/matching/compat.py +66 -0
- nomenklatura/matching/erun/__init__.py +0 -0
- nomenklatura/matching/erun/countries.py +42 -0
- nomenklatura/matching/erun/identifiers.py +64 -0
- nomenklatura/matching/erun/misc.py +71 -0
- nomenklatura/matching/erun/model.py +110 -0
- nomenklatura/matching/erun/names.py +126 -0
- nomenklatura/matching/erun/train.py +135 -0
- nomenklatura/matching/erun/util.py +28 -0
- nomenklatura/matching/logic_v1/__init__.py +0 -0
- nomenklatura/matching/logic_v1/identifiers.py +104 -0
- nomenklatura/matching/logic_v1/model.py +76 -0
- nomenklatura/matching/logic_v1/multi.py +21 -0
- nomenklatura/matching/logic_v1/phonetic.py +142 -0
- nomenklatura/matching/logic_v2/__init__.py +0 -0
- nomenklatura/matching/logic_v2/identifiers.py +124 -0
- nomenklatura/matching/logic_v2/model.py +98 -0
- nomenklatura/matching/logic_v2/names/__init__.py +3 -0
- nomenklatura/matching/logic_v2/names/analysis.py +51 -0
- nomenklatura/matching/logic_v2/names/distance.py +181 -0
- nomenklatura/matching/logic_v2/names/magic.py +60 -0
- nomenklatura/matching/logic_v2/names/match.py +195 -0
- nomenklatura/matching/logic_v2/names/pairing.py +81 -0
- nomenklatura/matching/logic_v2/names/util.py +89 -0
- nomenklatura/matching/name_based/__init__.py +4 -0
- nomenklatura/matching/name_based/misc.py +86 -0
- nomenklatura/matching/name_based/model.py +59 -0
- nomenklatura/matching/name_based/names.py +59 -0
- nomenklatura/matching/pairs.py +42 -0
- nomenklatura/matching/regression_v1/__init__.py +0 -0
- nomenklatura/matching/regression_v1/misc.py +75 -0
- nomenklatura/matching/regression_v1/model.py +110 -0
- nomenklatura/matching/regression_v1/names.py +63 -0
- nomenklatura/matching/regression_v1/train.py +87 -0
- nomenklatura/matching/regression_v1/util.py +31 -0
- nomenklatura/matching/svm_v1/__init__.py +5 -0
- nomenklatura/matching/svm_v1/misc.py +94 -0
- nomenklatura/matching/svm_v1/model.py +168 -0
- nomenklatura/matching/svm_v1/names.py +81 -0
- nomenklatura/matching/svm_v1/train.py +186 -0
- nomenklatura/matching/svm_v1/util.py +30 -0
- nomenklatura/matching/types.py +227 -0
- nomenklatura/matching/util.py +62 -0
- nomenklatura/publish/__init__.py +0 -0
- nomenklatura/publish/dates.py +49 -0
- nomenklatura/publish/edges.py +32 -0
- nomenklatura/py.typed +0 -0
- nomenklatura/resolver/__init__.py +6 -0
- nomenklatura/resolver/common.py +2 -0
- nomenklatura/resolver/edge.py +107 -0
- nomenklatura/resolver/identifier.py +60 -0
- nomenklatura/resolver/linker.py +101 -0
- nomenklatura/resolver/resolver.py +565 -0
- nomenklatura/settings.py +17 -0
- nomenklatura/store/__init__.py +41 -0
- nomenklatura/store/base.py +130 -0
- nomenklatura/store/level.py +272 -0
- nomenklatura/store/memory.py +102 -0
- nomenklatura/store/redis_.py +131 -0
- nomenklatura/store/sql.py +219 -0
- nomenklatura/store/util.py +48 -0
- nomenklatura/store/versioned.py +371 -0
- nomenklatura/tui/__init__.py +17 -0
- nomenklatura/tui/app.py +294 -0
- nomenklatura/tui/app.tcss +52 -0
- nomenklatura/tui/comparison.py +81 -0
- nomenklatura/tui/util.py +35 -0
- nomenklatura/util.py +26 -0
- nomenklatura/versions.py +119 -0
- nomenklatura/wikidata/__init__.py +14 -0
- nomenklatura/wikidata/client.py +122 -0
- nomenklatura/wikidata/lang.py +94 -0
- nomenklatura/wikidata/model.py +139 -0
- nomenklatura/wikidata/props.py +70 -0
- nomenklatura/wikidata/qualified.py +49 -0
- nomenklatura/wikidata/query.py +66 -0
- nomenklatura/wikidata/value.py +87 -0
- nomenklatura/xref.py +125 -0
- nomenklatura_mpt-4.1.9.dist-info/METADATA +159 -0
- nomenklatura_mpt-4.1.9.dist-info/RECORD +118 -0
- nomenklatura_mpt-4.1.9.dist-info/WHEEL +4 -0
- nomenklatura_mpt-4.1.9.dist-info/entry_points.txt +3 -0
- nomenklatura_mpt-4.1.9.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,104 @@
|
|
1
|
+
from itertools import product
|
2
|
+
from rigour.ids import LEI, ISIN, INN, OGRN, IMO, BIC
|
3
|
+
from rigour.ids import StrictFormat
|
4
|
+
from rigour.text.distance import levenshtein
|
5
|
+
from followthemoney import E, registry
|
6
|
+
|
7
|
+
from nomenklatura.matching.types import FtResult, ScoringConfig
|
8
|
+
from nomenklatura.matching.util import has_schema, type_pair
|
9
|
+
from nomenklatura.matching.compare.util import clean_map, CleanFunc
|
10
|
+
|
11
|
+
|
12
|
+
def _id_prop_match(
    query: E,
    result: E,
    prop_name: str,
    clean: CleanFunc = None,
) -> bool:
    """Tell whether values of one query property also occur on the result.

    The property is resolved on the query's schema. Its cleaned values are
    intersected with the cleaned, matchable values of the same property type
    found on the result entity.

    Returns False when the property is unknown to the query schema or when
    the query has no values left after cleaning.
    """
    prop = query.schema.get(prop_name)
    if prop is None:
        return False
    query_values = clean_map(query.get(prop), clean=clean)
    if len(query_values) == 0:
        return False
    # The result side is compared by property *type*, not by property name.
    result_values = clean_map(
        result.get_type_values(prop.type, matchable=True),
        clean=clean,
    )
    return len(query_values.intersection(result_values)) > 0
|
29
|
+
|
30
|
+
|
31
|
+
def _bidi_id_prop_match(
    query: E,
    result: E,
    prop_name: str,
    clean: CleanFunc = None,
) -> FtResult:
    """Run the property identifier check in both directions.

    First the query's property values are looked for on the result, then the
    result's property values on the query. A hit in either direction scores
    1.0; otherwise the score is 0.0.
    """
    # `or` short-circuits, so the reverse check only runs if the first fails.
    matched = _id_prop_match(query, result, prop_name, clean=clean) or _id_prop_match(
        result, query, prop_name, clean=clean
    )
    if matched:
        return FtResult(score=1.0, detail="Property match: %r" % prop_name)
    return FtResult(score=0.0, detail="No match: %r" % prop_name)
|
43
|
+
|
44
|
+
|
45
|
+
def lei_code_match(query: E, result: E, config: ScoringConfig) -> FtResult:
    """Score whether both entities share a Legal Entity Identifier.

    Values of the `leiCode` property are cleaned with `LEI.normalize` before
    being compared in both directions. `config` is accepted but not used.
    """
    return _bidi_id_prop_match(query, result, "leiCode", clean=LEI.normalize)
|
48
|
+
|
49
|
+
|
50
|
+
def bic_code_match(query: E, result: E, config: ScoringConfig) -> FtResult:
    """Score whether both entities share a SWIFT BIC.

    Values of the `swiftBic` property are cleaned with `BIC.normalize` before
    being compared in both directions. `config` is accepted but not used.
    """
    return _bidi_id_prop_match(query, result, "swiftBic", clean=BIC.normalize)
|
53
|
+
|
54
|
+
|
55
|
+
def ogrn_code_match(query: E, result: E, config: ScoringConfig) -> FtResult:
    """Score whether both entities share a Russian company registration (OGRN) code.

    Values of the `ogrnCode` property are cleaned with `OGRN.normalize` before
    being compared in both directions. `config` is accepted but not used.
    """
    return _bidi_id_prop_match(query, result, "ogrnCode", clean=OGRN.normalize)
|
58
|
+
|
59
|
+
|
60
|
+
def inn_code_match(query: E, result: E, config: ScoringConfig) -> FtResult:
    """Score whether both entities share a Russian tax identifier (INN).

    Values of the `innCode` property are cleaned with `INN.normalize` before
    being compared in both directions. `config` is accepted but not used.
    """
    return _bidi_id_prop_match(query, result, "innCode", clean=INN.normalize)
|
63
|
+
|
64
|
+
|
65
|
+
def isin_security_match(query: E, result: E, config: ScoringConfig) -> FtResult:
    """Score whether two securities share an ISIN.

    The comparison only runs when `has_schema(query, result, "Security")`
    holds; otherwise a zero score is returned straight away. `config` is
    accepted but not used.
    """
    if has_schema(query, result, "Security"):
        return _bidi_id_prop_match(query, result, "isin", clean=ISIN.normalize)
    return FtResult(score=0.0, detail="None of the entities is a security")
|
70
|
+
|
71
|
+
|
72
|
+
def vessel_imo_mmsi_match(query: E, result: E, config: ScoringConfig) -> FtResult:
    """Score whether two vessels share an IMO or MMSI identifier.

    The IMO number (cleaned with `IMO.normalize`) is tried first; only when
    it yields no positive score is the `mmsi` property compared, without any
    cleaning function. `config` is accepted but not used.
    """
    by_imo = _bidi_id_prop_match(query, result, "imoNumber", clean=IMO.normalize)
    if by_imo.score > 0.0:
        return by_imo
    return _bidi_id_prop_match(query, result, "mmsi")
|
78
|
+
|
79
|
+
|
80
|
+
def orgid_disjoint(query: E, result: E, config: ScoringConfig) -> FtResult:
    """Score how clearly two organizations carry different identifiers.

    Returns a zero score (no penalty signal) when the entities are not
    organizations, when either side has no identifiers after
    `StrictFormat.normalize` cleaning, or when they share at least one
    identifier. Otherwise the score is one minus the best Levenshtein
    similarity above 0.7 between any pair of identifiers, so near-identical
    identifiers (likely typos) are penalised less than unrelated ones.
    `config` is accepted but not used.
    """
    if not has_schema(query, result, "Organization"):
        return FtResult(score=0.0, detail=None)
    raw_query_ids, raw_result_ids = type_pair(query, result, registry.identifier)
    query_ids = clean_map(raw_query_ids, StrictFormat.normalize)
    result_ids = clean_map(raw_result_ids, StrictFormat.normalize)
    if len(query_ids) == 0 or len(result_ids) == 0:
        return FtResult(score=0.0, detail=None)
    if len(query_ids.intersection(result_ids)) > 0:
        return FtResult(score=0.0, detail=None)

    best_similarity = 0.0
    for left, right in product(query_ids, result_ids):
        edits = levenshtein(left, right)
        longest = max(len(left), len(right))
        similarity = 1.0 - (edits / float(longest))
        # Only similarities above the 0.7 cut-off soften the mismatch score.
        if similarity > 0.7 and similarity > best_similarity:
            best_similarity = similarity

    detail = "Mismatched identifiers: %s vs %s" % (
        ", ".join(query_ids),
        ", ".join(result_ids),
    )
    return FtResult(score=1 - best_similarity, detail=detail)
|
@@ -0,0 +1,76 @@
|
|
1
|
+
from typing import Dict, List
|
2
|
+
|
3
|
+
from nomenklatura.matching.types import Feature, HeuristicAlgorithm, FtResult
|
4
|
+
from nomenklatura.matching.compare.countries import country_mismatch
|
5
|
+
from nomenklatura.matching.compare.gender import gender_mismatch
|
6
|
+
from nomenklatura.matching.compare.identifiers import crypto_wallet_address
|
7
|
+
from nomenklatura.matching.compare.identifiers import identifier_match
|
8
|
+
from nomenklatura.matching.compare.dates import dob_day_disjoint, dob_year_disjoint
|
9
|
+
from nomenklatura.matching.compare.names import person_name_jaro_winkler
|
10
|
+
from nomenklatura.matching.compare.names import last_name_mismatch, name_literal_match
|
11
|
+
from nomenklatura.matching.compare.names import name_fingerprint_levenshtein
|
12
|
+
from nomenklatura.matching.compare.names import weak_alias_match
|
13
|
+
from nomenklatura.matching.compare.addresses import address_entity_match
|
14
|
+
from nomenklatura.matching.logic_v1.phonetic import person_name_phonetic_match
|
15
|
+
from nomenklatura.matching.logic_v1.phonetic import name_soundex_match
|
16
|
+
from nomenklatura.matching.logic_v1.phonetic import name_metaphone_match
|
17
|
+
from nomenklatura.matching.logic_v1.identifiers import bic_code_match
|
18
|
+
from nomenklatura.matching.logic_v1.identifiers import inn_code_match, ogrn_code_match
|
19
|
+
from nomenklatura.matching.logic_v1.identifiers import isin_security_match
|
20
|
+
from nomenklatura.matching.logic_v1.identifiers import lei_code_match
|
21
|
+
from nomenklatura.matching.logic_v1.identifiers import vessel_imo_mmsi_match
|
22
|
+
from nomenklatura.matching.logic_v1.identifiers import orgid_disjoint
|
23
|
+
from nomenklatura.matching.logic_v1.multi import numbers_mismatch
|
24
|
+
from nomenklatura.matching.util import FNUL
|
25
|
+
|
26
|
+
|
27
|
+
class LogicV1(HeuristicAlgorithm):
    """Rule-based matcher built from name and identifier features.

    The non-qualifier features each produce a weighted score and the highest
    one becomes the base score. The qualifier features (supporting or
    contradicting evidence) are then added on top of that base.
    """

    NAME = "logic-v1"
    features = [
        Feature(func=name_literal_match, weight=1.0),
        Feature(func=FtResult.wrap(person_name_jaro_winkler), weight=0.8),
        Feature(func=FtResult.wrap(person_name_phonetic_match), weight=0.9),
        Feature(func=FtResult.wrap(name_fingerprint_levenshtein), weight=0.9),
        # Registered with a null weight so that custom weights can switch
        # them on:
        Feature(func=FtResult.wrap(name_metaphone_match), weight=FNUL),
        Feature(func=FtResult.wrap(name_soundex_match), weight=FNUL),
        Feature(func=address_entity_match, weight=0.98),
        Feature(func=crypto_wallet_address, weight=0.98),
        Feature(func=isin_security_match, weight=0.98),
        Feature(func=lei_code_match, weight=0.95),
        Feature(func=ogrn_code_match, weight=0.95),
        Feature(func=vessel_imo_mmsi_match, weight=0.95),
        Feature(func=inn_code_match, weight=0.95),
        Feature(func=bic_code_match, weight=0.95),
        Feature(func=identifier_match, weight=0.85),
        Feature(func=weak_alias_match, weight=0.8),
        Feature(func=country_mismatch, weight=-0.2, qualifier=True),
        Feature(func=FtResult.wrap(last_name_mismatch), weight=-0.2, qualifier=True),
        Feature(func=dob_year_disjoint, weight=-0.15, qualifier=True),
        Feature(func=dob_day_disjoint, weight=-0.2, qualifier=True),
        Feature(func=gender_mismatch, weight=-0.2, qualifier=True),
        Feature(func=orgid_disjoint, weight=-0.2, qualifier=True),
        Feature(func=numbers_mismatch, weight=-0.1, qualifier=True),
    ]

    @classmethod
    def compute_score(
        cls, scores: Dict[str, float], weights: Dict[str, float]
    ) -> float:
        """Combine per-feature scores and weights into one match score.

        Features missing from `scores` or `weights` contribute `FNUL` for
        the missing factor.
        """

        def weighted(name: str) -> float:
            return scores.get(name, FNUL) * weights.get(name, FNUL)

        # Base score: the strongest of the main (non-qualifier) features.
        mains: List[float] = [
            weighted(feat.name) for feat in cls.features if not feat.qualifier
        ]
        total = max(mains)
        # Qualifiers are added one by one, in declaration order.
        for feat in cls.features:
            if feat.qualifier:
                total += weighted(feat.name)
        return total
|
@@ -0,0 +1,21 @@
|
|
1
|
+
from followthemoney.proxy import E
|
2
|
+
from followthemoney.types import registry
|
3
|
+
|
4
|
+
from nomenklatura.matching.types import FtResult, ScoringConfig
|
5
|
+
from nomenklatura.matching.compare.util import extract_numbers
|
6
|
+
from nomenklatura.matching.util import type_pair, has_schema
|
7
|
+
|
8
|
+
|
9
|
+
def numbers_mismatch(query: E, result: E, config: ScoringConfig) -> FtResult:
    """Penalise numbers that appear in the query but not in the result.

    Address values are inspected when `has_schema(query, result, "Address")`
    holds, name values otherwise. The score is the count of query numbers
    absent from the result, divided by the smaller of the two number counts
    (at least 1). `config` is accepted but not used.
    """
    if has_schema(query, result, "Address"):
        value_type = registry.address
    else:
        value_type = registry.name
    query_values, result_values = type_pair(query, result, value_type)
    query_numbers = extract_numbers(query_values)
    result_numbers = extract_numbers(result_values)
    mismatch = len(query_numbers.difference(result_numbers))
    # Guard the denominator so an empty side does not divide by zero.
    base = max(1, min(len(query_numbers), len(result_numbers)))
    score = float(mismatch) / float(base)
    return FtResult(score=score, detail="Mismatching numbers: %s" % mismatch)
|
@@ -0,0 +1,142 @@
|
|
1
|
+
from functools import cached_property
|
2
|
+
from typing import List, Optional
|
3
|
+
from itertools import product
|
4
|
+
from normality import ascii_text
|
5
|
+
from followthemoney.proxy import E
|
6
|
+
from followthemoney.types import registry
|
7
|
+
from rigour.text.scripts import can_latinize
|
8
|
+
from rigour.text.distance import is_levenshtein_plausible
|
9
|
+
from rigour.text.phonetics import metaphone, soundex
|
10
|
+
from rigour.names import tokenize_name
|
11
|
+
from rigour.util import list_intersection
|
12
|
+
|
13
|
+
from nomenklatura.matching.util import type_pair, has_schema
|
14
|
+
from nomenklatura.matching.compat import fingerprint_name, name_words
|
15
|
+
|
16
|
+
|
17
|
+
class NameTokenPhonetic:
    """One name token together with its ASCII form and metaphone code."""

    def __init__(self, token: str):
        self.token = token
        # Tokens that cannot be latinized get no ASCII form at all.
        if can_latinize(token):
            self.ascii = ascii_text(token)
        else:
            self.ascii = None

    @cached_property
    def metaphone(self) -> Optional[str]:
        """Metaphone code of the ASCII form, or None.

        None is returned when there is no ASCII form or when the code is
        shorter than three characters.
        """
        if self.ascii is None:
            return None
        code = metaphone(self.ascii)
        return code if len(code) >= 3 else None

    @classmethod
    def from_name(cls, name: str) -> List["NameTokenPhonetic"]:
        """Tokenize the lower-cased name and wrap each token."""
        return [cls(part) for part in tokenize_name(name.lower(), token_min_length=2)]
|
37
|
+
|
38
|
+
|
39
|
+
def metaphone_token(token: str) -> str:
    """Return the metaphone code for a token, or the upper-cased token.

    Only purely alphabetic tokens longer than one character are encoded, and
    the code is used only if it has at least three characters.
    """
    if len(token) > 1 and token.isalpha():
        code = metaphone(token)
        # Non-ASCII characters are not handled by metaphone (per the
        # original note), so a too-short code falls back to the token.
        if len(code) >= 3:
            return code
    return token.upper()
|
46
|
+
|
47
|
+
|
48
|
+
def soundex_token(token: str) -> str:
    """Return the soundex code for a token, or the upper-cased token.

    Only purely alphabetic tokens longer than one character are encoded, and
    the code is used only if it is non-empty.
    """
    if len(token) > 1 and token.isalpha():
        code = soundex(token)
        # Non-ASCII characters are not handled by soundex (per the original
        # note), so an empty code falls back to the token.
        if len(code) > 0:
            return code
    return token.upper()
|
55
|
+
|
56
|
+
|
57
|
+
def compare_parts_phonetic(left: NameTokenPhonetic, right: NameTokenPhonetic) -> bool:
    """Decide whether two name tokens match phonetically.

    If either token has no metaphone code, the ASCII forms are compared for
    equality instead. Otherwise the codes must be equal, both ASCII forms
    must exist, and the ASCII forms must pass `is_levenshtein_plausible`.
    """
    if left.metaphone is None or right.metaphone is None:
        return left.ascii == right.ascii
    if left.metaphone != right.metaphone:
        return False
    if left.ascii is None or right.ascii is None:
        return False
    # Secondary edit-distance check on top of the phonetic agreement.
    if is_levenshtein_plausible(left.ascii, right.ascii):
        return True
    return False
|
69
|
+
|
70
|
+
|
71
|
+
def _clean_phonetic_entity(original: str) -> Optional[str]:
    """Fingerprint a legal entity name, or return None if it cannot be latinized."""
    if can_latinize(original):
        return fingerprint_name(original)
    return None
|
76
|
+
|
77
|
+
|
78
|
+
def _token_names_compare(
|
79
|
+
query_names: List[List[str]], result_names: List[List[str]]
|
80
|
+
) -> float:
|
81
|
+
score = 0.0
|
82
|
+
for q, r in product(query_names, result_names):
|
83
|
+
# length = max(2.0, (len(q) + len(r)) / 2.0)
|
84
|
+
length = max(2.0, len(q))
|
85
|
+
combo = len(list_intersection(q, r)) / float(length)
|
86
|
+
score = max(score, combo)
|
87
|
+
return score
|
88
|
+
|
89
|
+
|
90
|
+
def person_name_phonetic_match(query: E, result: E) -> float:
    """Score how well two persons' names agree phonetically.

    Returns 0.0 unless `has_schema(query, result, "Person")` holds. Every
    query name is compared with every result name; for a pair, each query
    token may claim at most one not-yet-claimed result token that matches
    it phonetically. The pair's score is the share of query tokens that
    found a partner, and the best pair score is returned.
    """
    if not has_schema(query, result, "Person"):
        return 0.0
    query_names_, result_names_ = type_pair(query, result, registry.name)
    query_parts = [NameTokenPhonetic.from_name(n) for n in query_names_]
    result_parts = [NameTokenPhonetic.from_name(n) for n in result_names_]

    best = 0.0
    for query_tokens, result_tokens in product(query_parts, result_parts):
        if not query_tokens:
            continue
        # Copy so that claimed tokens can be removed for this pair only.
        unclaimed = list(result_tokens)
        hits = 0
        for token in query_tokens:
            partner = next(
                (cand for cand in unclaimed if compare_parts_phonetic(token, cand)),
                None,
            )
            if partner is not None:
                unclaimed.remove(partner)
                hits += 1
        best = max(best, hits / float(len(query_tokens)))
    return best
|
111
|
+
|
112
|
+
|
113
|
+
def _metaphone_tokens(token: str) -> List[str]:
    """Split a cleaned entity name into words and encode each with `metaphone_token`."""
    cleaned = _clean_phonetic_entity(token)
    return [metaphone_token(word) for word in name_words(cleaned, min_length=2)]
|
118
|
+
|
119
|
+
|
120
|
+
def name_metaphone_match(query: E, result: E) -> float:
    """Score name similarity between two entities using metaphone codes.

    Applies to person and non-person entities alike: every name on each side
    is turned into a list of metaphone tokens and the best token overlap is
    returned.
    """
    query_names_, result_names_ = type_pair(query, result, registry.name)
    return _token_names_compare(
        [_metaphone_tokens(name) for name in query_names_],
        [_metaphone_tokens(name) for name in result_names_],
    )
|
127
|
+
|
128
|
+
|
129
|
+
def _soundex_tokens(token: str) -> List[str]:
    """Split a cleaned entity name into words and encode each with `soundex_token`."""
    cleaned = _clean_phonetic_entity(token)
    return [soundex_token(word) for word in name_words(cleaned, min_length=2)]
|
134
|
+
|
135
|
+
|
136
|
+
def name_soundex_match(query: E, result: E) -> float:
    """Score name similarity between two entities using soundex codes.

    Applies to person and non-person entities alike: every name on each side
    is turned into a list of soundex tokens and the best token overlap is
    returned.
    """
    query_names_, result_names_ = type_pair(query, result, registry.name)
    return _token_names_compare(
        [_soundex_tokens(name) for name in query_names_],
        [_soundex_tokens(name) for name in result_names_],
    )
|
File without changes
|
@@ -0,0 +1,124 @@
|
|
1
|
+
from typing import Set, Type
|
2
|
+
|
3
|
+
from rigour.ids import get_identifier_format, IdentifierFormat
|
4
|
+
from followthemoney import model
|
5
|
+
from followthemoney.property import Property
|
6
|
+
from followthemoney.types import registry
|
7
|
+
from followthemoney.proxy import EntityProxy
|
8
|
+
|
9
|
+
from nomenklatura.matching.types import FtResult, ScoringConfig
|
10
|
+
|
11
|
+
|
12
|
+
def _format_normalize(
    format: Type[IdentifierFormat], entity: EntityProxy, prop: Property
) -> Set[str]:
    """Collect the entity's values for `prop`, normalized by `format`.

    Values for which `format.normalize` returns None are dropped.
    """
    normalized = (format.normalize(raw) for raw in entity.get(prop, quiet=True))
    return {value for value in normalized if value is not None}
|
21
|
+
|
22
|
+
|
23
|
+
def _identifier_format_match(
    format_name: str, query: EntityProxy, result: EntityProxy
) -> FtResult:
    """Compare two entities on identifiers of one named format.

    All matchable identifier properties of the common schema are considered,
    except those declaring a different format. Two pools are kept per
    entity: values from any eligible property, and values from properties
    that explicitly declare this format.

    Scores:
        1.0 -- a format-declared value on one side equals any eligible value
               on the other side.
        0.8 -- the format is `STRONG` and the general pools overlap.
        0.0 -- otherwise.
    """
    schema = model.common_schema(query.schema, result.schema)
    format = get_identifier_format(format_name)
    query_any: Set[str] = set()
    query_declared: Set[str] = set()
    result_any: Set[str] = set()
    result_declared: Set[str] = set()

    for prop in schema.properties.values():
        if prop.type != registry.identifier or not prop.matchable:
            continue
        has_format = prop.format is not None
        if has_format and get_identifier_format(prop.format) != format:
            continue
        query_values = _format_normalize(format, query, prop)
        result_values = _format_normalize(format, result, prop)
        query_any.update(query_values)
        result_any.update(result_values)
        # Past the guard above, a declared format is necessarily this one.
        if has_format:
            query_declared.update(query_values)
            result_declared.update(result_values)

    # Query-declared values are checked first, then result-declared ones.
    for declared, other_side in (
        (query_declared, result_any),
        (result_declared, query_any),
    ):
        shared = declared.intersection(other_side)
        if len(shared) > 0:
            detail = f"Matched {format.TITLE}: {', '.join(shared)}"
            return FtResult(score=1.0, detail=detail)

    if format.STRONG:
        loose = query_any.intersection(result_any)
        if len(loose) > 0:
            detail = f"Out-of-format match: {', '.join(loose)}"
            return FtResult(score=0.8, detail=detail)
    return FtResult(score=0.0, detail=f"No {format.TITLE} match")
|
59
|
+
|
60
|
+
|
61
|
+
def lei_code_match(
    query: EntityProxy, result: EntityProxy, config: ScoringConfig
) -> FtResult:
    """Score whether both entities share a Legal Entity Identifier.

    Delegates to the `"lei"` identifier format comparison. `config` is
    accepted but not used.
    """
    return _identifier_format_match(format_name="lei", query=query, result=result)
|
66
|
+
|
67
|
+
|
68
|
+
def bic_code_match(
    query: EntityProxy, result: EntityProxy, config: ScoringConfig
) -> FtResult:
    """Score whether both entities share a SWIFT BIC.

    Delegates to the `"bic"` identifier format comparison. `config` is
    accepted but not used.
    """
    return _identifier_format_match(format_name="bic", query=query, result=result)
|
73
|
+
|
74
|
+
|
75
|
+
def ogrn_code_match(
    query: EntityProxy, result: EntityProxy, config: ScoringConfig
) -> FtResult:
    """Score whether both entities share a Russian company registration (OGRN) code.

    Delegates to the `"ogrn"` identifier format comparison. `config` is
    accepted but not used.
    """
    return _identifier_format_match(format_name="ogrn", query=query, result=result)
|
80
|
+
|
81
|
+
|
82
|
+
def inn_code_match(
    query: EntityProxy, result: EntityProxy, config: ScoringConfig
) -> FtResult:
    """Score whether both entities share a Russian tax identifier (INN).

    Delegates to the `"inn"` identifier format comparison. `config` is
    accepted but not used.
    """
    return _identifier_format_match(format_name="inn", query=query, result=result)
|
87
|
+
|
88
|
+
|
89
|
+
def uei_code_match(
    query: EntityProxy, result: EntityProxy, config: ScoringConfig
) -> FtResult:
    """Score whether both entities share a US Unique Entity ID (UEI).

    Delegates to the `"uei"` identifier format comparison. `config` is
    accepted but not used.
    """
    return _identifier_format_match(format_name="uei", query=query, result=result)
|
94
|
+
|
95
|
+
|
96
|
+
def npi_code_match(
    query: EntityProxy, result: EntityProxy, config: ScoringConfig
) -> FtResult:
    """Score whether both entities share a US National Provider Identifier (NPI).

    Delegates to the `"npi"` identifier format comparison. `config` is
    accepted but not used.
    """
    return _identifier_format_match(format_name="npi", query=query, result=result)
|
101
|
+
|
102
|
+
|
103
|
+
def isin_security_match(
    query: EntityProxy, result: EntityProxy, config: ScoringConfig
) -> FtResult:
    """Score whether two securities share an ISIN.

    Delegates to the `"isin"` identifier format comparison. `config` is
    accepted but not used.
    """
    # NOTE: no "Security" schema check is applied here; the original source
    # carried that check only as commented-out code.
    return _identifier_format_match(format_name="isin", query=query, result=result)
|
110
|
+
|
111
|
+
|
112
|
+
def vessel_imo_mmsi_match(
    query: EntityProxy, result: EntityProxy, config: ScoringConfig
) -> FtResult:
    """Score whether two vessels share an IMO or MMSI identifier.

    The `"imo"` identifier format comparison is tried first. Only when it
    gives no positive score are the `mmsi` values of both entities compared
    via `registry.identifier.compare_sets`. `config` is accepted but not used.
    """
    by_imo = _identifier_format_match("imo", query, result)
    if by_imo.score > 0.0:
        return by_imo
    score = registry.identifier.compare_sets(
        query.get("mmsi", quiet=True),
        result.get("mmsi", quiet=True),
    )
    detail = "MMSI match" if score > 0.0 else "No IMO or MMSI match"
    return FtResult(score=score, detail=detail)
|
@@ -0,0 +1,98 @@
|
|
1
|
+
from typing import Dict, List
|
2
|
+
|
3
|
+
from nomenklatura.matching.types import Feature, HeuristicAlgorithm
|
4
|
+
from nomenklatura.matching.types import ConfigVar, ConfigVarType
|
5
|
+
from nomenklatura.matching.compare.countries import country_mismatch
|
6
|
+
from nomenklatura.matching.compare.gender import gender_mismatch
|
7
|
+
from nomenklatura.matching.compare.identifiers import crypto_wallet_address
|
8
|
+
from nomenklatura.matching.compare.identifiers import identifier_match
|
9
|
+
from nomenklatura.matching.compare.dates import dob_day_disjoint, dob_year_disjoint
|
10
|
+
from nomenklatura.matching.compare.names import weak_alias_match
|
11
|
+
from nomenklatura.matching.compare.addresses import address_entity_match
|
12
|
+
from nomenklatura.matching.compare.addresses import address_prop_match
|
13
|
+
from nomenklatura.matching.logic_v2.names.match import name_match
|
14
|
+
from nomenklatura.matching.logic_v2.identifiers import bic_code_match
|
15
|
+
from nomenklatura.matching.logic_v2.identifiers import inn_code_match, ogrn_code_match
|
16
|
+
from nomenklatura.matching.logic_v2.identifiers import isin_security_match
|
17
|
+
from nomenklatura.matching.logic_v2.identifiers import lei_code_match
|
18
|
+
from nomenklatura.matching.logic_v2.identifiers import vessel_imo_mmsi_match
|
19
|
+
from nomenklatura.matching.logic_v2.identifiers import uei_code_match
|
20
|
+
from nomenklatura.matching.logic_v2.identifiers import npi_code_match
|
21
|
+
from nomenklatura.matching.util import FNUL
|
22
|
+
|
23
|
+
|
24
|
+
class LogicV2(HeuristicAlgorithm):
    """Rule-based matcher, second version.

    Like its predecessor it derives a base score from name and identifier
    features and then adjusts it with supporting or contradicting qualifier
    features. Version 2 uses a different feature set and folds name matching
    into a single feature backed by a more versatile name matching algorithm.
    """

    NAME = "logic-v2"
    features = [
        Feature(func=name_match, weight=1.0),
        Feature(func=address_entity_match, weight=0.98),
        Feature(func=crypto_wallet_address, weight=0.98),
        Feature(func=isin_security_match, weight=0.98),
        Feature(func=lei_code_match, weight=0.95),
        Feature(func=ogrn_code_match, weight=0.95),
        Feature(func=vessel_imo_mmsi_match, weight=0.95),
        Feature(func=inn_code_match, weight=0.95),
        Feature(func=bic_code_match, weight=0.95),
        Feature(func=uei_code_match, weight=0.95),
        Feature(func=npi_code_match, weight=0.95),
        Feature(func=identifier_match, weight=0.85),
        Feature(func=weak_alias_match, weight=0.8),
        Feature(func=address_prop_match, weight=0.2, qualifier=True),
        Feature(func=country_mismatch, weight=-0.2, qualifier=True),
        Feature(func=dob_year_disjoint, weight=-0.15, qualifier=True),
        Feature(func=dob_day_disjoint, weight=-0.25, qualifier=True),
        Feature(func=gender_mismatch, weight=-0.2, qualifier=True),
    ]
    # Tunable settings declared by this algorithm (all floats).
    CONFIG = {
        "nm_number_mismatch": ConfigVar(
            type=ConfigVarType.FLOAT,
            description="Penalty for mismatching numbers in object or company names.",
            default=0.3,
        ),
        "nm_extra_query_name": ConfigVar(
            type=ConfigVarType.FLOAT,
            description="Weight for name parts in the query not matched to the result.",
            default=0.8,
        ),
        "nm_extra_result_name": ConfigVar(
            type=ConfigVarType.FLOAT,
            description="Weight for name parts in the result not matched to the query.",
            default=0.2,
        ),
        "nm_family_name_weight": ConfigVar(
            type=ConfigVarType.FLOAT,
            description="Extra weight multiplier for family name in person matches (John Smith vs. John Gruber is clearly distinct).",
            default=1.3,
        ),
        "nm_fuzzy_cutoff_factor": ConfigVar(
            type=ConfigVarType.FLOAT,
            description="Extra factor for when a fuzzy match is triggered in name matching. "
            "Below a certain threshold, a fuzzy match is considered as a non-match (score = 0.0). "
            "Adjusting this multiplier will raise this threshold, making a fuzzy match trigger more leniently.",
            default=1.0,
        ),
    }

    @classmethod
    def compute_score(
        cls, scores: Dict[str, float], weights: Dict[str, float]
    ) -> float:
        """Combine per-feature scores and weights into one match score.

        Features missing from `scores` or `weights` contribute `FNUL` for
        the missing factor.
        """

        def weighted(name: str) -> float:
            return scores.get(name, FNUL) * weights.get(name, FNUL)

        # Base score: the strongest of the main (non-qualifier) features.
        mains: List[float] = [
            weighted(feat.name) for feat in cls.features if not feat.qualifier
        ]
        total = max(mains)
        # Qualifiers are added one by one, in declaration order.
        for feat in cls.features:
            if feat.qualifier:
                total += weighted(feat.name)
        return total
|
@@ -0,0 +1,51 @@
|
|
1
|
+
from typing import Set
|
2
|
+
from rigour.names import NameTypeTag, Name
|
3
|
+
from rigour.names import replace_org_types_compare, prenormalize_name
|
4
|
+
from rigour.names import remove_person_prefixes, remove_org_prefixes
|
5
|
+
from rigour.names import tag_org_name, tag_person_name, normalize_name
|
6
|
+
from followthemoney import registry, EntityProxy
|
7
|
+
from followthemoney.names import PROP_PART_TAGS
|
8
|
+
|
9
|
+
|
10
|
+
def entity_names(
    type_tag: NameTypeTag, entity: EntityProxy, is_query: bool = False
) -> Set[Name]:
    """Build the set of tagged `Name` objects for an entity.

    Each matchable name value is pre-normalized into a comparison form.
    Person names first lose prefixes via `remove_person_prefixes`;
    organization and generic entity names get their organization types
    replaced and organization prefixes removed. Names whose form was already
    produced are skipped, which cuts down the comparisons needed later.

    Every surviving name is tagged with `type_tag`, then its parts are tagged
    from the entity's name-part properties (`PROP_PART_TAGS`), followed by
    organization or person specific tagging. `is_query` is forwarded to
    `tag_person_name` as `any_initials`.
    """
    is_person = type_tag == NameTypeTag.PER
    is_org_like = type_tag in (NameTypeTag.ORG, NameTypeTag.ENT)

    seen_forms: Set[str] = set()
    tagged: Set[Name] = set()
    for raw_name in entity.get_type_values(registry.name, matchable=True):
        # Strip prefixes such as "Mr.", "Ms.", "Dr." from person names.
        cleaned = remove_person_prefixes(raw_name) if is_person else raw_name

        form = prenormalize_name(cleaned)
        if is_org_like:
            # Canonicalise organization types, e.g.
            # "Limited Liability Company" -> "LLC", then drop organization
            # prefixes like "The".
            form = remove_org_prefixes(
                replace_org_types_compare(form, normalizer=prenormalize_name)
            )

        if form in seen_forms:
            continue
        seen_forms.add(form)

        sname = Name(cleaned, form=form, tag=type_tag)
        # Tag name parts using values of the entity's name-part properties.
        for prop, part_tag in PROP_PART_TAGS:
            for value in entity.get(prop, quiet=True):
                sname.tag_text(prenormalize_name(value), part_tag)

        # Tag organization types and symbols.
        if is_org_like:
            tag_org_name(sname, normalize_name)

        if is_person:
            tag_person_name(sname, normalize_name, any_initials=is_query)

        # TODO: should we tag phonetic tokens here?
        tagged.add(sname)
    return tagged
|