nomenklatura-mpt 4.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118) hide show
  1. nomenklatura/__init__.py +11 -0
  2. nomenklatura/cache.py +194 -0
  3. nomenklatura/cli.py +260 -0
  4. nomenklatura/conflicting_match.py +80 -0
  5. nomenklatura/data/er-unstable.pkl +0 -0
  6. nomenklatura/data/regression-v1.pkl +0 -0
  7. nomenklatura/db.py +139 -0
  8. nomenklatura/delta.py +4 -0
  9. nomenklatura/enrich/__init__.py +94 -0
  10. nomenklatura/enrich/aleph.py +141 -0
  11. nomenklatura/enrich/common.py +219 -0
  12. nomenklatura/enrich/nominatim.py +72 -0
  13. nomenklatura/enrich/opencorporates.py +233 -0
  14. nomenklatura/enrich/openfigi.py +124 -0
  15. nomenklatura/enrich/permid.py +201 -0
  16. nomenklatura/enrich/wikidata.py +268 -0
  17. nomenklatura/enrich/yente.py +116 -0
  18. nomenklatura/exceptions.py +9 -0
  19. nomenklatura/index/__init__.py +5 -0
  20. nomenklatura/index/common.py +24 -0
  21. nomenklatura/index/entry.py +89 -0
  22. nomenklatura/index/index.py +170 -0
  23. nomenklatura/index/tokenizer.py +92 -0
  24. nomenklatura/judgement.py +21 -0
  25. nomenklatura/kv.py +40 -0
  26. nomenklatura/matching/__init__.py +47 -0
  27. nomenklatura/matching/bench.py +32 -0
  28. nomenklatura/matching/compare/__init__.py +0 -0
  29. nomenklatura/matching/compare/addresses.py +71 -0
  30. nomenklatura/matching/compare/countries.py +15 -0
  31. nomenklatura/matching/compare/dates.py +83 -0
  32. nomenklatura/matching/compare/gender.py +15 -0
  33. nomenklatura/matching/compare/identifiers.py +30 -0
  34. nomenklatura/matching/compare/names.py +157 -0
  35. nomenklatura/matching/compare/util.py +51 -0
  36. nomenklatura/matching/compat.py +66 -0
  37. nomenklatura/matching/erun/__init__.py +0 -0
  38. nomenklatura/matching/erun/countries.py +42 -0
  39. nomenklatura/matching/erun/identifiers.py +64 -0
  40. nomenklatura/matching/erun/misc.py +71 -0
  41. nomenklatura/matching/erun/model.py +110 -0
  42. nomenklatura/matching/erun/names.py +126 -0
  43. nomenklatura/matching/erun/train.py +135 -0
  44. nomenklatura/matching/erun/util.py +28 -0
  45. nomenklatura/matching/logic_v1/__init__.py +0 -0
  46. nomenklatura/matching/logic_v1/identifiers.py +104 -0
  47. nomenklatura/matching/logic_v1/model.py +76 -0
  48. nomenklatura/matching/logic_v1/multi.py +21 -0
  49. nomenklatura/matching/logic_v1/phonetic.py +142 -0
  50. nomenklatura/matching/logic_v2/__init__.py +0 -0
  51. nomenklatura/matching/logic_v2/identifiers.py +124 -0
  52. nomenklatura/matching/logic_v2/model.py +98 -0
  53. nomenklatura/matching/logic_v2/names/__init__.py +3 -0
  54. nomenklatura/matching/logic_v2/names/analysis.py +51 -0
  55. nomenklatura/matching/logic_v2/names/distance.py +181 -0
  56. nomenklatura/matching/logic_v2/names/magic.py +60 -0
  57. nomenklatura/matching/logic_v2/names/match.py +195 -0
  58. nomenklatura/matching/logic_v2/names/pairing.py +81 -0
  59. nomenklatura/matching/logic_v2/names/util.py +89 -0
  60. nomenklatura/matching/name_based/__init__.py +4 -0
  61. nomenklatura/matching/name_based/misc.py +86 -0
  62. nomenklatura/matching/name_based/model.py +59 -0
  63. nomenklatura/matching/name_based/names.py +59 -0
  64. nomenklatura/matching/pairs.py +42 -0
  65. nomenklatura/matching/regression_v1/__init__.py +0 -0
  66. nomenklatura/matching/regression_v1/misc.py +75 -0
  67. nomenklatura/matching/regression_v1/model.py +110 -0
  68. nomenklatura/matching/regression_v1/names.py +63 -0
  69. nomenklatura/matching/regression_v1/train.py +87 -0
  70. nomenklatura/matching/regression_v1/util.py +31 -0
  71. nomenklatura/matching/svm_v1/__init__.py +5 -0
  72. nomenklatura/matching/svm_v1/misc.py +94 -0
  73. nomenklatura/matching/svm_v1/model.py +168 -0
  74. nomenklatura/matching/svm_v1/names.py +81 -0
  75. nomenklatura/matching/svm_v1/train.py +186 -0
  76. nomenklatura/matching/svm_v1/util.py +30 -0
  77. nomenklatura/matching/types.py +227 -0
  78. nomenklatura/matching/util.py +62 -0
  79. nomenklatura/publish/__init__.py +0 -0
  80. nomenklatura/publish/dates.py +49 -0
  81. nomenklatura/publish/edges.py +32 -0
  82. nomenklatura/py.typed +0 -0
  83. nomenklatura/resolver/__init__.py +6 -0
  84. nomenklatura/resolver/common.py +2 -0
  85. nomenklatura/resolver/edge.py +107 -0
  86. nomenklatura/resolver/identifier.py +60 -0
  87. nomenklatura/resolver/linker.py +101 -0
  88. nomenklatura/resolver/resolver.py +565 -0
  89. nomenklatura/settings.py +17 -0
  90. nomenklatura/store/__init__.py +41 -0
  91. nomenklatura/store/base.py +130 -0
  92. nomenklatura/store/level.py +272 -0
  93. nomenklatura/store/memory.py +102 -0
  94. nomenklatura/store/redis_.py +131 -0
  95. nomenklatura/store/sql.py +219 -0
  96. nomenklatura/store/util.py +48 -0
  97. nomenklatura/store/versioned.py +371 -0
  98. nomenklatura/tui/__init__.py +17 -0
  99. nomenklatura/tui/app.py +294 -0
  100. nomenklatura/tui/app.tcss +52 -0
  101. nomenklatura/tui/comparison.py +81 -0
  102. nomenklatura/tui/util.py +35 -0
  103. nomenklatura/util.py +26 -0
  104. nomenklatura/versions.py +119 -0
  105. nomenklatura/wikidata/__init__.py +14 -0
  106. nomenklatura/wikidata/client.py +122 -0
  107. nomenklatura/wikidata/lang.py +94 -0
  108. nomenklatura/wikidata/model.py +139 -0
  109. nomenklatura/wikidata/props.py +70 -0
  110. nomenklatura/wikidata/qualified.py +49 -0
  111. nomenklatura/wikidata/query.py +66 -0
  112. nomenklatura/wikidata/value.py +87 -0
  113. nomenklatura/xref.py +125 -0
  114. nomenklatura_mpt-4.1.9.dist-info/METADATA +159 -0
  115. nomenklatura_mpt-4.1.9.dist-info/RECORD +118 -0
  116. nomenklatura_mpt-4.1.9.dist-info/WHEEL +4 -0
  117. nomenklatura_mpt-4.1.9.dist-info/entry_points.txt +3 -0
  118. nomenklatura_mpt-4.1.9.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,104 @@
1
+ from itertools import product
2
+ from rigour.ids import LEI, ISIN, INN, OGRN, IMO, BIC
3
+ from rigour.ids import StrictFormat
4
+ from rigour.text.distance import levenshtein
5
+ from followthemoney import E, registry
6
+
7
+ from nomenklatura.matching.types import FtResult, ScoringConfig
8
+ from nomenklatura.matching.util import has_schema, type_pair
9
+ from nomenklatura.matching.compare.util import clean_map, CleanFunc
10
+
11
+
12
+ def _id_prop_match(
13
+ query: E,
14
+ result: E,
15
+ prop_name: str,
16
+ clean: CleanFunc = None,
17
+ ) -> bool:
18
+ """Check if a specific property identifier is shared by two entities."""
19
+ prop = query.schema.get(prop_name)
20
+ if prop is None:
21
+ return False
22
+ lv = clean_map(query.get(prop), clean=clean)
23
+ if not len(lv):
24
+ return False
25
+ rv_ = result.get_type_values(prop.type, matchable=True)
26
+ rv = clean_map(rv_, clean=clean)
27
+ common = lv.intersection(rv)
28
+ return len(common) > 0
29
+
30
+
31
def _bidi_id_prop_match(
    query: E,
    result: E,
    prop_name: str,
    clean: CleanFunc = None,
) -> FtResult:
    """Run the property identifier check in both directions.

    The query is first checked against the result; only if that fails is the
    result checked against the query. A hit either way scores 1.0, otherwise
    the score is 0.0.
    """
    forward = _id_prop_match(query, result, prop_name, clean=clean)
    # `or` short-circuits, so the reverse check only runs when needed.
    if forward or _id_prop_match(result, query, prop_name, clean=clean):
        return FtResult(score=1.0, detail="Property match: %r" % prop_name)
    return FtResult(score=0.0, detail="No match: %r" % prop_name)
43
+
44
+
45
def lei_code_match(query: E, result: E, config: ScoringConfig) -> FtResult:
    """Score whether both entities carry the same Legal Entity Identifier.

    Values of the ``leiCode`` property are normalised with ``LEI.normalize``
    before comparison. ``config`` is accepted but not used here.
    """
    normalizer = LEI.normalize
    return _bidi_id_prop_match(query, result, "leiCode", clean=normalizer)
48
+
49
+
50
def bic_code_match(query: E, result: E, config: ScoringConfig) -> FtResult:
    """Score whether both entities carry the same SWIFT BIC.

    Values of the ``swiftBic`` property are normalised with ``BIC.normalize``
    before comparison. ``config`` is accepted but not used here.
    """
    normalizer = BIC.normalize
    return _bidi_id_prop_match(query, result, "swiftBic", clean=normalizer)
53
+
54
+
55
def ogrn_code_match(query: E, result: E, config: ScoringConfig) -> FtResult:
    """Score whether both entities share a Russian company registration (OGRN) code.

    Values of the ``ogrnCode`` property are normalised with ``OGRN.normalize``
    before comparison. ``config`` is accepted but not used here.
    """
    normalizer = OGRN.normalize
    return _bidi_id_prop_match(query, result, "ogrnCode", clean=normalizer)
58
+
59
+
60
def inn_code_match(query: E, result: E, config: ScoringConfig) -> FtResult:
    """Score whether both entities share a Russian tax identifier (INN).

    Values of the ``innCode`` property are normalised with ``INN.normalize``
    before comparison. ``config`` is accepted but not used here.
    """
    normalizer = INN.normalize
    return _bidi_id_prop_match(query, result, "innCode", clean=normalizer)
63
+
64
+
65
def isin_security_match(query: E, result: E, config: ScoringConfig) -> FtResult:
    """Score whether two securities share the same ISIN.

    The comparison only runs when ``has_schema`` accepts the pair for the
    ``Security`` schema; otherwise a zero score is returned straight away.
    ``config`` is accepted but not used here.
    """
    if has_schema(query, result, "Security"):
        return _bidi_id_prop_match(query, result, "isin", clean=ISIN.normalize)
    return FtResult(score=0.0, detail="None of the entities is a security")
70
+
71
+
72
def vessel_imo_mmsi_match(query: E, result: E, config: ScoringConfig) -> FtResult:
    """Score whether two vessels share an IMO number or, failing that, an MMSI.

    The IMO comparison (normalised with ``IMO.normalize``) takes precedence;
    the MMSI comparison is done on uncleaned values and only when no IMO
    match was found. ``config`` is accepted but not used here.
    """
    by_imo = _bidi_id_prop_match(query, result, "imoNumber", clean=IMO.normalize)
    if by_imo.score > 0.0:
        return by_imo
    by_mmsi = _bidi_id_prop_match(query, result, "mmsi")
    return by_mmsi
78
+
79
+
80
def orgid_disjoint(query: E, result: E, config: ScoringConfig) -> FtResult:
    """Two companies or organizations have different tax identifiers or registration
    numbers.

    Returns a zero score (no penalty) when the pair is not accepted for the
    ``Organization`` schema, when either side has no identifiers after
    normalisation, or when the two sides share at least one identifier.

    Otherwise every query/result identifier pair is compared by Levenshtein
    distance. Pairs with a similarity above 0.7 are treated as near-misses
    and soften the result: the score is ``1 - best_near_miss_similarity``,
    and 1.0 when no pair is that close. ``config`` is accepted but not used.
    """
    if not has_schema(query, result, "Organization"):
        return FtResult(score=0.0, detail=None)
    query_ids_, result_ids_ = type_pair(query, result, registry.identifier)
    query_ids = clean_map(query_ids_, StrictFormat.normalize)
    result_ids = clean_map(result_ids_, StrictFormat.normalize)
    if not len(query_ids) or not len(result_ids):
        return FtResult(score=0.0, detail=None)
    if len(query_ids.intersection(result_ids)) > 0:
        return FtResult(score=0.0, detail=None)
    max_ratio = 0.0
    for query_id, result_id in product(query_ids, result_ids):
        # The two sides are disjoint here, so a pair can never be two empty
        # strings and max_len is always positive.
        max_len = max(len(query_id), len(result_id))
        ratio = 1.0 - (levenshtein(query_id, result_id) / float(max_len))
        if ratio > 0.7:
            max_ratio = max(max_ratio, ratio)
    # Sort before joining: the identifier collections are used as sets above,
    # and set iteration order would make the detail text vary between runs.
    detail = "Mismatched identifiers: %s vs %s" % (
        ", ".join(sorted(query_ids)),
        ", ".join(sorted(result_ids)),
    )
    return FtResult(score=1 - max_ratio, detail=detail)
@@ -0,0 +1,76 @@
1
+ from typing import Dict, List
2
+
3
+ from nomenklatura.matching.types import Feature, HeuristicAlgorithm, FtResult
4
+ from nomenklatura.matching.compare.countries import country_mismatch
5
+ from nomenklatura.matching.compare.gender import gender_mismatch
6
+ from nomenklatura.matching.compare.identifiers import crypto_wallet_address
7
+ from nomenklatura.matching.compare.identifiers import identifier_match
8
+ from nomenklatura.matching.compare.dates import dob_day_disjoint, dob_year_disjoint
9
+ from nomenklatura.matching.compare.names import person_name_jaro_winkler
10
+ from nomenklatura.matching.compare.names import last_name_mismatch, name_literal_match
11
+ from nomenklatura.matching.compare.names import name_fingerprint_levenshtein
12
+ from nomenklatura.matching.compare.names import weak_alias_match
13
+ from nomenklatura.matching.compare.addresses import address_entity_match
14
+ from nomenklatura.matching.logic_v1.phonetic import person_name_phonetic_match
15
+ from nomenklatura.matching.logic_v1.phonetic import name_soundex_match
16
+ from nomenklatura.matching.logic_v1.phonetic import name_metaphone_match
17
+ from nomenklatura.matching.logic_v1.identifiers import bic_code_match
18
+ from nomenklatura.matching.logic_v1.identifiers import inn_code_match, ogrn_code_match
19
+ from nomenklatura.matching.logic_v1.identifiers import isin_security_match
20
+ from nomenklatura.matching.logic_v1.identifiers import lei_code_match
21
+ from nomenklatura.matching.logic_v1.identifiers import vessel_imo_mmsi_match
22
+ from nomenklatura.matching.logic_v1.identifiers import orgid_disjoint
23
+ from nomenklatura.matching.logic_v1.multi import numbers_mismatch
24
+ from nomenklatura.matching.util import FNUL
25
+
26
+
27
class LogicV1(HeuristicAlgorithm):
    """Rule-based matcher: name and identifier features each produce a base
    score, and the strongest of them is then adjusted by qualifier features
    that support or contradict the match."""

    NAME = "logic-v1"
    features = [
        # Main features: only the best weighted one contributes to the score.
        Feature(func=name_literal_match, weight=1.0),
        Feature(func=FtResult.wrap(person_name_jaro_winkler), weight=0.8),
        Feature(func=FtResult.wrap(person_name_phonetic_match), weight=0.9),
        Feature(func=FtResult.wrap(name_fingerprint_levenshtein), weight=0.9),
        # These are there so they can be enabled using custom weights:
        Feature(func=FtResult.wrap(name_metaphone_match), weight=FNUL),
        Feature(func=FtResult.wrap(name_soundex_match), weight=FNUL),
        Feature(func=address_entity_match, weight=0.98),
        Feature(func=crypto_wallet_address, weight=0.98),
        Feature(func=isin_security_match, weight=0.98),
        Feature(func=lei_code_match, weight=0.95),
        Feature(func=ogrn_code_match, weight=0.95),
        Feature(func=vessel_imo_mmsi_match, weight=0.95),
        Feature(func=inn_code_match, weight=0.95),
        Feature(func=bic_code_match, weight=0.95),
        Feature(func=identifier_match, weight=0.85),
        Feature(func=weak_alias_match, weight=0.8),
        # Qualifiers: each one's weighted score is added to the best main score.
        Feature(func=country_mismatch, weight=-0.2, qualifier=True),
        Feature(func=FtResult.wrap(last_name_mismatch), weight=-0.2, qualifier=True),
        Feature(func=dob_year_disjoint, weight=-0.15, qualifier=True),
        Feature(func=dob_day_disjoint, weight=-0.2, qualifier=True),
        Feature(func=gender_mismatch, weight=-0.2, qualifier=True),
        Feature(func=orgid_disjoint, weight=-0.2, qualifier=True),
        Feature(func=numbers_mismatch, weight=-0.1, qualifier=True),
    ]

    @classmethod
    def compute_score(
        cls, scores: Dict[str, float], weights: Dict[str, float]
    ) -> float:
        """Fold per-feature scores into one overall score.

        Takes the largest ``score * weight`` among non-qualifier features,
        then adds ``score * weight`` for every qualifier in declaration
        order. Features missing from either mapping fall back to ``FNUL``.
        """

        def weighted(name: str) -> float:
            return scores.get(name, FNUL) * weights.get(name, FNUL)

        mains: List[float] = [
            weighted(feat.name) for feat in cls.features if not feat.qualifier
        ]
        total = max(mains)
        for feat in cls.features:
            if feat.qualifier:
                total += weighted(feat.name)
        return total
@@ -0,0 +1,21 @@
1
+ from followthemoney.proxy import E
2
+ from followthemoney.types import registry
3
+
4
+ from nomenklatura.matching.types import FtResult, ScoringConfig
5
+ from nomenklatura.matching.compare.util import extract_numbers
6
+ from nomenklatura.matching.util import type_pair, has_schema
7
+
8
+
9
def numbers_mismatch(query: E, result: E, config: ScoringConfig) -> FtResult:
    """Penalise numbers that appear in the query but not in the result.

    Numbers are extracted from address values when ``has_schema`` accepts the
    pair for ``Address``, and from name values otherwise. The score is the
    count of query-only numbers divided by the smaller of the two number
    counts (at least 1). ``config`` is accepted but not used here.
    """
    if has_schema(query, result, "Address"):
        compare_type = registry.address
    else:
        compare_type = registry.name
    query_values, result_values = type_pair(query, result, compare_type)
    query_numbers = extract_numbers(query_values)
    result_numbers = extract_numbers(result_values)
    # One-directional: numbers found only on the result side are not counted.
    mismatch = len(query_numbers.difference(result_numbers))
    base = min(len(query_numbers), len(result_numbers))
    # NOTE(review): the ratio is not capped, so it exceeds 1.0 when the query
    # has more unmatched numbers than `base` - confirm this is intended.
    score = float(mismatch) / float(max(1, base))
    return FtResult(score=score, detail="Mismatching numbers: %s" % mismatch)
@@ -0,0 +1,142 @@
1
+ from functools import cached_property
2
+ from typing import List, Optional
3
+ from itertools import product
4
+ from normality import ascii_text
5
+ from followthemoney.proxy import E
6
+ from followthemoney.types import registry
7
+ from rigour.text.scripts import can_latinize
8
+ from rigour.text.distance import is_levenshtein_plausible
9
+ from rigour.text.phonetics import metaphone, soundex
10
+ from rigour.names import tokenize_name
11
+ from rigour.util import list_intersection
12
+
13
+ from nomenklatura.matching.util import type_pair, has_schema
14
+ from nomenklatura.matching.compat import fingerprint_name, name_words
15
+
16
+
17
class NameTokenPhonetic:
    """A single name token with its ASCII form and a lazily computed
    metaphone code."""

    def __init__(self, token: str):
        self.token = token
        # ASCII form is only produced for tokens that `can_latinize` accepts.
        self.ascii: Optional[str] = None
        if can_latinize(token):
            self.ascii = ascii_text(token)

    @cached_property
    def metaphone(self) -> Optional[str]:
        """Metaphone code of the ASCII form, or ``None`` when there is no
        ASCII form or the code is shorter than three characters."""
        if self.ascii is None:
            return None
        phoneme = metaphone(self.ascii)
        return phoneme if len(phoneme) >= 3 else None

    @classmethod
    def from_name(cls, name: str) -> List["NameTokenPhonetic"]:
        """Tokenise the lower-cased name (minimum token length 2) and wrap
        each token."""
        lowered = name.lower()
        return list(map(cls, tokenize_name(lowered, token_min_length=2)))
37
+
38
+
39
def metaphone_token(token: str) -> str:
    """Return the metaphone code for a token, or the upper-cased token.

    The fallback applies when the token is not purely alphabetic, is a single
    character or empty, or when the code has fewer than three characters.
    """
    if not token.isalpha() or len(token) <= 1:
        return token.upper()
    phoneme = metaphone(token)
    # Per the original author's note, the encoder doesn't handle non-ascii
    # characters; too-short codes fall back to the raw token.
    return phoneme if len(phoneme) >= 3 else token.upper()
46
+
47
+
48
def soundex_token(token: str) -> str:
    """Return the soundex code for a token, or the upper-cased token.

    The fallback applies when the token is not purely alphabetic, is a single
    character or empty, or when the encoder produces an empty code.
    """
    if not token.isalpha() or len(token) <= 1:
        return token.upper()
    code = soundex(token)
    # Per the original author's note, the encoder doesn't handle non-ascii
    # characters; an empty code falls back to the raw token.
    return code if len(code) else token.upper()
55
+
56
+
57
def compare_parts_phonetic(left: NameTokenPhonetic, right: NameTokenPhonetic) -> bool:
    """Decide whether two name tokens match phonetically.

    If either token has no metaphone code, the ASCII forms are compared for
    equality. Otherwise the codes must be equal, both ASCII forms must be
    present, and ``is_levenshtein_plausible`` must accept the ASCII pair.
    """
    if left.metaphone is None or right.metaphone is None:
        return left.ascii == right.ascii
    if left.metaphone != right.metaphone:
        return False
    if left.ascii is None or right.ascii is None:
        return False
    # Secondary check: equal codes alone are not enough.
    return bool(is_levenshtein_plausible(left.ascii, right.ascii))
69
+
70
+
71
def _clean_phonetic_entity(original: str) -> Optional[str]:
    """Fingerprint a name for phonetic comparison.

    Returns ``None`` when ``can_latinize`` rejects the name, otherwise the
    result of ``fingerprint_name``.
    """
    return fingerprint_name(original) if can_latinize(original) else None
76
+
77
+
78
+ def _token_names_compare(
79
+ query_names: List[List[str]], result_names: List[List[str]]
80
+ ) -> float:
81
+ score = 0.0
82
+ for q, r in product(query_names, result_names):
83
+ # length = max(2.0, (len(q) + len(r)) / 2.0)
84
+ length = max(2.0, len(q))
85
+ combo = len(list_intersection(q, r)) / float(length)
86
+ score = max(score, combo)
87
+ return score
88
+
89
+
90
def person_name_phonetic_match(query: E, result: E) -> float:
    """Score how well two persons' names agree phonetically.

    Returns 0.0 unless ``has_schema`` accepts the pair for ``Person``. Each
    query name is tokenised and compared with each result name; every query
    token may claim at most one not-yet-claimed result token. The score is
    the best fraction of matched query tokens over all name pairings.
    """
    if not has_schema(query, result, "Person"):
        return 0.0
    query_names_, result_names_ = type_pair(query, result, registry.name)
    query_parts = [NameTokenPhonetic.from_name(name) for name in query_names_]
    result_parts = [NameTokenPhonetic.from_name(name) for name in result_names_]
    best = 0.0
    for query_tokens, result_tokens in product(query_parts, result_parts):
        if len(query_tokens) == 0:
            continue
        # Work on a copy so claimed tokens can be removed per pairing.
        unclaimed = list(result_tokens)
        hits = 0
        for part in query_tokens:
            partner = next(
                (other for other in unclaimed if compare_parts_phonetic(part, other)),
                None,
            )
            if partner is not None:
                unclaimed.remove(partner)
                hits += 1
        best = max(best, hits / float(len(query_tokens)))
    return best
111
+
112
+
113
def _metaphone_tokens(token: str) -> List[str]:
    """Split a cleaned name into words (minimum length 2) and encode each
    with ``metaphone_token``."""
    # NOTE(review): `cleaned` is None for names that cannot be latinized;
    # this relies on `name_words` accepting None - confirm.
    cleaned = _clean_phonetic_entity(token)
    return [metaphone_token(word) for word in name_words(cleaned, min_length=2)]
118
+
119
+
120
def name_metaphone_match(query: E, result: E) -> float:
    """Score name similarity between two entities (person or not) by the
    overlap of their metaphone-encoded name tokens."""
    query_values, result_values = type_pair(query, result, registry.name)
    encoded_query = list(map(_metaphone_tokens, query_values))
    encoded_result = list(map(_metaphone_tokens, result_values))
    return _token_names_compare(encoded_query, encoded_result)
127
+
128
+
129
def _soundex_tokens(token: str) -> List[str]:
    """Split a cleaned name into words (minimum length 2) and encode each
    with ``soundex_token``."""
    # NOTE(review): `cleaned` is None for names that cannot be latinized;
    # this relies on `name_words` accepting None - confirm.
    cleaned = _clean_phonetic_entity(token)
    return [soundex_token(word) for word in name_words(cleaned, min_length=2)]
134
+
135
+
136
def name_soundex_match(query: E, result: E) -> float:
    """Score name similarity between two entities (person or not) by the
    overlap of their soundex-encoded name tokens."""
    query_values, result_values = type_pair(query, result, registry.name)
    encoded_query = list(map(_soundex_tokens, query_values))
    encoded_result = list(map(_soundex_tokens, result_values))
    return _token_names_compare(encoded_query, encoded_result)
File without changes
@@ -0,0 +1,124 @@
1
+ from typing import Set, Type
2
+
3
+ from rigour.ids import get_identifier_format, IdentifierFormat
4
+ from followthemoney import model
5
+ from followthemoney.property import Property
6
+ from followthemoney.types import registry
7
+ from followthemoney.proxy import EntityProxy
8
+
9
+ from nomenklatura.matching.types import FtResult, ScoringConfig
10
+
11
+
12
def _format_normalize(
    format: Type[IdentifierFormat], entity: EntityProxy, prop: Property
) -> Set[str]:
    """Collect the entity's values for ``prop`` as normalised by ``format``.

    Values for which ``format.normalize`` returns ``None`` are dropped.
    """
    normalized = (format.normalize(value) for value in entity.get(prop, quiet=True))
    return {value for value in normalized if value is not None}
21
+
22
+
23
def _identifier_format_match(
    format_name: str, query: EntityProxy, result: EntityProxy
) -> FtResult:
    """Check whether two entities share an identifier of the given format.

    Walks the matchable identifier properties of the entities' common schema,
    skipping properties declared with a *different* format. Values are
    normalised with the requested format and collected in two pools per
    entity: all candidate identifiers, and those from properties declared
    with exactly this format.

    Scores:
      * 1.0 - an in-format value of one entity equals any candidate of the other.
      * 0.8 - the format is flagged ``STRONG`` and only undeclared-format
        values coincide.
      * 0.0 - otherwise.

    Matched identifiers are listed in sorted order so the detail text does not
    depend on set iteration order.
    """
    schema = model.common_schema(query.schema, result.schema)
    id_format = get_identifier_format(format_name)
    query_identifiers: Set[str] = set()
    query_format: Set[str] = set()
    result_identifiers: Set[str] = set()
    result_format: Set[str] = set()
    for prop in schema.properties.values():
        if prop.type != registry.identifier or not prop.matchable:
            continue
        declared = prop.format is not None
        if declared and get_identifier_format(prop.format) != id_format:
            continue
        query_values = _format_normalize(id_format, query, prop)
        result_values = _format_normalize(id_format, result, prop)
        query_identifiers.update(query_values)
        result_identifiers.update(result_values)
        # A declared format that survived the check above is the requested
        # one, so no second registry lookup is needed.
        if declared:
            query_format.update(query_values)
            result_format.update(result_values)
    left_common = query_format.intersection(result_identifiers)
    if len(left_common) > 0:
        detail = f"Matched {id_format.TITLE}: {', '.join(sorted(left_common))}"
        return FtResult(score=1.0, detail=detail)
    right_common = result_format.intersection(query_identifiers)
    if len(right_common) > 0:
        detail = f"Matched {id_format.TITLE}: {', '.join(sorted(right_common))}"
        return FtResult(score=1.0, detail=detail)
    if id_format.STRONG:
        non_common = query_identifiers.intersection(result_identifiers)
        if len(non_common) > 0:
            detail = f"Out-of-format match: {', '.join(sorted(non_common))}"
            return FtResult(score=0.8, detail=detail)
    return FtResult(score=0.0, detail=f"No {id_format.TITLE} match")
59
+
60
+
61
def lei_code_match(
    query: EntityProxy, result: EntityProxy, config: ScoringConfig
) -> FtResult:
    """Score whether both entities carry the same Legal Entity Identifier,
    using the ``lei`` identifier format. ``config`` is not used here."""
    return _identifier_format_match(format_name="lei", query=query, result=result)
66
+
67
+
68
def bic_code_match(
    query: EntityProxy, result: EntityProxy, config: ScoringConfig
) -> FtResult:
    """Score whether both entities carry the same SWIFT BIC, using the
    ``bic`` identifier format. ``config`` is not used here."""
    return _identifier_format_match(format_name="bic", query=query, result=result)
73
+
74
+
75
def ogrn_code_match(
    query: EntityProxy, result: EntityProxy, config: ScoringConfig
) -> FtResult:
    """Score whether both entities share a Russian company registration
    (OGRN) code, using the ``ogrn`` identifier format. ``config`` is not
    used here."""
    return _identifier_format_match(format_name="ogrn", query=query, result=result)
80
+
81
+
82
def inn_code_match(
    query: EntityProxy, result: EntityProxy, config: ScoringConfig
) -> FtResult:
    """Score whether both entities share a Russian tax identifier (INN),
    using the ``inn`` identifier format. ``config`` is not used here."""
    return _identifier_format_match(format_name="inn", query=query, result=result)
87
+
88
+
89
def uei_code_match(
    query: EntityProxy, result: EntityProxy, config: ScoringConfig
) -> FtResult:
    """Score whether both entities share a US Unique Entity ID (UEI), using
    the ``uei`` identifier format. ``config`` is not used here."""
    return _identifier_format_match(format_name="uei", query=query, result=result)
94
+
95
+
96
def npi_code_match(
    query: EntityProxy, result: EntityProxy, config: ScoringConfig
) -> FtResult:
    """Score whether both entities share a US National Provider Identifier
    (NPI), using the ``npi`` identifier format. ``config`` is not used here."""
    return _identifier_format_match(format_name="npi", query=query, result=result)
101
+
102
+
103
def isin_security_match(
    query: EntityProxy, result: EntityProxy, config: ScoringConfig
) -> FtResult:
    """Score whether two entities share the same ISIN, using the ``isin``
    identifier format. ``config`` is not used here."""
    # No schema restriction is applied here: the comparison runs for any
    # pair of entities, not only for securities.
    return _identifier_format_match(format_name="isin", query=query, result=result)
110
+
111
+
112
def vessel_imo_mmsi_match(
    query: EntityProxy, result: EntityProxy, config: ScoringConfig
) -> FtResult:
    """Score whether two vessels share an IMO number or, failing that, an MMSI.

    A positive IMO result (``imo`` identifier format) is returned as-is.
    Otherwise the ``mmsi`` values of both entities are compared with the
    identifier type's ``compare_sets`` and that score is reported.
    ``config`` is not used here.
    """
    imo_result = _identifier_format_match("imo", query, result)
    if imo_result.score > 0.0:
        return imo_result
    mmsi_score = registry.identifier.compare_sets(
        query.get("mmsi", quiet=True),
        result.get("mmsi", quiet=True),
    )
    detail = "MMSI match" if mmsi_score > 0.0 else "No IMO or MMSI match"
    return FtResult(score=mmsi_score, detail=detail)
@@ -0,0 +1,98 @@
1
+ from typing import Dict, List
2
+
3
+ from nomenklatura.matching.types import Feature, HeuristicAlgorithm
4
+ from nomenklatura.matching.types import ConfigVar, ConfigVarType
5
+ from nomenklatura.matching.compare.countries import country_mismatch
6
+ from nomenklatura.matching.compare.gender import gender_mismatch
7
+ from nomenklatura.matching.compare.identifiers import crypto_wallet_address
8
+ from nomenklatura.matching.compare.identifiers import identifier_match
9
+ from nomenklatura.matching.compare.dates import dob_day_disjoint, dob_year_disjoint
10
+ from nomenklatura.matching.compare.names import weak_alias_match
11
+ from nomenklatura.matching.compare.addresses import address_entity_match
12
+ from nomenklatura.matching.compare.addresses import address_prop_match
13
+ from nomenklatura.matching.logic_v2.names.match import name_match
14
+ from nomenklatura.matching.logic_v2.identifiers import bic_code_match
15
+ from nomenklatura.matching.logic_v2.identifiers import inn_code_match, ogrn_code_match
16
+ from nomenklatura.matching.logic_v2.identifiers import isin_security_match
17
+ from nomenklatura.matching.logic_v2.identifiers import lei_code_match
18
+ from nomenklatura.matching.logic_v2.identifiers import vessel_imo_mmsi_match
19
+ from nomenklatura.matching.logic_v2.identifiers import uei_code_match
20
+ from nomenklatura.matching.logic_v2.identifiers import npi_code_match
21
+ from nomenklatura.matching.util import FNUL
22
+
23
+
24
class LogicV2(HeuristicAlgorithm):
    """Rule-based matcher: name and identifier features each produce a base
    score, and the strongest of them is then adjusted by qualifier features
    that support or contradict the match. Compared with version 1, all name
    matching is consolidated into the single ``name_match`` feature, whose
    behaviour is tuned through the ``nm_*`` options in ``CONFIG``."""

    NAME = "logic-v2"
    features = [
        # Main features: only the best weighted one contributes to the score.
        Feature(func=name_match, weight=1.0),
        Feature(func=address_entity_match, weight=0.98),
        Feature(func=crypto_wallet_address, weight=0.98),
        Feature(func=isin_security_match, weight=0.98),
        Feature(func=lei_code_match, weight=0.95),
        Feature(func=ogrn_code_match, weight=0.95),
        Feature(func=vessel_imo_mmsi_match, weight=0.95),
        Feature(func=inn_code_match, weight=0.95),
        Feature(func=bic_code_match, weight=0.95),
        Feature(func=uei_code_match, weight=0.95),
        Feature(func=npi_code_match, weight=0.95),
        Feature(func=identifier_match, weight=0.85),
        Feature(func=weak_alias_match, weight=0.8),
        # Qualifiers: each one's weighted score is added to the best main score.
        Feature(func=address_prop_match, weight=0.2, qualifier=True),
        Feature(func=country_mismatch, weight=-0.2, qualifier=True),
        Feature(func=dob_year_disjoint, weight=-0.15, qualifier=True),
        Feature(func=dob_day_disjoint, weight=-0.25, qualifier=True),
        Feature(func=gender_mismatch, weight=-0.2, qualifier=True),
    ]
    CONFIG = {
        "nm_number_mismatch": ConfigVar(
            type=ConfigVarType.FLOAT,
            description="Penalty for mismatching numbers in object or company names.",
            default=0.3,
        ),
        "nm_extra_query_name": ConfigVar(
            type=ConfigVarType.FLOAT,
            description="Weight for name parts in the query not matched to the result.",
            default=0.8,
        ),
        "nm_extra_result_name": ConfigVar(
            type=ConfigVarType.FLOAT,
            description="Weight for name parts in the result not matched to the query.",
            default=0.2,
        ),
        "nm_family_name_weight": ConfigVar(
            type=ConfigVarType.FLOAT,
            description="Extra weight multiplier for family name in person matches (John Smith vs. John Gruber is clearly distinct).",
            default=1.3,
        ),
        "nm_fuzzy_cutoff_factor": ConfigVar(
            type=ConfigVarType.FLOAT,
            description="Extra factor for when a fuzzy match is triggered in name matching. "
            "Below a certain threshold, a fuzzy match is considered as a non-match (score = 0.0). "
            "Adjusting this multiplier will raise this threshold, making a fuzzy match trigger more leniently.",
            default=1.0,
        ),
    }

    @classmethod
    def compute_score(
        cls, scores: Dict[str, float], weights: Dict[str, float]
    ) -> float:
        """Fold per-feature scores into one overall score.

        Takes the largest ``score * weight`` among non-qualifier features,
        then adds ``score * weight`` for every qualifier in declaration
        order. Features missing from either mapping fall back to ``FNUL``.
        """

        def weighted(name: str) -> float:
            return scores.get(name, FNUL) * weights.get(name, FNUL)

        mains: List[float] = [
            weighted(feat.name) for feat in cls.features if not feat.qualifier
        ]
        total = max(mains)
        for feat in cls.features:
            if feat.qualifier:
                total += weighted(feat.name)
        return total
@@ -0,0 +1,3 @@
1
+ from nomenklatura.matching.logic_v2.names.match import name_match
2
+
3
+ __all__ = ["name_match"]
@@ -0,0 +1,51 @@
1
+ from typing import Set
2
+ from rigour.names import NameTypeTag, Name
3
+ from rigour.names import replace_org_types_compare, prenormalize_name
4
+ from rigour.names import remove_person_prefixes, remove_org_prefixes
5
+ from rigour.names import tag_org_name, tag_person_name, normalize_name
6
+ from followthemoney import registry, EntityProxy
7
+ from followthemoney.names import PROP_PART_TAGS
8
+
9
+
10
def entity_names(
    type_tag: NameTypeTag, entity: EntityProxy, is_query: bool = False
) -> Set[Name]:
    """Turn an entity's matchable names into a set of tagged ``Name`` objects.

    Each name is pre-normalised into a comparison form. Person names first
    lose their prefixes (e.g. "Mr.", "Dr."). Organisation and generic entity
    names have organisation types replaced by the form used for comparison
    and organisation prefixes removed. Names whose form has already been seen
    are skipped, which reduces the number of comparisons needed later.

    Name parts are tagged from the entity's own properties (per
    ``PROP_PART_TAGS``), followed by organisation or person tagging depending
    on ``type_tag``. ``is_query`` is passed to the person tagger as
    ``any_initials``.
    """
    is_person = type_tag == NameTypeTag.PER
    is_org = type_tag in (NameTypeTag.ORG, NameTypeTag.ENT)
    seen_forms: Set[str] = set()
    tagged: Set[Name] = set()
    for raw_name in entity.get_type_values(registry.name, matchable=True):
        # Remove prefix like "Mr.", "Ms.", "Dr." from the name:
        name = remove_person_prefixes(raw_name) if is_person else raw_name

        form = prenormalize_name(name)
        if is_org:
            # Replace organization types with their canonical form, e.g. "Limited Liability Company" -> "LLC"
            form = replace_org_types_compare(form, normalizer=prenormalize_name)
            # Remove organization prefixes like "The"
            form = remove_org_prefixes(form)

        # De-duplicate on the comparison form, not the raw name.
        if form in seen_forms:
            continue
        seen_forms.add(form)

        sname = Name(name, form=form, tag=type_tag)
        # Tag name parts from properties:
        for prop, tag in PROP_PART_TAGS:
            for value in entity.get(prop, quiet=True):
                sname.tag_text(prenormalize_name(value), tag)

        # Tag organization types and symbols:
        if is_org:
            tag_org_name(sname, normalize_name)

        if is_person:
            tag_person_name(sname, normalize_name, any_initials=is_query)

        # TODO: should we tag phonetic tokens here?
        tagged.add(sname)
    return tagged