nomenklatura-mpt 4.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nomenklatura/__init__.py +11 -0
- nomenklatura/cache.py +194 -0
- nomenklatura/cli.py +260 -0
- nomenklatura/conflicting_match.py +80 -0
- nomenklatura/data/er-unstable.pkl +0 -0
- nomenklatura/data/regression-v1.pkl +0 -0
- nomenklatura/db.py +139 -0
- nomenklatura/delta.py +4 -0
- nomenklatura/enrich/__init__.py +94 -0
- nomenklatura/enrich/aleph.py +141 -0
- nomenklatura/enrich/common.py +219 -0
- nomenklatura/enrich/nominatim.py +72 -0
- nomenklatura/enrich/opencorporates.py +233 -0
- nomenklatura/enrich/openfigi.py +124 -0
- nomenklatura/enrich/permid.py +201 -0
- nomenklatura/enrich/wikidata.py +268 -0
- nomenklatura/enrich/yente.py +116 -0
- nomenklatura/exceptions.py +9 -0
- nomenklatura/index/__init__.py +5 -0
- nomenklatura/index/common.py +24 -0
- nomenklatura/index/entry.py +89 -0
- nomenklatura/index/index.py +170 -0
- nomenklatura/index/tokenizer.py +92 -0
- nomenklatura/judgement.py +21 -0
- nomenklatura/kv.py +40 -0
- nomenklatura/matching/__init__.py +47 -0
- nomenklatura/matching/bench.py +32 -0
- nomenklatura/matching/compare/__init__.py +0 -0
- nomenklatura/matching/compare/addresses.py +71 -0
- nomenklatura/matching/compare/countries.py +15 -0
- nomenklatura/matching/compare/dates.py +83 -0
- nomenklatura/matching/compare/gender.py +15 -0
- nomenklatura/matching/compare/identifiers.py +30 -0
- nomenklatura/matching/compare/names.py +157 -0
- nomenklatura/matching/compare/util.py +51 -0
- nomenklatura/matching/compat.py +66 -0
- nomenklatura/matching/erun/__init__.py +0 -0
- nomenklatura/matching/erun/countries.py +42 -0
- nomenklatura/matching/erun/identifiers.py +64 -0
- nomenklatura/matching/erun/misc.py +71 -0
- nomenklatura/matching/erun/model.py +110 -0
- nomenklatura/matching/erun/names.py +126 -0
- nomenklatura/matching/erun/train.py +135 -0
- nomenklatura/matching/erun/util.py +28 -0
- nomenklatura/matching/logic_v1/__init__.py +0 -0
- nomenklatura/matching/logic_v1/identifiers.py +104 -0
- nomenklatura/matching/logic_v1/model.py +76 -0
- nomenklatura/matching/logic_v1/multi.py +21 -0
- nomenklatura/matching/logic_v1/phonetic.py +142 -0
- nomenklatura/matching/logic_v2/__init__.py +0 -0
- nomenklatura/matching/logic_v2/identifiers.py +124 -0
- nomenklatura/matching/logic_v2/model.py +98 -0
- nomenklatura/matching/logic_v2/names/__init__.py +3 -0
- nomenklatura/matching/logic_v2/names/analysis.py +51 -0
- nomenklatura/matching/logic_v2/names/distance.py +181 -0
- nomenklatura/matching/logic_v2/names/magic.py +60 -0
- nomenklatura/matching/logic_v2/names/match.py +195 -0
- nomenklatura/matching/logic_v2/names/pairing.py +81 -0
- nomenklatura/matching/logic_v2/names/util.py +89 -0
- nomenklatura/matching/name_based/__init__.py +4 -0
- nomenklatura/matching/name_based/misc.py +86 -0
- nomenklatura/matching/name_based/model.py +59 -0
- nomenklatura/matching/name_based/names.py +59 -0
- nomenklatura/matching/pairs.py +42 -0
- nomenklatura/matching/regression_v1/__init__.py +0 -0
- nomenklatura/matching/regression_v1/misc.py +75 -0
- nomenklatura/matching/regression_v1/model.py +110 -0
- nomenklatura/matching/regression_v1/names.py +63 -0
- nomenklatura/matching/regression_v1/train.py +87 -0
- nomenklatura/matching/regression_v1/util.py +31 -0
- nomenklatura/matching/svm_v1/__init__.py +5 -0
- nomenklatura/matching/svm_v1/misc.py +94 -0
- nomenklatura/matching/svm_v1/model.py +168 -0
- nomenklatura/matching/svm_v1/names.py +81 -0
- nomenklatura/matching/svm_v1/train.py +186 -0
- nomenklatura/matching/svm_v1/util.py +30 -0
- nomenklatura/matching/types.py +227 -0
- nomenklatura/matching/util.py +62 -0
- nomenklatura/publish/__init__.py +0 -0
- nomenklatura/publish/dates.py +49 -0
- nomenklatura/publish/edges.py +32 -0
- nomenklatura/py.typed +0 -0
- nomenklatura/resolver/__init__.py +6 -0
- nomenklatura/resolver/common.py +2 -0
- nomenklatura/resolver/edge.py +107 -0
- nomenklatura/resolver/identifier.py +60 -0
- nomenklatura/resolver/linker.py +101 -0
- nomenklatura/resolver/resolver.py +565 -0
- nomenklatura/settings.py +17 -0
- nomenklatura/store/__init__.py +41 -0
- nomenklatura/store/base.py +130 -0
- nomenklatura/store/level.py +272 -0
- nomenklatura/store/memory.py +102 -0
- nomenklatura/store/redis_.py +131 -0
- nomenklatura/store/sql.py +219 -0
- nomenklatura/store/util.py +48 -0
- nomenklatura/store/versioned.py +371 -0
- nomenklatura/tui/__init__.py +17 -0
- nomenklatura/tui/app.py +294 -0
- nomenklatura/tui/app.tcss +52 -0
- nomenklatura/tui/comparison.py +81 -0
- nomenklatura/tui/util.py +35 -0
- nomenklatura/util.py +26 -0
- nomenklatura/versions.py +119 -0
- nomenklatura/wikidata/__init__.py +14 -0
- nomenklatura/wikidata/client.py +122 -0
- nomenklatura/wikidata/lang.py +94 -0
- nomenklatura/wikidata/model.py +139 -0
- nomenklatura/wikidata/props.py +70 -0
- nomenklatura/wikidata/qualified.py +49 -0
- nomenklatura/wikidata/query.py +66 -0
- nomenklatura/wikidata/value.py +87 -0
- nomenklatura/xref.py +125 -0
- nomenklatura_mpt-4.1.9.dist-info/METADATA +159 -0
- nomenklatura_mpt-4.1.9.dist-info/RECORD +118 -0
- nomenklatura_mpt-4.1.9.dist-info/WHEEL +4 -0
- nomenklatura_mpt-4.1.9.dist-info/entry_points.txt +3 -0
- nomenklatura_mpt-4.1.9.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,86 @@
|
|
1
|
+
from itertools import product
|
2
|
+
from typing import Iterable, Set
|
3
|
+
from prefixdate import Precision
|
4
|
+
from followthemoney.proxy import E
|
5
|
+
from followthemoney.types import registry
|
6
|
+
from rigour.text.distance import levenshtein
|
7
|
+
from rigour.ids import StrictFormat
|
8
|
+
|
9
|
+
from nomenklatura.matching.compare.util import clean_map
|
10
|
+
from nomenklatura.matching.types import FtResult, ScoringConfig
|
11
|
+
from nomenklatura.matching.util import has_schema, props_pair, type_pair
|
12
|
+
|
13
|
+
|
14
|
+
def _dates_precision(values: Iterable[str], precision: Precision) -> Set[str]:
    """Cut date strings down to a precision, discarding those that are too short.

    ``precision.value`` is used as a character count: a value with fewer
    characters is skipped, every other value is truncated to exactly that many
    characters. The distinct prefixes are returned as a set.
    """
    length = precision.value
    return {text[:length] for text in values if len(text) >= length}
|
20
|
+
|
21
|
+
|
22
|
+
def _flip_day_month(value: str) -> str:
    """Swap the second and third dash-separated components of a date string.

    Mixing up day and month is such a common mistake that we want to
    accommodate it. The value is split on its first two dashes only, so it has
    to contain at least two of them; otherwise the unpacking raises ValueError.
    """
    year, month, day = value.split("-", 2)
    return "-".join((year, day, month))
|
26
|
+
|
27
|
+
|
28
|
+
def dob_day_disjoint(query: E, result: E, config: ScoringConfig) -> FtResult:
    """The birth date of the two entities is not the same."""
    # Scores returned:
    #   0.0 - either side has no birth date, no day-precision birth date, or
    #         both sides share at least one day
    #   0.5 - the sides only agree after swapping day and month on the query
    #   1.0 - the day-precision birth dates are fully disjoint
    # `config` is accepted for signature compatibility and is not read here.
    query_dates, result_dates = props_pair(query, result, ["birthDate"])
    if len(query_dates) == 0 or len(result_dates) == 0:
        return FtResult(score=0.0, detail="No birth dates provided")
    result_days = _dates_precision(result_dates, Precision.DAY)
    query_days = _dates_precision(query_dates, Precision.DAY)
    if not result_days or not query_days:
        return FtResult(score=0.0, detail="Birth days don't include day precision")
    overlap = query_days.intersection(result_days)
    if overlap:
        # Sorted: iteration order of a set of strings differs between
        # processes, which made the explanation text unstable.
        return FtResult(
            score=0.0, detail=f"Birth day match: {', '.join(sorted(overlap))}"
        )
    query_flipped = {_flip_day_month(day) for day in query_days}
    overlap = query_flipped.intersection(result_days)
    if overlap:
        detail = f"Birth day flipped match: {', '.join(sorted(overlap))}"
        return FtResult(score=0.5, detail=detail)
    return FtResult(score=1.0, detail="Birth day mis-match")
|
46
|
+
|
47
|
+
|
48
|
+
def dob_year_disjoint(query: E, result: E, config: ScoringConfig) -> FtResult:
    """The birth year of the two entities is not the same."""
    # Only the year prefix of each birth date is compared. The score is 0.0
    # when either side has no usable year or when a year is shared, and 1.0
    # when the years are disjoint.
    # `config` is accepted for signature compatibility and is not read here.
    query_dates, result_dates = props_pair(query, result, ["birthDate"])
    query_years = _dates_precision(query_dates, Precision.YEAR)
    result_years = _dates_precision(result_dates, Precision.YEAR)
    if not query_years or not result_years:
        return FtResult(score=0.0, detail="No birth years provided")
    overlap = query_years.intersection(result_years)
    if overlap:
        # Sorted: iteration order of a set of strings differs between
        # processes, which made the explanation text unstable.
        detail = f"Birth year match: {', '.join(sorted(overlap))}"
        return FtResult(score=0.0, detail=detail)
    return FtResult(score=1.0, detail="Birth year mis-match")
|
60
|
+
|
61
|
+
|
62
|
+
def orgid_disjoint(query: E, result: E, config: ScoringConfig) -> FtResult:
    """Two companies or organizations have different tax identifiers or registration
    numbers."""
    # `config` is accepted for signature compatibility and is not read here.
    if not has_schema(query, result, "Organization"):
        return FtResult(score=0.0, detail="Neither entity is an organization")
    query_ids_, result_ids_ = type_pair(query, result, registry.identifier)
    query_ids = clean_map(query_ids_, StrictFormat.normalize)
    result_ids = clean_map(result_ids_, StrictFormat.normalize)
    if not len(query_ids) or not len(result_ids):
        return FtResult(score=0.0, detail="Neither entity has identifiers")
    common = query_ids.intersection(result_ids)
    if len(common) > 0:
        # Sorted here and below: joining a set of strings directly yields an
        # order that changes from process to process.
        return FtResult(
            score=0.0, detail="Common identifiers: %s" % ", ".join(sorted(common))
        )
    # Near-identical identifiers (similarity above 0.7) soften the penalty: the
    # closest such pair determines how much is taken off the full score of 1.
    # Two empty identifiers cannot reach this loop (they would be equal and
    # thus reported as common), so max_len is never zero.
    max_ratio = 0.0
    for query_id, result_id in product(query_ids, result_ids):
        distance = levenshtein(query_id, result_id)
        max_len = max(len(query_id), len(result_id))
        ratio = 1.0 - (distance / float(max_len))
        if ratio > 0.7:
            max_ratio = max(max_ratio, ratio)
    detail = "Mismatched identifiers: %s vs %s" % (
        ", ".join(sorted(query_ids)),
        ", ".join(sorted(result_ids)),
    )
    return FtResult(score=1 - max_ratio, detail=detail)
|
@@ -0,0 +1,59 @@
|
|
1
|
+
from typing import Dict
|
2
|
+
|
3
|
+
from nomenklatura.matching.types import Feature, HeuristicAlgorithm
|
4
|
+
from nomenklatura.matching.compare.countries import country_mismatch
|
5
|
+
from nomenklatura.matching.compare.gender import gender_mismatch
|
6
|
+
from nomenklatura.matching.name_based.misc import orgid_disjoint
|
7
|
+
from nomenklatura.matching.name_based.misc import dob_day_disjoint, dob_year_disjoint
|
8
|
+
from nomenklatura.matching.name_based.names import jaro_name_parts
|
9
|
+
from nomenklatura.matching.name_based.names import soundex_name_parts
|
10
|
+
|
11
|
+
|
12
|
+
class NameMatcher(HeuristicAlgorithm):
    """An algorithm that matches on entity name, using phonetic comparisons and edit
    distance to generate potential matches. This implementation is vaguely based on
    the behaviour proposed by the US OFAC documentation (FAQ #249)."""

    # Try to re-produce results from: https://sanctionssearch.ofac.treas.gov/
    # cf. https://ofac.treasury.gov/faqs/topic/1636

    NAME = "name-based"
    # Jaro-Winkler and Soundex comparisons carry the same default weight.
    features = [
        Feature(func=jaro_name_parts, weight=0.5),
        Feature(func=soundex_name_parts, weight=0.5),
    ]

    @classmethod
    def compute_score(
        cls, scores: Dict[str, float], weights: Dict[str, float]
    ) -> float:
        """Return the weighted sum of the feature scores.

        Features are looked up by name in both mappings; a feature missing
        from either one contributes nothing to the total.
        """
        total = 0.0
        for name in (feature.name for feature in cls.features):
            total += scores.get(name, 0.0) * weights.get(name, 0.0)
        return total
|
34
|
+
|
35
|
+
|
36
|
+
class NameQualifiedMatcher(HeuristicAlgorithm):
    """Same as the name-based algorithm, but scores will be reduced if a mis-match
    of birth dates and nationalities is found for persons, or different
    tax/registration identifiers are included for organizations and companies."""

    NAME = "name-qualified"
    # The two name features build the score; every qualifier carries a
    # negative weight, so a qualifier that fires lowers the total.
    features = [
        Feature(func=jaro_name_parts, weight=0.5),
        Feature(func=soundex_name_parts, weight=0.5),
        Feature(func=country_mismatch, weight=-0.1, qualifier=True),
        Feature(func=dob_year_disjoint, weight=-0.1, qualifier=True),
        Feature(func=dob_day_disjoint, weight=-0.15, qualifier=True),
        Feature(func=gender_mismatch, weight=-0.1, qualifier=True),
        Feature(func=orgid_disjoint, weight=-0.1, qualifier=True),
    ]

    @classmethod
    def compute_score(
        cls, scores: Dict[str, float], weights: Dict[str, float]
    ) -> float:
        """Return the weighted sum of the feature scores.

        Features are looked up by name in both mappings; a feature missing
        from either one contributes nothing to the total.
        """
        total = 0.0
        for name in (feature.name for feature in cls.features):
            total += scores.get(name, 0.0) * weights.get(name, 0.0)
        return total
|
@@ -0,0 +1,59 @@
|
|
1
|
+
from typing import List, Optional, Tuple
|
2
|
+
from followthemoney.proxy import E
|
3
|
+
from followthemoney.types import registry
|
4
|
+
from rigour.text.distance import jaro_winkler
|
5
|
+
from rigour.text.phonetics import soundex
|
6
|
+
|
7
|
+
from nomenklatura.matching.types import FtResult, ScoringConfig
|
8
|
+
from nomenklatura.matching.util import type_pair
|
9
|
+
from nomenklatura.matching.compat import names_word_list
|
10
|
+
|
11
|
+
|
12
|
+
def _soundex_token(token: str) -> str:
    """Return the phonetic key for a name token, or the upper-cased token.

    Only purely alphabetic tokens longer than one character are passed to
    ``soundex``. Everything else, and any token for which ``soundex`` returns
    an empty string, falls back to the token in upper case.
    """
    if len(token) <= 1 or not token.isalpha():
        return token.upper()
    encoded = soundex(token)
    # doesn't handle non-ascii characters
    return encoded if len(encoded) else token.upper()
|
19
|
+
|
20
|
+
|
21
|
+
def soundex_name_parts(query: E, result: E, config: ScoringConfig) -> FtResult:
    """Compare two sets of name parts using the phonetic matching."""
    # The score is the number of shared phonetic keys divided by the size of
    # the smaller key set, so a short name fully contained in a longer one
    # scores 1.0.
    # `config` is accepted for signature compatibility and is not read here.
    query_names_, result_names_ = type_pair(query, result, registry.name)
    query_soundex = {_soundex_token(p) for p in names_word_list(query_names_)}
    result_soundex = {_soundex_token(p) for p in names_word_list(result_names_)}
    overlap = query_soundex.intersection(result_soundex)
    if not overlap:
        return FtResult(score=0.0, detail=None)
    min_len = min(len(query_soundex), len(result_soundex))
    score = len(overlap) / float(max(1.0, min_len))
    # Sorted: iteration order of a set of strings differs between processes,
    # which made the explanation text unstable.
    detail = f"Matched {len(overlap)} tokens: {', '.join(sorted(overlap))}"
    return FtResult(score=score, detail=detail)
|
33
|
+
|
34
|
+
|
35
|
+
def jaro_name_parts(query: E, result: E, config: ScoringConfig) -> FtResult:
    """Compare two sets of name parts using the Jaro-Winkler string similarity
    algorithm."""
    # `config` is accepted for signature compatibility and is not read here.
    query_names_, result_names_ = type_pair(query, result, registry.name)
    # Both token sets are sorted before use. Iterating the raw sets made the
    # winner among equally similar candidates, and the order of the detail
    # text, depend on per-process string hashing.
    result_parts = sorted(set(names_word_list(result_names_)))
    query_parts = sorted(set(names_word_list(query_names_)))
    similarities: List[float] = []
    tokens: List[Tuple[str, str]] = []
    for part in query_parts:
        best = 0.0
        best_token: Optional[str] = None

        for other in result_parts:
            part_similarity = jaro_winkler(part, other)
            # Candidates at or below 0.5 are ignored; the strict comparison
            # keeps the first candidate when several are equally similar.
            if part_similarity > 0.5 and part_similarity > best:
                best = part_similarity
                best_token = other

        # A query part without a good candidate still counts, with 0.0, so
        # unmatched parts pull the average down.
        similarities.append(best)
        if best_token is not None:
            tokens.append((part, best_token))
    if not similarities:
        return FtResult(score=0.0, detail=None)
    score = sum(similarities) / float(len(similarities))
    mapping = ", ".join(f"{a} -> {b}" for a, b in tokens)
    return FtResult(score=score, detail=f"Matched {len(tokens)} tokens: {mapping}")
|
@@ -0,0 +1,42 @@
|
|
1
|
+
import json
|
2
|
+
from typing import Generator, Dict, Any
|
3
|
+
from followthemoney.proxy import EntityProxy
|
4
|
+
from followthemoney.util import PathLike
|
5
|
+
|
6
|
+
from nomenklatura.judgement import Judgement
|
7
|
+
|
8
|
+
|
9
|
+
class JudgedPair(object):
    """Two entities together with the judgement a user has made about whether
    they are the same (or not)."""

    __slots__ = ("left", "right", "weight", "judgement")

    def __init__(
        self, left: EntityProxy, right: EntityProxy, judgement: Judgement
    ) -> None:
        self.left, self.right = left, right
        self.judgement = judgement
        # Every pair starts out with a weight of zero.
        self.weight = 0.0

    def to_dict(self) -> Dict[str, Any]:
        """Serialise the pair: both entities as dicts, the judgement by its
        value, and the weight."""
        payload: Dict[str, Any] = {}
        payload["left"] = self.left.to_dict()
        payload["right"] = self.right.to_dict()
        payload["judgement"] = self.judgement.value
        payload["weight"] = self.weight
        return payload
|
30
|
+
|
31
|
+
|
32
|
+
def read_pairs(pairs_file: PathLike) -> Generator[JudgedPair, None, None]:
    """Read judgement pairs (training data) from a line-delimited JSON file.

    Each non-blank line must hold one JSON object with the keys ``left``,
    ``right`` (entity dicts) and ``judgement``. Only pairs judged positive or
    negative are yielded; all other judgements are skipped.

    Raises whatever ``json.loads``, ``EntityProxy.from_dict`` or ``Judgement``
    raise for malformed lines, and ``KeyError`` for a missing key.
    """
    # JSON text is UTF-8; do not fall back to the platform's locale encoding.
    with open(pairs_file, "r", encoding="utf-8") as fh:
        for line in fh:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing in json.loads.
            if not line.strip():
                continue
            data = json.loads(line)
            left_entity = EntityProxy.from_dict(data["left"])
            right_entity = EntityProxy.from_dict(data["right"])
            judgement = Judgement(data["judgement"])
            if judgement not in (Judgement.POSITIVE, Judgement.NEGATIVE):
                continue
            yield JudgedPair(left_entity, right_entity, judgement)
|
File without changes
|
@@ -0,0 +1,75 @@
|
|
1
|
+
from followthemoney.proxy import E
|
2
|
+
from followthemoney.types import registry
|
3
|
+
|
4
|
+
from nomenklatura.matching.regression_v1.util import tokenize_pair, compare_levenshtein
|
5
|
+
from nomenklatura.matching.compare.util import has_overlap, extract_numbers, is_disjoint
|
6
|
+
from nomenklatura.matching.util import props_pair, type_pair
|
7
|
+
from nomenklatura.matching.util import max_in_sets, has_schema
|
8
|
+
from nomenklatura.matching.compat import clean_name_ascii
|
9
|
+
|
10
|
+
|
11
|
+
def birth_place(query: E, result: E) -> float:
    """Same place of birth."""
    # Token overlap of the birthPlace values. The divisor is the token count
    # of the smaller side but never less than 2, so a single shared token on
    # a one-token place yields 0.5 rather than 1.0.
    query_tokens, result_tokens = tokenize_pair(
        props_pair(query, result, ["birthPlace"])
    )
    shared = len(query_tokens.intersection(result_tokens))
    smaller = min(len(query_tokens), len(result_tokens))
    return float(shared) / float(max(2.0, smaller))
|
16
|
+
|
17
|
+
|
18
|
+
def address_match(query: E, result: E) -> float:
    """Text similarity between addresses."""
    # Every address is cleaned with clean_name_ascii, then the best
    # compare_levenshtein result over all query/result combinations is taken.
    # NOTE(review): tokenize() in regression_v1/util.py guards against
    # clean_name_ascii returning None; presumably max_in_sets copes with None
    # entries here - confirm.
    query_values, result_values = type_pair(query, result, registry.address)
    query_clean = list(map(clean_name_ascii, query_values))
    result_clean = list(map(clean_name_ascii, result_values))
    return max_in_sets(query_clean, result_clean, compare_levenshtein)
|
24
|
+
|
25
|
+
|
26
|
+
def address_numbers(query: E, result: E) -> float:
    """Compare the numbers found in the addresses of both entities: shared
    numbers add to the score, numbers only present on the query side subtract
    from it."""
    # The previous docstring was copied from the name-based variant and spoke
    # of names; since docstrings are published as feature descriptions, the
    # text now describes what is computed here.
    lv, rv = type_pair(query, result, registry.address)
    lvn = extract_numbers(lv)
    rvn = extract_numbers(rv)
    common = len(lvn.intersection(rvn))
    # One-sided on purpose: only query numbers missing from the result count.
    disjoint = len(lvn.difference(rvn))
    # Explicit float to honour the annotated return type.
    return float(common - disjoint)
|
34
|
+
|
35
|
+
|
36
|
+
def phone_match(query: E, result: E) -> float:
    """Matching phone numbers between the two entities."""
    # Binary feature: 1.0 as soon as one phone value is shared, else 0.0.
    query_phones, result_phones = type_pair(query, result, registry.phone)
    if has_overlap(query_phones, result_phones):
        return 1.0
    return 0.0
|
40
|
+
|
41
|
+
|
42
|
+
def email_match(query: E, result: E) -> float:
    """Matching email addresses between the two entities."""
    # Binary feature: 1.0 as soon as one email value is shared, else 0.0.
    query_emails, result_emails = type_pair(query, result, registry.email)
    if has_overlap(query_emails, result_emails):
        return 1.0
    return 0.0
|
46
|
+
|
47
|
+
|
48
|
+
def identifier_match(query: E, result: E) -> float:
    """Matching identifiers (e.g. passports, national ID cards, registration or
    tax numbers) between the two entities."""
    # Pairs for which has_schema reports "Organization" are left to
    # org_identifier_match and always score 0.0 here.
    if has_schema(query, result, "Organization"):
        return 0.0
    query_ids, result_ids = type_pair(query, result, registry.identifier)
    if has_overlap(query_ids, result_ids):
        return 1.0
    return 0.0
|
55
|
+
|
56
|
+
|
57
|
+
def org_identifier_match(query: E, result: E) -> float:
    """Matching identifiers (e.g. registration or tax numbers) between two
    organizations or companies."""
    # Counterpart of identifier_match: only pairs for which has_schema
    # reports "Organization" can score here.
    if has_schema(query, result, "Organization"):
        query_ids, result_ids = type_pair(query, result, registry.identifier)
        if has_overlap(query_ids, result_ids):
            return 1.0
    return 0.0
|
64
|
+
|
65
|
+
|
66
|
+
def gender_mismatch(query: E, result: E) -> float:
    """Both entities have a different gender associated with them."""
    # Binary feature driven by is_disjoint on the two "gender" value sets.
    query_genders, result_genders = props_pair(query, result, ["gender"])
    if is_disjoint(query_genders, result_genders):
        return 1.0
    return 0.0
|
70
|
+
|
71
|
+
|
72
|
+
def country_mismatch(query: E, result: E) -> float:
    """Both entities are linked to different countries."""
    # Binary feature driven by is_disjoint on all country-typed values.
    query_countries, result_countries = type_pair(query, result, registry.country)
    if is_disjoint(query_countries, result_countries):
        return 1.0
    return 0.0
|
@@ -0,0 +1,110 @@
|
|
1
|
+
import pickle
|
2
|
+
import numpy as np
|
3
|
+
from typing import List, Dict, Tuple, cast
|
4
|
+
from functools import cache
|
5
|
+
from sklearn.pipeline import Pipeline # type: ignore
|
6
|
+
from followthemoney.proxy import E
|
7
|
+
|
8
|
+
from nomenklatura.matching.regression_v1.names import first_name_match
|
9
|
+
from nomenklatura.matching.regression_v1.names import family_name_match
|
10
|
+
from nomenklatura.matching.regression_v1.names import name_levenshtein, name_match
|
11
|
+
from nomenklatura.matching.regression_v1.names import name_token_overlap, name_numbers
|
12
|
+
from nomenklatura.matching.regression_v1.misc import phone_match, email_match
|
13
|
+
from nomenklatura.matching.regression_v1.misc import address_match, address_numbers
|
14
|
+
from nomenklatura.matching.regression_v1.misc import identifier_match, birth_place
|
15
|
+
from nomenklatura.matching.regression_v1.misc import org_identifier_match
|
16
|
+
from nomenklatura.matching.regression_v1.misc import gender_mismatch
|
17
|
+
from nomenklatura.matching.regression_v1.misc import country_mismatch
|
18
|
+
from nomenklatura.matching.compare.dates import dob_matches, dob_year_matches
|
19
|
+
from nomenklatura.matching.compare.dates import dob_year_disjoint
|
20
|
+
from nomenklatura.matching.types import (
|
21
|
+
FeatureDocs,
|
22
|
+
FeatureDoc,
|
23
|
+
MatchingResult,
|
24
|
+
ScoringConfig,
|
25
|
+
)
|
26
|
+
from nomenklatura.matching.types import CompareFunction, FtResult
|
27
|
+
from nomenklatura.matching.types import Encoded, ScoringAlgorithm
|
28
|
+
from nomenklatura.matching.util import make_github_url
|
29
|
+
from nomenklatura.util import DATA_PATH
|
30
|
+
|
31
|
+
|
32
|
+
class RegressionV1(ScoringAlgorithm):
    """A simple matching algorithm based on a regression model."""

    NAME = "regression-v1"
    MODEL_PATH = DATA_PATH.joinpath(f"{NAME}.pkl")
    # The order matters: feature vectors are built in this order, and load()
    # refuses a stored model whose coefficient names differ from it.
    FEATURES: List[CompareFunction] = [
        name_match,
        name_token_overlap,
        name_numbers,
        name_levenshtein,
        phone_match,
        email_match,
        identifier_match,
        dob_matches,
        dob_year_matches,
        FtResult.unwrap(dob_year_disjoint),
        first_name_match,
        family_name_match,
        birth_place,
        gender_mismatch,
        country_mismatch,
        org_identifier_match,
        address_match,
        address_numbers,
    ]

    @classmethod
    def save(cls, pipe: Pipeline, coefficients: Dict[str, float]) -> None:
        """Persist a trained pipeline and its coefficients to MODEL_PATH.

        The cached result of load() is dropped afterwards so that the next
        call reads the freshly written file.
        """
        payload = {"pipe": pipe, "coefficients": coefficients}
        # Serialise first: the file is only opened (and truncated) once
        # pickling has succeeded.
        blob = pickle.dumps(payload)
        with open(cls.MODEL_PATH, "wb") as fh:
            fh.write(blob)
        cls.load.cache_clear()

    @classmethod
    @cache
    def load(cls) -> Tuple[Pipeline, Dict[str, float]]:
        """Read the stored pipeline and coefficients (cached after first use).

        Raises RuntimeError when the coefficient names in the file are not
        exactly the names of FEATURES, in the same order.
        """
        with open(cls.MODEL_PATH, "rb") as fh:
            # NOTE: unpickling can execute arbitrary code; MODEL_PATH must
            # only ever point at a trusted model file.
            stored = pickle.load(fh)
        pipe = cast(Pipeline, stored["pipe"])
        coefficients = cast(Dict[str, float], stored["coefficients"])
        expected = [feature.__name__ for feature in cls.FEATURES]
        if list(coefficients) != expected:
            raise RuntimeError("Model was not trained on identical features!")
        return pipe, coefficients

    @classmethod
    def get_feature_docs(cls) -> FeatureDocs:
        """Describe every feature: its docstring, its coefficient in the
        stored model and a link built by make_github_url."""
        _, coefficients = cls.load()
        docs: FeatureDocs = {}
        for feature in cls.FEATURES:
            key = feature.__name__
            docs[key] = FeatureDoc(
                description=feature.__doc__,
                coefficient=float(coefficients[key]),
                url=make_github_url(feature),
            )
        return docs

    @classmethod
    def compare(cls, query: E, result: E, config: ScoringConfig) -> MatchingResult:
        """Score a pair of entities with the stored regression pipeline.

        The score is the second column of predict_proba for the encoded pair.
        The explanations carry the raw feature values (not coefficients), one
        per feature name. `config` is not read here.
        """
        pipe, _ = cls.load()
        encoded = cls.encode_pair(query, result)
        probabilities = pipe.predict_proba(np.array([encoded]))
        score = cast(float, probabilities[0][1])
        explanations: Dict[str, FtResult] = {
            feature.__name__: FtResult(score=float(value), detail=None)
            for feature, value in zip(cls.FEATURES, encoded)
        }
        return MatchingResult.make(score=score, explanations=explanations)

    @classmethod
    def encode_pair(cls, left: E, right: E) -> Encoded:
        """Apply every feature function to the pair, in FEATURES order."""
        values = []
        for feature in cls.FEATURES:
            values.append(feature(left, right))
        return values
|
@@ -0,0 +1,63 @@
|
|
1
|
+
from typing import Iterable, Set
|
2
|
+
from followthemoney.proxy import E
|
3
|
+
from followthemoney.types import registry
|
4
|
+
|
5
|
+
from nomenklatura.matching.regression_v1.util import tokenize_pair, compare_levenshtein
|
6
|
+
from nomenklatura.matching.compare.util import is_disjoint, has_overlap, extract_numbers
|
7
|
+
from nomenklatura.matching.util import props_pair, type_pair
|
8
|
+
from nomenklatura.matching.util import max_in_sets
|
9
|
+
from nomenklatura.matching.compat import fingerprint_name
|
10
|
+
|
11
|
+
|
12
|
+
def normalize_names(raws: Iterable[str]) -> Set[str]:
    """Fingerprint each raw name and return the distinct results.

    Names for which ``fingerprint_name`` returns None are dropped; the
    remaining fingerprints are capped at 128 characters.
    """
    fingerprints = (fingerprint_name(raw) for raw in raws)
    return {fp[:128] for fp in fingerprints if fp is not None}
|
19
|
+
|
20
|
+
|
21
|
+
def name_levenshtein(left: E, right: E) -> float:
    """Consider the edit distance (as a fraction of name length) between the two most
    similar names linked to both entities."""
    # Names are normalised first; max_in_sets then picks the best
    # compare_levenshtein result over all left/right combinations.
    left_raw, right_raw = type_pair(left, right, registry.name)
    left_names = normalize_names(left_raw)
    right_names = normalize_names(right_raw)
    return max_in_sets(left_names, right_names, compare_levenshtein)
|
27
|
+
|
28
|
+
|
29
|
+
def first_name_match(left: E, right: E) -> float:
    """Matching first/given name between the two entities."""
    # Binary feature on the tokenized "firstName" values of both sides.
    left_tokens, right_tokens = tokenize_pair(props_pair(left, right, ["firstName"]))
    if has_overlap(left_tokens, right_tokens):
        return 1.0
    return 0.0
|
33
|
+
|
34
|
+
|
35
|
+
def family_name_match(left: E, right: E) -> float:
    """Matching family name between the two entities."""
    # Binary feature on the tokenized "lastName" values of both sides.
    left_tokens, right_tokens = tokenize_pair(props_pair(left, right, ["lastName"]))
    if has_overlap(left_tokens, right_tokens):
        return 1.0
    return 0.0
|
39
|
+
|
40
|
+
|
41
|
+
def name_match(left: E, right: E) -> float:
    """Check for exact name matches between the two entities."""
    # The value is not a flag: it is the character length of the longest
    # normalised name both sides share, or 0.0 when none is shared.
    left_raw, right_raw = type_pair(left, right, registry.name)
    shared = normalize_names(left_raw).intersection(normalize_names(right_raw))
    longest = max((len(name) for name in shared), default=0)
    return float(longest)
|
50
|
+
|
51
|
+
|
52
|
+
def name_token_overlap(left: E, right: E) -> float:
    """Evaluate the proportion of identical words in each name."""
    # Shared tokens divided by the token count of the smaller side; the
    # divisor is never below 2, so single-token names top out at 0.5.
    left_tokens, right_tokens = tokenize_pair(type_pair(left, right, registry.name))
    shared = left_tokens.intersection(right_tokens)
    smaller = min(len(left_tokens), len(right_tokens))
    return float(len(shared)) / float(max(2.0, smaller))
|
58
|
+
|
59
|
+
|
60
|
+
def name_numbers(left: E, right: E) -> float:
    """Find if names contain numbers, score if the numbers are different."""
    # Binary feature: is_disjoint decides on the numbers extracted from the
    # names of each side.
    left_names, right_names = type_pair(left, right, registry.name)
    left_numbers = extract_numbers(left_names)
    right_numbers = extract_numbers(right_names)
    if is_disjoint(left_numbers, right_numbers):
        return 1.0
    return 0.0
|
@@ -0,0 +1,87 @@
|
|
1
|
+
import logging
|
2
|
+
import numpy as np
|
3
|
+
import multiprocessing
|
4
|
+
from typing import Iterable, List, Tuple
|
5
|
+
from pprint import pprint
|
6
|
+
from numpy.typing import NDArray
|
7
|
+
from sklearn.pipeline import make_pipeline # type: ignore
|
8
|
+
from sklearn.preprocessing import StandardScaler # type: ignore
|
9
|
+
from sklearn.model_selection import train_test_split # type: ignore
|
10
|
+
from sklearn.linear_model import LogisticRegression # type: ignore
|
11
|
+
from sklearn import metrics # type: ignore
|
12
|
+
from concurrent.futures import ThreadPoolExecutor
|
13
|
+
from followthemoney.util import PathLike
|
14
|
+
|
15
|
+
from nomenklatura.judgement import Judgement
|
16
|
+
from nomenklatura.matching.pairs import read_pairs, JudgedPair
|
17
|
+
from nomenklatura.matching.regression_v1.model import RegressionV1
|
18
|
+
|
19
|
+
log = logging.getLogger(__name__)
|
20
|
+
|
21
|
+
|
22
|
+
def pair_convert(pair: JudgedPair) -> Tuple[List[float], int]:
    """Turn one judged pair into a (feature vector, target) training row.

    The target is 1 for a positive judgement and 0 for anything else.
    """
    target = int(pair.judgement == Judgement.POSITIVE)
    return RegressionV1.encode_pair(pair.left, pair.right), target
|
27
|
+
|
28
|
+
|
29
|
+
def pairs_to_arrays(
    pairs: Iterable[JudgedPair],
) -> Tuple[NDArray[np.float32], NDArray[np.float32]]:
    """Compute the feature rows and targets for all pairs on a thread pool.

    The pool has one worker per CPU reported by ``multiprocessing``. Rows are
    collected in input order, and progress is logged every 10000 pairs.
    """
    workers = multiprocessing.cpu_count()
    log.info("Compute threads: %d", workers)
    feature_rows = []
    targets = []
    with ThreadPoolExecutor(max_workers=workers) as pool:
        converted = pool.map(pair_convert, pairs)
        for count, (features, target) in enumerate(converted):
            if count > 0 and count % 10000 == 0:
                log.info("Computing features: %s....", count)
            feature_rows.append(features)
            targets.append(target)

    return np.array(feature_rows), np.array(targets)
|
46
|
+
|
47
|
+
|
48
|
+
def train_matcher(pairs_file: PathLike) -> None:
    """Train the regression-v1 model on a file of judged pairs and store it.

    Steps: load the pairs, encode them as feature rows, hold out 30% for
    testing, fit a scaler + L2 logistic regression pipeline, save it through
    ``RegressionV1.save`` and print evaluation metrics for the held-out rows.
    """
    pairs = []
    for pair in read_pairs(pairs_file):
        # HACK: support more eventually:
        # if not pair.left.schema.is_a("LegalEntity"):
        #     continue
        # Unsure judgements are trained as negatives.
        if pair.judgement == Judgement.UNSURE:
            pair.judgement = Judgement.NEGATIVE
        pairs.append(pair)
    positive = sum(1 for p in pairs if p.judgement == Judgement.POSITIVE)
    negative = sum(1 for p in pairs if p.judgement == Judgement.NEGATIVE)
    log.info("Total pairs loaded: %d (%d pos/%d neg)", len(pairs), positive, negative)

    X, y = pairs_to_arrays(pairs)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

    # Alternatives that were tried before:
    # logreg = LogisticRegression(class_weight={0: 95, 1: 1})
    # logreg = LogisticRegression(penalty="l1", solver="liblinear")
    logreg = LogisticRegression(penalty="l2")
    log.info("Training model...")
    pipe = make_pipeline(StandardScaler(), logreg)
    pipe.fit(X_train, y_train)

    # Store the coefficients under the feature function names, in feature
    # order; RegressionV1.load compares exactly these names.
    feature_names = (feature.__name__ for feature in RegressionV1.FEATURES)
    coefficients = dict(zip(feature_names, logreg.coef_[0]))
    RegressionV1.save(pipe, coefficients)
    print("Written to: %s" % RegressionV1.MODEL_PATH.as_posix())
    print("Coefficients:")
    pprint(coefficients)

    y_pred = pipe.predict(X_test)
    print("Confusion matrix:\n", metrics.confusion_matrix(y_test, y_pred))
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
    print("Precision:", metrics.precision_score(y_test, y_pred))
    print("Recall:", metrics.recall_score(y_test, y_pred))

    # Probability of the second class, used for the ROC AUC.
    y_pred_proba = pipe.predict_proba(X_test)[:, 1]
    print("Area under curve:", metrics.roc_auc_score(y_test, y_pred_proba))
|
@@ -0,0 +1,31 @@
|
|
1
|
+
from normality.constants import WS
|
2
|
+
from typing import Iterable, Set, Tuple
|
3
|
+
from rigour.text.distance import levenshtein
|
4
|
+
|
5
|
+
from nomenklatura.matching.compat import clean_name_ascii
|
6
|
+
|
7
|
+
|
8
|
+
def tokenize(texts: Iterable[str]) -> Set[str]:
    """Collect the distinct tokens of all given texts.

    Each text is cleaned with ``clean_name_ascii`` (texts that clean to None
    are skipped), split on ``WS`` and stripped. Only tokens longer than two
    characters are kept.
    """
    tokens: Set[str] = set()
    for text in texts:
        cleaned = clean_name_ascii(text)
        if cleaned is not None:
            stripped = (piece.strip() for piece in cleaned.split(WS))
            tokens.update(piece for piece in stripped if len(piece) > 2)
    return tokens
|
19
|
+
|
20
|
+
|
21
|
+
def tokenize_pair(
    pair: Tuple[Iterable[str], Iterable[str]],
) -> Tuple[Set[str], Set[str]]:
    """Tokenize both members of a pair of text collections, keeping their order."""
    first, second = pair[0], pair[1]
    return tokenize(first), tokenize(second)
|
25
|
+
|
26
|
+
|
27
|
+
def compare_levenshtein(left: str, right: str) -> float:
    """Return a similarity of 1.0 minus the edit distance relative to the
    longer string.

    The divisor is at least 1, so two empty strings do not divide by zero.
    """
    edits = levenshtein(left, right)
    longest = max(1, len(left), len(right))
    # An earlier variant returned math.sqrt(distance) instead.
    return 1.0 - (edits / float(longest))
|