nomenklatura_mpt-4.1.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nomenklatura/__init__.py +11 -0
- nomenklatura/cache.py +194 -0
- nomenklatura/cli.py +260 -0
- nomenklatura/conflicting_match.py +80 -0
- nomenklatura/data/er-unstable.pkl +0 -0
- nomenklatura/data/regression-v1.pkl +0 -0
- nomenklatura/db.py +139 -0
- nomenklatura/delta.py +4 -0
- nomenklatura/enrich/__init__.py +94 -0
- nomenklatura/enrich/aleph.py +141 -0
- nomenklatura/enrich/common.py +219 -0
- nomenklatura/enrich/nominatim.py +72 -0
- nomenklatura/enrich/opencorporates.py +233 -0
- nomenklatura/enrich/openfigi.py +124 -0
- nomenklatura/enrich/permid.py +201 -0
- nomenklatura/enrich/wikidata.py +268 -0
- nomenklatura/enrich/yente.py +116 -0
- nomenklatura/exceptions.py +9 -0
- nomenklatura/index/__init__.py +5 -0
- nomenklatura/index/common.py +24 -0
- nomenklatura/index/entry.py +89 -0
- nomenklatura/index/index.py +170 -0
- nomenklatura/index/tokenizer.py +92 -0
- nomenklatura/judgement.py +21 -0
- nomenklatura/kv.py +40 -0
- nomenklatura/matching/__init__.py +47 -0
- nomenklatura/matching/bench.py +32 -0
- nomenklatura/matching/compare/__init__.py +0 -0
- nomenklatura/matching/compare/addresses.py +71 -0
- nomenklatura/matching/compare/countries.py +15 -0
- nomenklatura/matching/compare/dates.py +83 -0
- nomenklatura/matching/compare/gender.py +15 -0
- nomenklatura/matching/compare/identifiers.py +30 -0
- nomenklatura/matching/compare/names.py +157 -0
- nomenklatura/matching/compare/util.py +51 -0
- nomenklatura/matching/compat.py +66 -0
- nomenklatura/matching/erun/__init__.py +0 -0
- nomenklatura/matching/erun/countries.py +42 -0
- nomenklatura/matching/erun/identifiers.py +64 -0
- nomenklatura/matching/erun/misc.py +71 -0
- nomenklatura/matching/erun/model.py +110 -0
- nomenklatura/matching/erun/names.py +126 -0
- nomenklatura/matching/erun/train.py +135 -0
- nomenklatura/matching/erun/util.py +28 -0
- nomenklatura/matching/logic_v1/__init__.py +0 -0
- nomenklatura/matching/logic_v1/identifiers.py +104 -0
- nomenklatura/matching/logic_v1/model.py +76 -0
- nomenklatura/matching/logic_v1/multi.py +21 -0
- nomenklatura/matching/logic_v1/phonetic.py +142 -0
- nomenklatura/matching/logic_v2/__init__.py +0 -0
- nomenklatura/matching/logic_v2/identifiers.py +124 -0
- nomenklatura/matching/logic_v2/model.py +98 -0
- nomenklatura/matching/logic_v2/names/__init__.py +3 -0
- nomenklatura/matching/logic_v2/names/analysis.py +51 -0
- nomenklatura/matching/logic_v2/names/distance.py +181 -0
- nomenklatura/matching/logic_v2/names/magic.py +60 -0
- nomenklatura/matching/logic_v2/names/match.py +195 -0
- nomenklatura/matching/logic_v2/names/pairing.py +81 -0
- nomenklatura/matching/logic_v2/names/util.py +89 -0
- nomenklatura/matching/name_based/__init__.py +4 -0
- nomenklatura/matching/name_based/misc.py +86 -0
- nomenklatura/matching/name_based/model.py +59 -0
- nomenklatura/matching/name_based/names.py +59 -0
- nomenklatura/matching/pairs.py +42 -0
- nomenklatura/matching/regression_v1/__init__.py +0 -0
- nomenklatura/matching/regression_v1/misc.py +75 -0
- nomenklatura/matching/regression_v1/model.py +110 -0
- nomenklatura/matching/regression_v1/names.py +63 -0
- nomenklatura/matching/regression_v1/train.py +87 -0
- nomenklatura/matching/regression_v1/util.py +31 -0
- nomenklatura/matching/svm_v1/__init__.py +5 -0
- nomenklatura/matching/svm_v1/misc.py +94 -0
- nomenklatura/matching/svm_v1/model.py +168 -0
- nomenklatura/matching/svm_v1/names.py +81 -0
- nomenklatura/matching/svm_v1/train.py +186 -0
- nomenklatura/matching/svm_v1/util.py +30 -0
- nomenklatura/matching/types.py +227 -0
- nomenklatura/matching/util.py +62 -0
- nomenklatura/publish/__init__.py +0 -0
- nomenklatura/publish/dates.py +49 -0
- nomenklatura/publish/edges.py +32 -0
- nomenklatura/py.typed +0 -0
- nomenklatura/resolver/__init__.py +6 -0
- nomenklatura/resolver/common.py +2 -0
- nomenklatura/resolver/edge.py +107 -0
- nomenklatura/resolver/identifier.py +60 -0
- nomenklatura/resolver/linker.py +101 -0
- nomenklatura/resolver/resolver.py +565 -0
- nomenklatura/settings.py +17 -0
- nomenklatura/store/__init__.py +41 -0
- nomenklatura/store/base.py +130 -0
- nomenklatura/store/level.py +272 -0
- nomenklatura/store/memory.py +102 -0
- nomenklatura/store/redis_.py +131 -0
- nomenklatura/store/sql.py +219 -0
- nomenklatura/store/util.py +48 -0
- nomenklatura/store/versioned.py +371 -0
- nomenklatura/tui/__init__.py +17 -0
- nomenklatura/tui/app.py +294 -0
- nomenklatura/tui/app.tcss +52 -0
- nomenklatura/tui/comparison.py +81 -0
- nomenklatura/tui/util.py +35 -0
- nomenklatura/util.py +26 -0
- nomenklatura/versions.py +119 -0
- nomenklatura/wikidata/__init__.py +14 -0
- nomenklatura/wikidata/client.py +122 -0
- nomenklatura/wikidata/lang.py +94 -0
- nomenklatura/wikidata/model.py +139 -0
- nomenklatura/wikidata/props.py +70 -0
- nomenklatura/wikidata/qualified.py +49 -0
- nomenklatura/wikidata/query.py +66 -0
- nomenklatura/wikidata/value.py +87 -0
- nomenklatura/xref.py +125 -0
- nomenklatura_mpt-4.1.9.dist-info/METADATA +159 -0
- nomenklatura_mpt-4.1.9.dist-info/RECORD +118 -0
- nomenklatura_mpt-4.1.9.dist-info/WHEEL +4 -0
- nomenklatura_mpt-4.1.9.dist-info/entry_points.txt +3 -0
- nomenklatura_mpt-4.1.9.dist-info/licenses/LICENSE +21 -0
nomenklatura/matching/compat.py
@@ -0,0 +1,66 @@
# This module buffers out some of the fingerprints package in anticipation of
# a future removal of the package. All of the functionality is now contained in
# rigour, but the different functioning of both could lead to unexpected results.
# This module is a temporary solution to allow for a smooth transition.
import logging
from typing import Iterable, List, Optional
from functools import lru_cache
from normality import squash_spaces
from normality.constants import WS
from rigour.names import remove_person_prefixes
from fingerprints.cleanup import clean_name_ascii, clean_name_light
from fingerprints.types import replace_types

log = logging.getLogger(__name__)

__all__ = [
    "fingerprint_name",
    "clean_name_ascii",
    "clean_name_light",
    "names_word_list",
    "name_words",
]


@lru_cache(maxsize=1024)
def fingerprint_name(original: str) -> Optional[str]:
    """Fingerprint a legal entity name."""
    # this needs to happen before the replacements
    text = original.lower()
    text = remove_person_prefixes(text)
    # Super hard-core string scrubbing
    cleaned = clean_name_ascii(text)
    if cleaned is None:
        return None
    cleaned = replace_types(cleaned)
    cleaned = squash_spaces(cleaned)
    if len(cleaned) < 1:
        return None
    return cleaned


def names_word_list(
    names: Iterable[str],
    min_length: int = 1,
) -> List[str]:
    """Get a list of tokens present in the given set of names."""
    words: List[str] = []
    for name in names:
        normalized = fingerprint_name(name)
        if normalized is None:
            continue
        for word in normalized.split(WS):
            if len(word) >= min_length:
                words.append(word)
    return words


def name_words(name: Optional[str], min_length: int = 1) -> List[str]:
    """Get a list of tokens present in the given name."""
    if name is None:
        return []
    words: List[str] = []
    for word in name.split(WS):
        if len(word) >= min_length:
            words.append(word)
    return words
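For orientation, a minimal usage sketch of the shim above (the import path nomenklatura.matching.compat is inferred from the file list; the exact fingerprint output depends on the fingerprints type tables, so it is indicative, not guaranteed):

from nomenklatura.matching.compat import fingerprint_name, name_words

# Lowercases, strips person prefixes and diacritics, replaces company-type
# words ("Aktiengesellschaft" -> "ag") and squashes repeated whitespace.
print(fingerprint_name("Siemens  Aktiengesellschaft"))  # likely: "siemens ag"
# name_words() just splits an already-normalized name on whitespace:
print(name_words("jane m doe", min_length=2))  # ['jane', 'doe']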
nomenklatura/matching/erun/__init__.py
File without changes
nomenklatura/matching/erun/countries.py
@@ -0,0 +1,42 @@
from followthemoney import registry, E

from nomenklatura.matching.util import type_pair, has_schema


# def obj_country(left: E, right: E) -> float:
#     """Check if two entities share a country."""
#     if has_schema(left, right, "LegalEntity"):
#         return 0.0
#     lv, rv = type_pair(left, right, registry.country)
#     if len(lv) == 0 or len(rv) == 0:
#         return 0.0
#     common = len(set(lv).intersection(rv))
#     return 1.0 if common > 0 else -1.0
#     # if common == 0:
#     #     return -1.0
#     # total = len(lv) + len(rv)
#     # return float(common) / total


def org_obj_country_match(left: E, right: E) -> float:
    """Check if two entities share a country."""
    if has_schema(left, right, "LegalEntity") and not has_schema(
        left, right, "Organization"
    ):
        return 0.0
    lv, rv = type_pair(left, right, registry.country)
    if len(lv) == 0 or len(rv) == 0:
        return 0.0
    common = len(set(lv).intersection(rv))
    return 1.0 if common > 0 else -1.0


def per_country_mismatch(left: E, right: E) -> float:
    """Both persons are linked to different countries."""
    if not has_schema(left, right, "Person"):
        return 0.0
    qv, rv = type_pair(left, right, registry.country)
    if len(qv) == 0 or len(rv) == 0:
        return 0.0
    overlap = len(set(qv).intersection(rv))
    return 1.0 if overlap == 0 else -0.2
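These scores are signed signals rather than probabilities: 0.0 means "no evidence", positive values support a match, negative values contradict it. A hedged sketch of per_country_mismatch, assuming the standard followthemoney model export:

from followthemoney import model
from nomenklatura.matching.erun.countries import per_country_mismatch

query = model.make_entity("Person")
query.add("country", "de")
result = model.make_entity("Person")
result.add("country", "fr")
# Disjoint country sets on two Person entities score 1.0 (a mismatch
# signal the regression weights negatively); any overlap scores -0.2.
assert per_country_mismatch(query, result) == 1.0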
nomenklatura/matching/erun/identifiers.py
@@ -0,0 +1,64 @@
from typing import Set, Tuple
from rigour.ids import get_strong_format_names

from followthemoney import EntityProxy, registry


HONORARY_STRONG = {registry.phone, registry.email, registry.checksum}
STRONG_FORMATS = get_strong_format_names()


def _get_strong_identifiers(entity: EntityProxy) -> Set[Tuple[str, str]]:
    strong_ids: Set[Tuple[str, str]] = set()
    for prop, value in entity.itervalues():
        if not prop.matchable:
            continue
        if prop.format in STRONG_FORMATS:
            strong_ids.add((prop.format, value))
        elif prop.type in HONORARY_STRONG:
            strong_ids.add((prop.name, value))
    return strong_ids


def _get_weak_identifiers(entity: EntityProxy) -> Set[str]:
    weak_ids: Set[str] = set()
    for prop, value in entity.itervalues():
        # Only matchable, identifier-typed values without a strong format
        # count as weak identifiers:
        if not prop.matchable or prop.type != registry.identifier:
            continue
        if prop.format in STRONG_FORMATS:
            continue
        weak_ids.add(value)
    return weak_ids


def strong_identifier_match(left: EntityProxy, right: EntityProxy) -> float:
    """Check if two entities share any strong identifiers."""
    left_strong = _get_strong_identifiers(left)
    right_strong = _get_strong_identifiers(right)
    if len(left_strong) == 0 or len(right_strong) == 0:
        return 0.0
    if left_strong.intersection(right_strong):
        return 1.0
    left_nofmt = {v for _, v in left_strong}
    right_nofmt = {v for _, v in right_strong}
    if left_nofmt.intersection(_get_weak_identifiers(right)):
        return 0.7
    if right_nofmt.intersection(_get_weak_identifiers(left)):
        return 0.7
    left_fmts = {f for f, _ in left_strong}
    right_fmts = {f for f, _ in right_strong}
    common_fmts = left_fmts.intersection(right_fmts)
    return -0.2 * len(common_fmts)


def weak_identifier_match(left: EntityProxy, right: EntityProxy) -> float:
    """Check if two entities share any weak identifiers."""
    left_ids = _get_weak_identifiers(left)
    right_ids = _get_weak_identifiers(right)
    if left_ids.intersection(right_ids):
        return 1.0
    # left_formats = {fmt for fmt, _ in left_ids}
    # right_formats = {fmt for fmt, _ in right_ids}
    # if left_formats.intersection(right_formats):
    #     return -0.5
    return 0.0
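The tiers: 1.0 for an exact match on a strong identifier, 0.7 when a strong value on one side shows up among the other side's weak identifier values, and -0.2 for each format both sides carry with conflicting values. A sketch under the assumption that leiCode maps to a strong "lei" format in rigour (the LEI shown is Apple Inc.'s public one):

from followthemoney import model
from nomenklatura.matching.erun.identifiers import strong_identifier_match

a = model.make_entity("Company")
a.add("leiCode", "HWUPKR0MPOU8FGXBT394")
b = model.make_entity("Company")
b.add("leiCode", "HWUPKR0MPOU8FGXBT394")
# Same value in the same strong format on both sides: full match.
assert strong_identifier_match(a, b) == 1.0
# Had both entities carried an LEI with *different* values, the one
# shared-but-conflicting format would instead contribute -0.2.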
nomenklatura/matching/erun/misc.py
@@ -0,0 +1,71 @@
from typing import List, Optional, Set
from followthemoney import registry, E

from nomenklatura.matching.compare.util import extract_numbers
from nomenklatura.matching.util import type_pair
from nomenklatura.matching.util import has_schema

from rigour.addresses import normalize_address, shorten_address_keywords

OTHER = registry.gender.OTHER


def _norm_address(addr: str, latinize: bool = True) -> Optional[str]:
    norm_addr = normalize_address(addr, latinize=latinize, min_length=4)
    if norm_addr is not None:
        norm_addr = shorten_address_keywords(norm_addr, latinize=latinize)
    return norm_addr


def _norm_place(places: List[str]) -> Set[str]:
    parts = set()
    for place in places:
        norm_place = _norm_address(place)
        if norm_place is not None:
            for part in norm_place.split(" "):
                parts.add(part)
    return parts


def birth_place(query: E, result: E) -> float:
    """Same place of birth."""
    if not has_schema(query, result, "Person"):
        return 0.0
    lparts = _norm_place(query.get("birthPlace", quiet=True))
    rparts = _norm_place(result.get("birthPlace", quiet=True))
    overlap = len(lparts.intersection(rparts))
    base_length = max(1.0, min(len(lparts), len(rparts)))
    return overlap / base_length


def address_match(query: E, result: E) -> float:
    """Text similarity between addresses."""
    lv, rv = type_pair(query, result, registry.address)
    lvn = _norm_place(lv)
    rvn = _norm_place(rv)
    if len(lvn) == 0 or len(rvn) == 0:
        return 0.0
    overlap = len(lvn.intersection(rvn))
    tokens = max(1.0, min(len(lvn), len(rvn)))
    if overlap == 0:
        return 0.0
    return float(overlap) / float(tokens)


def address_numbers(query: E, result: E) -> float:
    """Find if names contain numbers, score if the numbers are different."""
    lv, rv = type_pair(query, result, registry.address)
    lvn = extract_numbers(lv)
    rvn = extract_numbers(rv)
    common = len(lvn.intersection(rvn))
    disjoint = len(lvn.difference(rvn))
    return common - disjoint


def gender_mismatch(query: E, result: E) -> float:
    """Both entities have a different gender associated with them."""
    qv = {v for v in query.get("gender", quiet=True) if v != OTHER}
    rv = {v for v in result.get("gender", quiet=True) if v != OTHER}
    if len(qv) == 1 and len(rv) == 1 and len(qv.intersection(rv)) == 0:
        return 1.0
    return 0.0
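Note that gender_mismatch only fires when each side carries exactly one gender that is not "other"; ambiguous or missing values score 0.0. A quick sketch:

from followthemoney import model
from nomenklatura.matching.erun.misc import gender_mismatch

q = model.make_entity("Person")
q.add("gender", "male")
r = model.make_entity("Person")
r.add("gender", "female")
# One unambiguous gender on each side, and they differ: 1.0.
assert gender_mismatch(q, r) == 1.0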
nomenklatura/matching/erun/model.py
@@ -0,0 +1,110 @@
import pickle
import numpy as np
from typing import List, Dict, Tuple, cast
from functools import cache
from sklearn.pipeline import Pipeline  # type: ignore
from followthemoney import E

from nomenklatura.matching.erun.names import name_levenshtein, family_name_match
from nomenklatura.matching.erun.names import name_token_overlap, name_numbers
from nomenklatura.matching.erun.names import obj_name_levenshtein
from nomenklatura.matching.erun.misc import address_match, address_numbers
from nomenklatura.matching.erun.misc import birth_place
from nomenklatura.matching.erun.misc import gender_mismatch
from nomenklatura.matching.erun.countries import (
    org_obj_country_match,
    per_country_mismatch,
)
from nomenklatura.matching.erun.identifiers import strong_identifier_match
from nomenklatura.matching.erun.identifiers import weak_identifier_match
from nomenklatura.matching.compare.dates import dob_matches, dob_year_matches
from nomenklatura.matching.compare.dates import dob_year_disjoint
from nomenklatura.matching.types import (
    FeatureDocs,
    FeatureDoc,
    MatchingResult,
    ScoringConfig,
)
from nomenklatura.matching.types import CompareFunction, FtResult
from nomenklatura.matching.types import Encoded, ScoringAlgorithm
from nomenklatura.matching.util import make_github_url
from nomenklatura.util import DATA_PATH


class EntityResolveRegression(ScoringAlgorithm):
    """Entity resolution matcher. Do not use this in (regulated) screening scenarios."""

    NAME = "er-unstable"
    MODEL_PATH = DATA_PATH.joinpath(f"{NAME}.pkl")
    FEATURES: List[CompareFunction] = [
        name_token_overlap,
        name_numbers,
        name_levenshtein,
        strong_identifier_match,
        weak_identifier_match,
        dob_matches,
        dob_year_matches,
        FtResult.unwrap(dob_year_disjoint),
        family_name_match,
        birth_place,
        gender_mismatch,
        per_country_mismatch,
        org_obj_country_match,
        obj_name_levenshtein,
        address_match,
        address_numbers,
    ]

    @classmethod
    def save(cls, pipe: Pipeline, coefficients: Dict[str, float]) -> None:
        """Store a classification pipeline after training."""
        mdl = pickle.dumps({"pipe": pipe, "coefficients": coefficients})
        with open(cls.MODEL_PATH, "wb") as fh:
            fh.write(mdl)
        cls.load.cache_clear()

    @classmethod
    @cache
    def load(cls) -> Tuple[Pipeline, Dict[str, float]]:
        """Load a pre-trained classification pipeline for ad-hoc use."""
        with open(cls.MODEL_PATH, "rb") as fh:
            matcher = pickle.loads(fh.read())
        pipe = cast(Pipeline, matcher["pipe"])
        coefficients = cast(Dict[str, float], matcher["coefficients"])
        current = [f.__name__ for f in cls.FEATURES]
        if list(coefficients.keys()) != current:
            raise RuntimeError("Model was not trained on identical features!")
        return pipe, coefficients

    @classmethod
    def get_feature_docs(cls) -> FeatureDocs:
        """Return an explanation of the features and their coefficients."""
        features: FeatureDocs = {}
        _, coefficients = cls.load()
        for func in cls.FEATURES:
            name = func.__name__
            features[name] = FeatureDoc(
                description=func.__doc__,
                coefficient=float(coefficients[name]),
                url=make_github_url(func),
            )
        return features

    @classmethod
    def compare(cls, query: E, result: E, config: ScoringConfig) -> MatchingResult:
        """Use a regression model to compare two entities."""
        pipe, _ = cls.load()
        encoded = cls.encode_pair(query, result)
        npfeat = np.array([encoded])
        pred = pipe.predict_proba(npfeat)
        score = cast(float, pred[0][1])
        explanations: Dict[str, FtResult] = {}
        for feature, coeff in zip(cls.FEATURES, encoded):
            name = feature.__name__
            explanations[name] = FtResult(score=float(coeff), detail=None)
        return MatchingResult.make(score=score, explanations=explanations)

    @classmethod
    def encode_pair(cls, left: E, right: E) -> Encoded:
        """Encode the comparison between two entities as a set of feature values."""
        return [f(left, right) for f in cls.FEATURES]
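The matcher reduces a pair of entities to one float per FEATURES entry and feeds that vector to the pickled logistic-regression pipeline. A rough sketch of the encoding step (hypothetical entities; compare() additionally needs the bundled er-unstable.pkl and a ScoringConfig):

from followthemoney import model
from nomenklatura.matching.erun.model import EntityResolveRegression

q = model.make_entity("Person")
q.add("name", "Arkadiy Rotenberg")
q.add("birthDate", "1951-12-15")
r = model.make_entity("Person")
r.add("name", "Arkady Rotenberg")
r.add("birthDate", "1951")
# One float per feature function, in FEATURES order; this is the exact
# vector that compare() hands to pipe.predict_proba().
vector = EntityResolveRegression.encode_pair(q, r)
assert len(vector) == len(EntityResolveRegression.FEATURES)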
nomenklatura/matching/erun/names.py
@@ -0,0 +1,126 @@
from functools import lru_cache
from typing import Set
from followthemoney import EntityProxy, registry, E
from followthemoney.names import schema_type_tag
from rigour.text.distance import levenshtein_similarity
from rigour.names import Name, NameTypeTag
from rigour.names import is_stopword
from rigour.names import remove_org_prefixes, remove_obj_prefixes
from rigour.names import remove_person_prefixes
from rigour.names import replace_org_types_compare

from nomenklatura.matching.erun.util import compare_levenshtein
from nomenklatura.matching.util import max_in_sets, has_schema
from nomenklatura.util import unroll


@lru_cache(maxsize=512)
def _entity_names(entity: EntityProxy) -> Set[Name]:
    names: Set[Name] = set()
    tag = schema_type_tag(entity.schema)
    for string in entity.get_type_values(registry.name, matchable=True):
        if tag in (NameTypeTag.ORG, NameTypeTag.ENT):
            string = replace_org_types_compare(string)
            string = remove_org_prefixes(string)
        elif tag == NameTypeTag.PER:
            string = remove_person_prefixes(string)
        else:
            string = remove_obj_prefixes(string)
        n = Name(string, tag=tag)
        names.add(n)
    return names


def name_levenshtein(left: E, right: E) -> float:
    """Consider the edit distance (as a fraction of name length) between the two most
    similar names linked to both entities."""
    if not has_schema(left, right, "LegalEntity"):
        return 0.0
    if has_schema(left, right, "Person"):
        left_names: Set[str] = set()
        for name in _entity_names(left):
            left_names.add(" ".join(sorted(part.comparable for part in name.parts)))
            left_names.add(name.comparable)
        right_names: Set[str] = set()
        for name in _entity_names(right):
            right_names.add(" ".join(sorted(part.comparable for part in name.parts)))
            right_names.add(name.comparable)
    else:
        left_names = {n.comparable for n in _entity_names(left)}
        right_names = {n.comparable for n in _entity_names(right)}
    return max_in_sets(left_names, right_names, compare_levenshtein)


def _entity_lastnames(entity: EntityProxy) -> Set[str]:
    names: Set[str] = set()
    for string in entity.get("lastName", quiet=True):
        n = Name(string, tag=NameTypeTag.PER)
        for part in n.parts:
            if len(part.comparable) > 2 and not is_stopword(part.form):
                names.add(part.comparable)
    return names


def family_name_match(left: E, right: E) -> float:
    """Matching family name between the two entities."""
    if not has_schema(left, right, "Person"):
        return 0.0
    lnames = _entity_lastnames(left)
    rnames = _entity_lastnames(right)
    if len(lnames) == 0 or len(rnames) == 0:
        return 0.0
    overlap = lnames.intersection(rnames)
    return -1.0 if len(overlap) == 0 else 1.0


def _name_tokens(entity: EntityProxy) -> Set[str]:
    tokens: Set[str] = set()
    for name in _entity_names(entity):
        for part in name.parts:
            cmp = part.comparable
            if len(cmp) > 2 and not is_stopword(part.form):
                tokens.add(cmp)
    return tokens


def name_token_overlap(left: E, right: E) -> float:
    """Evaluate the proportion of identical words in each name."""
    left_tokens = _name_tokens(left)
    right_tokens = _name_tokens(right)
    common = left_tokens.intersection(right_tokens)
    tokens = min(len(left_tokens), len(right_tokens))
    return float(len(common)) / float(max(2.0, tokens))


def name_numbers(left: E, right: E) -> float:
    """Find if names contain numbers, score if the numbers are different."""
    left_names = [n.parts for n in _entity_names(left)]
    right_names = [n.parts for n in _entity_names(right)]
    left_numbers = {p.comparable for p in unroll(left_names) if p.numeric}
    right_numbers = {p.comparable for p in unroll(right_names) if p.numeric}
    total = len(left_numbers) + len(right_numbers)
    if total == 0:
        return 0.0
    common = len(left_numbers.intersection(right_numbers))
    if common == 0 and len(left_numbers) > 0 and len(right_numbers) > 0:
        # If both names contain numbers, but they are different, this is a strong
        # signal that the names are not the same.
        return -1.0
    return common / float(total)


def _compare_strict_levenshtein(left: str, right: str) -> float:
    """A stricter version of levenshtein that returns 0.0 if the names are too
    different in length."""
    max_edits = min(2, max(len(left), len(right)) // 4)
    score = levenshtein_similarity(left, right, max_edits=max_edits)
    return score**2


def obj_name_levenshtein(left: E, right: E) -> float:
    """Very strict name comparison on object (Vessel, RealEstate, Security) names."""
    if has_schema(left, right, "LegalEntity"):
        return 0.0
    left_names = {n.comparable for n in _entity_names(left)}
    right_names = {n.comparable for n in _entity_names(right)}
    return max_in_sets(left_names, right_names, _compare_strict_levenshtein)
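A hedged illustration of the number-disagreement rule in name_numbers, using made-up vessel names:

from followthemoney import model
from nomenklatura.matching.erun.names import name_numbers

a = model.make_entity("Vessel")
a.add("name", "Ocean Trader 7")
b = model.make_entity("Vessel")
b.add("name", "Ocean Trader 9")
# Both names carry a number, but the numbers disagree: a strong
# negative signal, regardless of how similar the words are.
assert name_numbers(a, b) == -1.0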
@@ -0,0 +1,135 @@
|
|
1
|
+
import logging
|
2
|
+
import multiprocessing
|
3
|
+
import random
|
4
|
+
from concurrent.futures import ProcessPoolExecutor
|
5
|
+
from pprint import pprint
|
6
|
+
from typing import Iterable, List, Tuple
|
7
|
+
|
8
|
+
import numpy as np
|
9
|
+
from followthemoney import registry, EntityProxy
|
10
|
+
from followthemoney.util import PathLike
|
11
|
+
from numpy.typing import NDArray
|
12
|
+
from sklearn import metrics # type: ignore
|
13
|
+
from sklearn.linear_model import LogisticRegression # type: ignore
|
14
|
+
from sklearn.model_selection import train_test_split # type: ignore
|
15
|
+
from sklearn.pipeline import make_pipeline # type: ignore
|
16
|
+
from sklearn.preprocessing import StandardScaler # type: ignore
|
17
|
+
|
18
|
+
from nomenklatura.judgement import Judgement
|
19
|
+
from nomenklatura.matching.erun.model import EntityResolveRegression
|
20
|
+
from nomenklatura.matching.pairs import JudgedPair, read_pairs
|
21
|
+
|
22
|
+
log = logging.getLogger(__name__)
|
23
|
+
|
24
|
+
|
25
|
+
def pair_convert(pair: JudgedPair) -> Tuple[List[float], int]:
|
26
|
+
"""Encode a pair of training data into features and target."""
|
27
|
+
judgement = 1 if pair.judgement == Judgement.POSITIVE else 0
|
28
|
+
features = EntityResolveRegression.encode_pair(pair.left, pair.right)
|
29
|
+
return features, judgement
|
30
|
+
|
31
|
+
|
32
|
+
def pairs_to_arrays(
|
33
|
+
pairs: Iterable[JudgedPair],
|
34
|
+
) -> Tuple[NDArray[np.float32], NDArray[np.float32]]:
|
35
|
+
"""Parallelize feature computation for training data"""
|
36
|
+
xrows = []
|
37
|
+
yrows = []
|
38
|
+
threads = multiprocessing.cpu_count()
|
39
|
+
log.info("Compute threads: %d", threads)
|
40
|
+
with ProcessPoolExecutor(max_workers=threads) as executor:
|
41
|
+
results = executor.map(pair_convert, pairs, chunksize=1000)
|
42
|
+
for idx, (x, y) in enumerate(results):
|
43
|
+
if idx > 0 and idx % 10000 == 0:
|
44
|
+
log.info("Computing features: %s....", idx)
|
45
|
+
xrows.append(x)
|
46
|
+
yrows.append(y)
|
47
|
+
|
48
|
+
return np.array(xrows), np.array(yrows)
|
49
|
+
|
50
|
+
|
51
|
+
def _entity_weight(entity: EntityProxy) -> float:
|
52
|
+
"""This weights up entities with more matchable properties, to push down the
|
53
|
+
value of name-only matches."""
|
54
|
+
weight = 0.0
|
55
|
+
# types = set()
|
56
|
+
for prop, _ in entity.itervalues():
|
57
|
+
if prop.matchable:
|
58
|
+
inc_weight = 0.2 if prop.type == registry.name else 1.0
|
59
|
+
weight += inc_weight
|
60
|
+
# types.add(prop.type)
|
61
|
+
# if entity.schema.is_a("LegalEntity") and types == {registry.name}:
|
62
|
+
# weight = weight * 0.5
|
63
|
+
return weight
|
64
|
+
|
65
|
+
|
66
|
+
def weighted_pair_sort(pairs: List[JudgedPair]) -> List[JudgedPair]:
|
67
|
+
for pair in pairs:
|
68
|
+
left_weight = _entity_weight(pair.left)
|
69
|
+
right_weight = _entity_weight(pair.right)
|
70
|
+
# pair.weight = (left_weight + right_weight) / 2.0
|
71
|
+
pair.weight = min(left_weight, right_weight)
|
72
|
+
return sorted(pairs, key=lambda p: -p.weight)
|
73
|
+
|
74
|
+
|
75
|
+
def build_dataset(
|
76
|
+
pairs_file: PathLike,
|
77
|
+
) -> Tuple[NDArray[np.float32], NDArray[np.float32]]:
|
78
|
+
"""Load and balance a dataset from a JSON file."""
|
79
|
+
pairs = []
|
80
|
+
for pair in read_pairs(pairs_file):
|
81
|
+
if not pair.left.schema.matchable or not pair.right.schema.matchable:
|
82
|
+
continue
|
83
|
+
if pair.left.schema.is_a("Address") or pair.right.schema.is_a("Address"):
|
84
|
+
continue
|
85
|
+
if pair.judgement == Judgement.UNSURE:
|
86
|
+
pair.judgement = Judgement.NEGATIVE
|
87
|
+
pairs.append(pair)
|
88
|
+
positive = [p for p in pairs if p.judgement == Judgement.POSITIVE]
|
89
|
+
negative = [p for p in pairs if p.judgement == Judgement.NEGATIVE]
|
90
|
+
log.info(
|
91
|
+
"Total pairs loaded: %d (%d pos/%d neg)",
|
92
|
+
len(pairs),
|
93
|
+
len(positive),
|
94
|
+
len(negative),
|
95
|
+
)
|
96
|
+
min_class = min(len(positive), len(negative))
|
97
|
+
log.info("Downsampling to %d per class", min_class)
|
98
|
+
if len(positive) > min_class:
|
99
|
+
positive = weighted_pair_sort(positive)
|
100
|
+
pairs = positive[:min_class] + negative
|
101
|
+
else:
|
102
|
+
negative = weighted_pair_sort(negative)
|
103
|
+
pairs = positive + negative[:min_class]
|
104
|
+
random.shuffle(pairs)
|
105
|
+
log.info("Training pairs after downsampling: %d", len(pairs))
|
106
|
+
return pairs_to_arrays(pairs)
|
107
|
+
|
108
|
+
|
109
|
+
def train_matcher(pairs_file: PathLike) -> None:
|
110
|
+
X, y = build_dataset(pairs_file)
|
111
|
+
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)
|
112
|
+
# logreg = LogisticRegression(class_weight={0: 95, 1: 1})
|
113
|
+
# logreg = LogisticRegression(penalty="l1", solver="liblinear")
|
114
|
+
logreg = LogisticRegression(penalty="l2")
|
115
|
+
log.info("Training model...")
|
116
|
+
pipe = make_pipeline(StandardScaler(), logreg)
|
117
|
+
pipe.fit(X_train, y_train)
|
118
|
+
coef = logreg.coef_[0]
|
119
|
+
coefficients = {
|
120
|
+
n.__name__: c for n, c in zip(EntityResolveRegression.FEATURES, coef)
|
121
|
+
}
|
122
|
+
EntityResolveRegression.save(pipe, coefficients)
|
123
|
+
print("Written to: %s" % EntityResolveRegression.MODEL_PATH.as_posix())
|
124
|
+
print("Coefficients:")
|
125
|
+
pprint(coefficients)
|
126
|
+
y_pred = pipe.predict(X_test)
|
127
|
+
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
|
128
|
+
print("Confusion matrix:\n", cnf_matrix)
|
129
|
+
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
|
130
|
+
print("Precision:", metrics.precision_score(y_test, y_pred))
|
131
|
+
print("Recall:", metrics.recall_score(y_test, y_pred))
|
132
|
+
|
133
|
+
y_pred_proba = pipe.predict_proba(X_test)[::, 1]
|
134
|
+
auc = metrics.roc_auc_score(y_test, y_pred_proba)
|
135
|
+
print("Area under curve:", auc)
|
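A training run is a plain function call over a file of judged pairs (the path below is illustrative):

import logging
from nomenklatura.matching.erun.train import train_matcher

logging.basicConfig(level=logging.INFO)
# Reads judged entity pairs via nomenklatura.matching.pairs.read_pairs,
# balances the classes, fits the pipeline, and writes it to
# nomenklatura/data/er-unstable.pkl via EntityResolveRegression.save().
train_matcher("pairs.json")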
nomenklatura/matching/erun/util.py
@@ -0,0 +1,28 @@
from normality import ascii_text
from typing import Iterable, Set, Tuple
from rigour.text.distance import levenshtein
from rigour.names import tokenize_name


def tokenize(texts: Iterable[str]) -> Set[str]:
    tokens: Set[str] = set()
    for text in texts:
        text = text.casefold()
        for token in tokenize_name(text):
            ascii_token = ascii_text(token)
            if ascii_token is not None and len(ascii_token) > 2:
                tokens.add(ascii_token)
    return tokens


def tokenize_pair(
    pair: Tuple[Iterable[str], Iterable[str]],
) -> Tuple[Set[str], Set[str]]:
    return tokenize(pair[0]), tokenize(pair[1])


def compare_levenshtein(left: str, right: str) -> float:
    distance = levenshtein(left, right)
    base = max((1, len(left), len(right)))
    return 1.0 - (distance / float(base))
    # return math.sqrt(distance)
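The similarity is the edit distance normalized by the longer string, so one edit across five characters scores 0.8. A worked check:

from nomenklatura.matching.erun.util import compare_levenshtein

# levenshtein("acme", "acmes") is 1 edit; base = max(1, 4, 5) = 5,
# so the similarity is 1.0 - 1/5 = 0.8.
assert abs(compare_levenshtein("acme", "acmes") - 0.8) < 1e-9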
nomenklatura/matching/logic_v1/__init__.py
File without changes