nomenklatura_mpt-4.1.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. nomenklatura/__init__.py +11 -0
  2. nomenklatura/cache.py +194 -0
  3. nomenklatura/cli.py +260 -0
  4. nomenklatura/conflicting_match.py +80 -0
  5. nomenklatura/data/er-unstable.pkl +0 -0
  6. nomenklatura/data/regression-v1.pkl +0 -0
  7. nomenklatura/db.py +139 -0
  8. nomenklatura/delta.py +4 -0
  9. nomenklatura/enrich/__init__.py +94 -0
  10. nomenklatura/enrich/aleph.py +141 -0
  11. nomenklatura/enrich/common.py +219 -0
  12. nomenklatura/enrich/nominatim.py +72 -0
  13. nomenklatura/enrich/opencorporates.py +233 -0
  14. nomenklatura/enrich/openfigi.py +124 -0
  15. nomenklatura/enrich/permid.py +201 -0
  16. nomenklatura/enrich/wikidata.py +268 -0
  17. nomenklatura/enrich/yente.py +116 -0
  18. nomenklatura/exceptions.py +9 -0
  19. nomenklatura/index/__init__.py +5 -0
  20. nomenklatura/index/common.py +24 -0
  21. nomenklatura/index/entry.py +89 -0
  22. nomenklatura/index/index.py +170 -0
  23. nomenklatura/index/tokenizer.py +92 -0
  24. nomenklatura/judgement.py +21 -0
  25. nomenklatura/kv.py +40 -0
  26. nomenklatura/matching/__init__.py +47 -0
  27. nomenklatura/matching/bench.py +32 -0
  28. nomenklatura/matching/compare/__init__.py +0 -0
  29. nomenklatura/matching/compare/addresses.py +71 -0
  30. nomenklatura/matching/compare/countries.py +15 -0
  31. nomenklatura/matching/compare/dates.py +83 -0
  32. nomenklatura/matching/compare/gender.py +15 -0
  33. nomenklatura/matching/compare/identifiers.py +30 -0
  34. nomenklatura/matching/compare/names.py +157 -0
  35. nomenklatura/matching/compare/util.py +51 -0
  36. nomenklatura/matching/compat.py +66 -0
  37. nomenklatura/matching/erun/__init__.py +0 -0
  38. nomenklatura/matching/erun/countries.py +42 -0
  39. nomenklatura/matching/erun/identifiers.py +64 -0
  40. nomenklatura/matching/erun/misc.py +71 -0
  41. nomenklatura/matching/erun/model.py +110 -0
  42. nomenklatura/matching/erun/names.py +126 -0
  43. nomenklatura/matching/erun/train.py +135 -0
  44. nomenklatura/matching/erun/util.py +28 -0
  45. nomenklatura/matching/logic_v1/__init__.py +0 -0
  46. nomenklatura/matching/logic_v1/identifiers.py +104 -0
  47. nomenklatura/matching/logic_v1/model.py +76 -0
  48. nomenklatura/matching/logic_v1/multi.py +21 -0
  49. nomenklatura/matching/logic_v1/phonetic.py +142 -0
  50. nomenklatura/matching/logic_v2/__init__.py +0 -0
  51. nomenklatura/matching/logic_v2/identifiers.py +124 -0
  52. nomenklatura/matching/logic_v2/model.py +98 -0
  53. nomenklatura/matching/logic_v2/names/__init__.py +3 -0
  54. nomenklatura/matching/logic_v2/names/analysis.py +51 -0
  55. nomenklatura/matching/logic_v2/names/distance.py +181 -0
  56. nomenklatura/matching/logic_v2/names/magic.py +60 -0
  57. nomenklatura/matching/logic_v2/names/match.py +195 -0
  58. nomenklatura/matching/logic_v2/names/pairing.py +81 -0
  59. nomenklatura/matching/logic_v2/names/util.py +89 -0
  60. nomenklatura/matching/name_based/__init__.py +4 -0
  61. nomenklatura/matching/name_based/misc.py +86 -0
  62. nomenklatura/matching/name_based/model.py +59 -0
  63. nomenklatura/matching/name_based/names.py +59 -0
  64. nomenklatura/matching/pairs.py +42 -0
  65. nomenklatura/matching/regression_v1/__init__.py +0 -0
  66. nomenklatura/matching/regression_v1/misc.py +75 -0
  67. nomenklatura/matching/regression_v1/model.py +110 -0
  68. nomenklatura/matching/regression_v1/names.py +63 -0
  69. nomenklatura/matching/regression_v1/train.py +87 -0
  70. nomenklatura/matching/regression_v1/util.py +31 -0
  71. nomenklatura/matching/svm_v1/__init__.py +5 -0
  72. nomenklatura/matching/svm_v1/misc.py +94 -0
  73. nomenklatura/matching/svm_v1/model.py +168 -0
  74. nomenklatura/matching/svm_v1/names.py +81 -0
  75. nomenklatura/matching/svm_v1/train.py +186 -0
  76. nomenklatura/matching/svm_v1/util.py +30 -0
  77. nomenklatura/matching/types.py +227 -0
  78. nomenklatura/matching/util.py +62 -0
  79. nomenklatura/publish/__init__.py +0 -0
  80. nomenklatura/publish/dates.py +49 -0
  81. nomenklatura/publish/edges.py +32 -0
  82. nomenklatura/py.typed +0 -0
  83. nomenklatura/resolver/__init__.py +6 -0
  84. nomenklatura/resolver/common.py +2 -0
  85. nomenklatura/resolver/edge.py +107 -0
  86. nomenklatura/resolver/identifier.py +60 -0
  87. nomenklatura/resolver/linker.py +101 -0
  88. nomenklatura/resolver/resolver.py +565 -0
  89. nomenklatura/settings.py +17 -0
  90. nomenklatura/store/__init__.py +41 -0
  91. nomenklatura/store/base.py +130 -0
  92. nomenklatura/store/level.py +272 -0
  93. nomenklatura/store/memory.py +102 -0
  94. nomenklatura/store/redis_.py +131 -0
  95. nomenklatura/store/sql.py +219 -0
  96. nomenklatura/store/util.py +48 -0
  97. nomenklatura/store/versioned.py +371 -0
  98. nomenklatura/tui/__init__.py +17 -0
  99. nomenklatura/tui/app.py +294 -0
  100. nomenklatura/tui/app.tcss +52 -0
  101. nomenklatura/tui/comparison.py +81 -0
  102. nomenklatura/tui/util.py +35 -0
  103. nomenklatura/util.py +26 -0
  104. nomenklatura/versions.py +119 -0
  105. nomenklatura/wikidata/__init__.py +14 -0
  106. nomenklatura/wikidata/client.py +122 -0
  107. nomenklatura/wikidata/lang.py +94 -0
  108. nomenklatura/wikidata/model.py +139 -0
  109. nomenklatura/wikidata/props.py +70 -0
  110. nomenklatura/wikidata/qualified.py +49 -0
  111. nomenklatura/wikidata/query.py +66 -0
  112. nomenklatura/wikidata/value.py +87 -0
  113. nomenklatura/xref.py +125 -0
  114. nomenklatura_mpt-4.1.9.dist-info/METADATA +159 -0
  115. nomenklatura_mpt-4.1.9.dist-info/RECORD +118 -0
  116. nomenklatura_mpt-4.1.9.dist-info/WHEEL +4 -0
  117. nomenklatura_mpt-4.1.9.dist-info/entry_points.txt +3 -0
  118. nomenklatura_mpt-4.1.9.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,66 @@
+ # This module shims parts of the fingerprints package in anticipation of its
+ # future removal. All of the functionality now exists in rigour, but behavioural
+ # differences between the two packages could lead to unexpected results, so this
+ # module acts as a temporary bridge to allow for a smooth transition.
+ import logging
+ from typing import Iterable, List, Optional
+ from functools import lru_cache
+ from normality import squash_spaces
+ from normality.constants import WS
+ from rigour.names import remove_person_prefixes
+ from fingerprints.cleanup import clean_name_ascii, clean_name_light
+ from fingerprints.types import replace_types
+
+ log = logging.getLogger(__name__)
+
+ __all__ = [
+     "fingerprint_name",
+     "clean_name_ascii",
+     "clean_name_light",
+     "names_word_list",
+     "name_words",
+ ]
+
+
+ @lru_cache(maxsize=1024)
+ def fingerprint_name(original: str) -> Optional[str]:
+     """Fingerprint a legal entity name."""
+     # this needs to happen before the replacements
+     text = original.lower()
+     text = remove_person_prefixes(text)
+     # Super hard-core string scrubbing
+     cleaned = clean_name_ascii(text)
+     if cleaned is None:
+         return None
+     cleaned = replace_types(cleaned)
+     cleaned = squash_spaces(cleaned)
+     if len(cleaned) < 1:
+         return None
+     return cleaned
+
+
+ def names_word_list(
+     names: Iterable[str],
+     min_length: int = 1,
+ ) -> List[str]:
+     """Get a list of tokens present in the given set of names."""
+     words: List[str] = []
+     for name in names:
+         normalized = fingerprint_name(name)
+         if normalized is None:
+             continue
+         for word in normalized.split(WS):
+             if len(word) >= min_length:
+                 words.append(word)
+     return words
+
+
+ def name_words(name: Optional[str], min_length: int = 1) -> List[str]:
+     """Get a list of tokens present in the given name."""
+     if name is None:
+         return []
+     words: List[str] = []
+     for word in name.split(WS):
+         if len(word) >= min_length:
+             words.append(word)
+     return words
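A minimal usage sketch for this shim; the exact normalized outputs depend on the fingerprints/rigour cleanup and type-replacement tables, so the values shown in comments are illustrative assumptions:

```python
# Illustrative usage of the compat shim; output values are assumptions that
# depend on the fingerprints/rigour normalization tables.
from nomenklatura.matching.compat import fingerprint_name, name_words, names_word_list

# Lowercases, strips person prefixes, ASCII-folds and normalizes legal-form
# tokens before collapsing whitespace:
print(fingerprint_name("Siemens Aktiengesellschaft"))  # e.g. "siemens ag"

# Tokenize several names into fingerprinted words of a minimum length:
print(names_word_list(["ACME Holdings Ltd.", "ACME Group"], min_length=3))

# name_words() splits an already-normalized name without re-fingerprinting:
print(name_words("acme holdings", min_length=3))  # ["acme", "holdings"]
```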
File without changes
@@ -0,0 +1,42 @@
+ from followthemoney import registry, E
+
+ from nomenklatura.matching.util import type_pair, has_schema
+
+
+ # def obj_country(left: E, right: E) -> float:
+ #     """Check if two entities share a country."""
+ #     if has_schema(left, right, "LegalEntity"):
+ #         return 0.0
+ #     lv, rv = type_pair(left, right, registry.country)
+ #     if len(lv) == 0 or len(rv) == 0:
+ #         return 0.0
+ #     common = len(set(lv).intersection(rv))
+ #     return 1.0 if common > 0 else -1.0
+ #     # if common == 0:
+ #     #     return -1.0
+ #     # total = len(lv) + len(rv)
+ #     # return float(common) / total
+
+
+ def org_obj_country_match(left: E, right: E) -> float:
+     """Check if two entities share a country."""
+     if has_schema(left, right, "LegalEntity") and not has_schema(
+         left, right, "Organization"
+     ):
+         return 0.0
+     lv, rv = type_pair(left, right, registry.country)
+     if len(lv) == 0 or len(rv) == 0:
+         return 0.0
+     common = len(set(lv).intersection(rv))
+     return 1.0 if common > 0 else -1.0
+
+
+ def per_country_mismatch(left: E, right: E) -> float:
+     """Both persons are linked to different countries."""
+     if not has_schema(left, right, "Person"):
+         return 0.0
+     qv, rv = type_pair(left, right, registry.country)
+     if len(qv) == 0 or len(rv) == 0:
+         return 0.0
+     overlap = len(set(qv).intersection(rv))
+     return 1.0 if overlap == 0 else -0.2
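A sketch of the scoring behaviour, assuming entities built with followthemoney's default model:

```python
# Sketch, assuming construction via followthemoney's default model.
from followthemoney import model
from nomenklatura.matching.erun.countries import per_country_mismatch

query = model.make_entity("Person")
query.add("country", "de")
result = model.make_entity("Person")
result.add("country", "fr")

# Disjoint country sets on two Person entities score 1.0; any overlap
# scores -0.2; a missing country on either side scores 0.0.
print(per_country_mismatch(query, result))  # 1.0
```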
@@ -0,0 +1,64 @@
+ from typing import Set, Tuple
+ from rigour.ids import get_strong_format_names
+
+ from followthemoney import EntityProxy, registry
+
+
+ HONORARY_STRONG = {registry.phone, registry.email, registry.checksum}
+ STRONG_FORMATS = get_strong_format_names()
+
+
+ def _get_strong_identifiers(entity: EntityProxy) -> Set[Tuple[str, str]]:
+     strong_ids: Set[Tuple[str, str]] = set()
+     for prop, value in entity.itervalues():
+         if not prop.matchable:
+             continue
+         if prop.format in STRONG_FORMATS:
+             strong_ids.add((prop.format, value))
+         elif prop.type in HONORARY_STRONG:
+             strong_ids.add((prop.name, value))
+     return strong_ids
+
+
+ def _get_weak_identifiers(entity: EntityProxy) -> Set[str]:
+     weak_ids: Set[str] = set()
+     for prop, value in entity.itervalues():
+         if not prop.matchable or prop.type != registry.identifier:
+             continue
+         if prop.format in STRONG_FORMATS:
+             continue
+         weak_ids.add(value)
+     return weak_ids
+
+
+ def strong_identifier_match(left: EntityProxy, right: EntityProxy) -> float:
+     """Check if two entities share any strong identifiers."""
+     left_strong = _get_strong_identifiers(left)
+     right_strong = _get_strong_identifiers(right)
+     if len(left_strong) == 0 or len(right_strong) == 0:
+         return 0.0
+     if left_strong.intersection(right_strong):
+         return 1.0
+     left_nofmt = {v for _, v in left_strong}
+     right_nofmt = {v for _, v in right_strong}
+     if left_nofmt.intersection(_get_weak_identifiers(right)):
+         return 0.7
+     if right_nofmt.intersection(_get_weak_identifiers(left)):
+         return 0.7
+     left_fmts = {f for f, _ in left_strong}
+     right_fmts = {f for f, _ in right_strong}
+     common_fmts = left_fmts.intersection(right_fmts)
+     return -0.2 * len(common_fmts)
+
+
+ def weak_identifier_match(left: EntityProxy, right: EntityProxy) -> float:
+     """Check if two entities share any weak identifiers."""
+     left_ids = _get_weak_identifiers(left)
+     right_ids = _get_weak_identifiers(right)
+     if left_ids.intersection(right_ids):
+         return 1.0
+     # left_formats = {fmt for fmt, _ in left_ids}
+     # right_formats = {fmt for fmt, _ in right_ids}
+     # if left_formats.intersection(right_formats):
+     #     return -0.5
+     return 0.0
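The scoring is tiered: 1.0 for an exact (format, value) match, 0.7 when a strong value only appears as a weak identifier on the other side, and -0.2 per shared strong format with no matching value. A sketch, assuming "lei" is among the formats returned by rigour's get_strong_format_names():

```python
# Sketch of the scoring tiers; assumes "lei" is a strong format in rigour.
from followthemoney import model
from nomenklatura.matching.erun.identifiers import strong_identifier_match

a = model.make_entity("Company")
a.add("leiCode", "529900T8BM49AURSDO55")
b = model.make_entity("Company")
b.add("leiCode", "529900T8BM49AURSDO55")

# Same value under the same strong format on both sides -> 1.0.
print(strong_identifier_match(a, b))

b_other = model.make_entity("Company")
b_other.add("leiCode", "5299000J2N45DDNE4Y28")  # hypothetical second LEI value
# Both sides carry the "lei" format but no value matches -> -0.2.
print(strong_identifier_match(a, b_other))
```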
@@ -0,0 +1,71 @@
+ from typing import List, Optional, Set
+ from followthemoney import registry, E
+
+ from nomenklatura.matching.compare.util import extract_numbers
+ from nomenklatura.matching.util import type_pair
+ from nomenklatura.matching.util import has_schema
+
+ from rigour.addresses import normalize_address, shorten_address_keywords
+
+ OTHER = registry.gender.OTHER
+
+
+ def _norm_address(addr: str, latinize: bool = True) -> Optional[str]:
+     norm_addr = normalize_address(addr, latinize=latinize, min_length=4)
+     if norm_addr is not None:
+         norm_addr = shorten_address_keywords(norm_addr, latinize=latinize)
+     return norm_addr
+
+
+ def _norm_place(places: List[str]) -> Set[str]:
+     parts = set()
+     for place in places:
+         norm_place = _norm_address(place)
+         if norm_place is not None:
+             for part in norm_place.split(" "):
+                 parts.add(part)
+     return parts
+
+
+ def birth_place(query: E, result: E) -> float:
+     """Same place of birth."""
+     if not has_schema(query, result, "Person"):
+         return 0.0
+     lparts = _norm_place(query.get("birthPlace", quiet=True))
+     rparts = _norm_place(result.get("birthPlace", quiet=True))
+     overlap = len(lparts.intersection(rparts))
+     base_length = max(1.0, min(len(lparts), len(rparts)))
+     return overlap / base_length
+
+
+ def address_match(query: E, result: E) -> float:
+     """Text similarity between addresses."""
+     lv, rv = type_pair(query, result, registry.address)
+     lvn = _norm_place(lv)
+     rvn = _norm_place(rv)
+     if len(lvn) == 0 or len(rvn) == 0:
+         return 0.0
+     overlap = len(lvn.intersection(rvn))
+     tokens = max(1.0, min(len(lvn), len(rvn)))
+     if overlap == 0:
+         return 0.0
+     return float(overlap) / float(tokens)
+
+
+ def address_numbers(query: E, result: E) -> float:
+     """Find if addresses contain numbers, score if the numbers are different."""
+     lv, rv = type_pair(query, result, registry.address)
+     lvn = extract_numbers(lv)
+     rvn = extract_numbers(rv)
+     common = len(lvn.intersection(rvn))
+     disjoint = len(lvn.difference(rvn))
+     return float(common - disjoint)
+
+
+ def gender_mismatch(query: E, result: E) -> float:
+     """Both entities have a different gender associated with them."""
+     qv = {v for v in query.get("gender", quiet=True) if v != OTHER}
+     rv = {v for v in result.get("gender", quiet=True) if v != OTHER}
+     if len(qv) == 1 and len(rv) == 1 and len(qv.intersection(rv)) == 0:
+         return 1.0
+     return 0.0
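For instance, birth_place() scores token overlap of the normalized places against the smaller token set. A sketch; the exact token split depends on rigour's address normalization:

```python
# Sketch; exact tokens depend on rigour.addresses.normalize_address().
from followthemoney import model
from nomenklatura.matching.erun.misc import birth_place

q = model.make_entity("Person")
q.add("birthPlace", "Berlin, Germany")
r = model.make_entity("Person")
r.add("birthPlace", "Berlin")

# overlap / max(1, min(|q tokens|, |r tokens|)): the smaller token set is
# fully contained here, so the score should come out as 1.0.
print(birth_place(q, r))
```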
@@ -0,0 +1,110 @@
+ import pickle
+ import numpy as np
+ from typing import List, Dict, Tuple, cast
+ from functools import cache
+ from sklearn.pipeline import Pipeline  # type: ignore
+ from followthemoney import E
+
+ from nomenklatura.matching.erun.names import name_levenshtein, family_name_match
+ from nomenklatura.matching.erun.names import name_token_overlap, name_numbers
+ from nomenklatura.matching.erun.names import obj_name_levenshtein
+ from nomenklatura.matching.erun.misc import address_match, address_numbers
+ from nomenklatura.matching.erun.misc import birth_place
+ from nomenklatura.matching.erun.misc import gender_mismatch
+ from nomenklatura.matching.erun.countries import (
+     org_obj_country_match,
+     per_country_mismatch,
+ )
+ from nomenklatura.matching.erun.identifiers import strong_identifier_match
+ from nomenklatura.matching.erun.identifiers import weak_identifier_match
+ from nomenklatura.matching.compare.dates import dob_matches, dob_year_matches
+ from nomenklatura.matching.compare.dates import dob_year_disjoint
+ from nomenklatura.matching.types import (
+     FeatureDocs,
+     FeatureDoc,
+     MatchingResult,
+     ScoringConfig,
+ )
+ from nomenklatura.matching.types import CompareFunction, FtResult
+ from nomenklatura.matching.types import Encoded, ScoringAlgorithm
+ from nomenklatura.matching.util import make_github_url
+ from nomenklatura.util import DATA_PATH
+
+
+ class EntityResolveRegression(ScoringAlgorithm):
+     """Entity resolution matcher. Do not use this in (regulated) screening scenarios."""
+
+     NAME = "er-unstable"
+     MODEL_PATH = DATA_PATH.joinpath(f"{NAME}.pkl")
+     FEATURES: List[CompareFunction] = [
+         name_token_overlap,
+         name_numbers,
+         name_levenshtein,
+         strong_identifier_match,
+         weak_identifier_match,
+         dob_matches,
+         dob_year_matches,
+         FtResult.unwrap(dob_year_disjoint),
+         family_name_match,
+         birth_place,
+         gender_mismatch,
+         per_country_mismatch,
+         org_obj_country_match,
+         obj_name_levenshtein,
+         address_match,
+         address_numbers,
+     ]
+
+     @classmethod
+     def save(cls, pipe: Pipeline, coefficients: Dict[str, float]) -> None:
+         """Store a classification pipeline after training."""
+         mdl = pickle.dumps({"pipe": pipe, "coefficients": coefficients})
+         with open(cls.MODEL_PATH, "wb") as fh:
+             fh.write(mdl)
+         cls.load.cache_clear()
+
+     @classmethod
+     @cache
+     def load(cls) -> Tuple[Pipeline, Dict[str, float]]:
+         """Load a pre-trained classification pipeline for ad-hoc use."""
+         with open(cls.MODEL_PATH, "rb") as fh:
+             matcher = pickle.loads(fh.read())
+         pipe = cast(Pipeline, matcher["pipe"])
+         coefficients = cast(Dict[str, float], matcher["coefficients"])
+         current = [f.__name__ for f in cls.FEATURES]
+         if list(coefficients.keys()) != current:
+             raise RuntimeError("Model was not trained on identical features!")
+         return pipe, coefficients
+
+     @classmethod
+     def get_feature_docs(cls) -> FeatureDocs:
+         """Return an explanation of the features and their coefficients."""
+         features: FeatureDocs = {}
+         _, coefficients = cls.load()
+         for func in cls.FEATURES:
+             name = func.__name__
+             features[name] = FeatureDoc(
+                 description=func.__doc__,
+                 coefficient=float(coefficients[name]),
+                 url=make_github_url(func),
+             )
+         return features
+
+     @classmethod
+     def compare(cls, query: E, result: E, config: ScoringConfig) -> MatchingResult:
+         """Use a regression model to compare two entities."""
+         pipe, _ = cls.load()
+         encoded = cls.encode_pair(query, result)
+         npfeat = np.array([encoded])
+         pred = pipe.predict_proba(npfeat)
+         score = cast(float, pred[0][1])
+         explanations: Dict[str, FtResult] = {}
+         for feature, coeff in zip(cls.FEATURES, encoded):
+             name = feature.__name__
+             explanations[name] = FtResult(score=float(coeff), detail=None)
+         return MatchingResult.make(score=score, explanations=explanations)
+
+     @classmethod
+     def encode_pair(cls, left: E, right: E) -> Encoded:
+         """Encode the comparison between two entities as a set of feature values."""
+         return [f(left, right) for f in cls.FEATURES]
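A sketch of ad-hoc use: encode_pair() yields the raw feature vector in FEATURES order, and get_feature_docs() exposes the learned coefficients. Both require the bundled data/er-unstable.pkl model; attribute access on FeatureDoc is assumed from its keyword construction above:

```python
# Sketch; requires the bundled er-unstable.pkl model to be loadable.
from followthemoney import model
from nomenklatura.matching.erun.model import EntityResolveRegression

q = model.make_entity("Person")
q.add("name", "Jane Doe")
r = model.make_entity("Person")
r.add("name", "Jane A. Doe")

# Raw feature values, ordered like EntityResolveRegression.FEATURES:
print(EntityResolveRegression.encode_pair(q, r))

# Learned logistic-regression coefficient per feature (FeatureDoc attribute
# access assumed):
for name, doc in EntityResolveRegression.get_feature_docs().items():
    print(name, doc.coefficient)
```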
@@ -0,0 +1,126 @@
+ from functools import lru_cache
+ from typing import Set
+ from followthemoney import EntityProxy, registry, E
+ from followthemoney.names import schema_type_tag
+ from rigour.text.distance import levenshtein_similarity
+ from rigour.names import Name, NameTypeTag
+ from rigour.names import is_stopword
+ from rigour.names import remove_org_prefixes, remove_obj_prefixes
+ from rigour.names import remove_person_prefixes
+ from rigour.names import replace_org_types_compare
+
+ from nomenklatura.matching.erun.util import compare_levenshtein
+ from nomenklatura.matching.util import max_in_sets, has_schema
+ from nomenklatura.util import unroll
+
+
+ @lru_cache(maxsize=512)
+ def _entity_names(entity: EntityProxy) -> Set[Name]:
+     names: Set[Name] = set()
+     tag = schema_type_tag(entity.schema)
+     for string in entity.get_type_values(registry.name, matchable=True):
+         if tag in (NameTypeTag.ORG, NameTypeTag.ENT):
+             string = replace_org_types_compare(string)
+             string = remove_org_prefixes(string)
+         elif tag == NameTypeTag.PER:
+             string = remove_person_prefixes(string)
+         else:
+             string = remove_obj_prefixes(string)
+         n = Name(string, tag=tag)
+         names.add(n)
+     return names
+
+
+ def name_levenshtein(left: E, right: E) -> float:
+     """Consider the edit distance (as a fraction of name length) between the two most
+     similar names linked to both entities."""
+     if not has_schema(left, right, "LegalEntity"):
+         return 0.0
+     if has_schema(left, right, "Person"):
+         left_names: Set[str] = set()
+         for name in _entity_names(left):
+             left_names.add(" ".join(sorted(part.comparable for part in name.parts)))
+             left_names.add(name.comparable)
+         right_names: Set[str] = set()
+         for name in _entity_names(right):
+             right_names.add(" ".join(sorted(part.comparable for part in name.parts)))
+             right_names.add(name.comparable)
+     else:
+         left_names = {n.comparable for n in _entity_names(left)}
+         right_names = {n.comparable for n in _entity_names(right)}
+     return max_in_sets(left_names, right_names, compare_levenshtein)
+
+
+ def _entity_lastnames(entity: EntityProxy) -> Set[str]:
+     names: Set[str] = set()
+     for string in entity.get("lastName", quiet=True):
+         n = Name(string, tag=NameTypeTag.PER)
+         for part in n.parts:
+             if len(part.comparable) > 2 and not is_stopword(part.form):
+                 names.add(part.comparable)
+     return names
+
+
+ def family_name_match(left: E, right: E) -> float:
+     """Matching family name between the two entities."""
+     if not has_schema(left, right, "Person"):
+         return 0.0
+     lnames = _entity_lastnames(left)
+     rnames = _entity_lastnames(right)
+     if len(lnames) == 0 or len(rnames) == 0:
+         return 0.0
+     overlap = lnames.intersection(rnames)
+     return -1.0 if len(overlap) == 0 else 1.0
+
+
+ def _name_tokens(entity: EntityProxy) -> Set[str]:
+     tokens: Set[str] = set()
+     for name in _entity_names(entity):
+         for part in name.parts:
+             cmp = part.comparable
+             if len(cmp) > 2 and not is_stopword(part.form):
+                 tokens.add(cmp)
+     return tokens
+
+
+ def name_token_overlap(left: E, right: E) -> float:
+     """Evaluate the proportion of identical words in each name."""
+     left_tokens = _name_tokens(left)
+     right_tokens = _name_tokens(right)
+     common = left_tokens.intersection(right_tokens)
+     tokens = min(len(left_tokens), len(right_tokens))
+     return float(len(common)) / float(max(2.0, tokens))
+
+
+ def name_numbers(left: E, right: E) -> float:
+     """Find if names contain numbers, score if the numbers are different."""
+     left_names = [n.parts for n in _entity_names(left)]
+     right_names = [n.parts for n in _entity_names(right)]
+     left_numbers = {p.comparable for p in unroll(left_names) if p.numeric}
+     right_numbers = {p.comparable for p in unroll(right_names) if p.numeric}
+     total = len(left_numbers) + len(right_numbers)
+     if total == 0:
+         return 0.0
+     common = len(left_numbers.intersection(right_numbers))
+     if common == 0 and len(left_numbers) > 0 and len(right_numbers) > 0:
+         # If both names contain numbers, but they are different, this is a strong
+         # signal that the names are not the same.
+         return -1.0
+     return common / float(total)
+
+
+ def _compare_strict_levenshtein(left: str, right: str) -> float:
+     """A stricter version of levenshtein that returns 0.0 if the names are too
+     different in length."""
+     max_edits = min(2, max(len(left), len(right)) // 4)
+     score = levenshtein_similarity(left, right, max_edits=max_edits)
+     return score**2
+
+
+ def obj_name_levenshtein(left: E, right: E) -> float:
+     """Very strict name comparison on object (Vessel, RealEstate, Security) names."""
+     if has_schema(left, right, "LegalEntity"):
+         return 0.0
+     left_names = {n.comparable for n in _entity_names(left)}
+     right_names = {n.comparable for n in _entity_names(right)}
+     return max_in_sets(left_names, right_names, _compare_strict_levenshtein)
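The token-overlap arithmetic divides the shared tokens by the smaller token set, floored at two, so a single-token match can never reach a full score. A pure-Python mirror of that computation:

```python
# Pure-Python mirror of the name_token_overlap() arithmetic above.
left_tokens = {"maria", "fernanda", "gonzalez"}
right_tokens = {"maria", "gonzalez"}

common = left_tokens.intersection(right_tokens)       # {"maria", "gonzalez"}
tokens = min(len(left_tokens), len(right_tokens))     # 2
score = float(len(common)) / float(max(2.0, tokens))  # 2 / 2.0 = 1.0
print(score)

# With one shared token the denominator is still floored at 2.0:
print(float(1) / float(max(2.0, 1)))  # 0.5
```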
@@ -0,0 +1,135 @@
+ import logging
+ import multiprocessing
+ import random
+ from concurrent.futures import ProcessPoolExecutor
+ from pprint import pprint
+ from typing import Iterable, List, Tuple
+
+ import numpy as np
+ from followthemoney import registry, EntityProxy
+ from followthemoney.util import PathLike
+ from numpy.typing import NDArray
+ from sklearn import metrics  # type: ignore
+ from sklearn.linear_model import LogisticRegression  # type: ignore
+ from sklearn.model_selection import train_test_split  # type: ignore
+ from sklearn.pipeline import make_pipeline  # type: ignore
+ from sklearn.preprocessing import StandardScaler  # type: ignore
+
+ from nomenklatura.judgement import Judgement
+ from nomenklatura.matching.erun.model import EntityResolveRegression
+ from nomenklatura.matching.pairs import JudgedPair, read_pairs
+
+ log = logging.getLogger(__name__)
+
+
+ def pair_convert(pair: JudgedPair) -> Tuple[List[float], int]:
+     """Encode a pair of training data into features and target."""
+     judgement = 1 if pair.judgement == Judgement.POSITIVE else 0
+     features = EntityResolveRegression.encode_pair(pair.left, pair.right)
+     return features, judgement
+
+
+ def pairs_to_arrays(
+     pairs: Iterable[JudgedPair],
+ ) -> Tuple[NDArray[np.float32], NDArray[np.float32]]:
+     """Parallelize feature computation for training data."""
+     xrows = []
+     yrows = []
+     threads = multiprocessing.cpu_count()
+     log.info("Compute threads: %d", threads)
+     with ProcessPoolExecutor(max_workers=threads) as executor:
+         results = executor.map(pair_convert, pairs, chunksize=1000)
+         for idx, (x, y) in enumerate(results):
+             if idx > 0 and idx % 10000 == 0:
+                 log.info("Computing features: %s....", idx)
+             xrows.append(x)
+             yrows.append(y)
+
+     return np.array(xrows), np.array(yrows)
+
+
+ def _entity_weight(entity: EntityProxy) -> float:
+     """This weights up entities with more matchable properties, to push down the
+     value of name-only matches."""
+     weight = 0.0
+     # types = set()
+     for prop, _ in entity.itervalues():
+         if prop.matchable:
+             inc_weight = 0.2 if prop.type == registry.name else 1.0
+             weight += inc_weight
+             # types.add(prop.type)
+     # if entity.schema.is_a("LegalEntity") and types == {registry.name}:
+     #     weight = weight * 0.5
+     return weight
+
+
+ def weighted_pair_sort(pairs: List[JudgedPair]) -> List[JudgedPair]:
+     for pair in pairs:
+         left_weight = _entity_weight(pair.left)
+         right_weight = _entity_weight(pair.right)
+         # pair.weight = (left_weight + right_weight) / 2.0
+         pair.weight = min(left_weight, right_weight)
+     return sorted(pairs, key=lambda p: -p.weight)
+
+
+ def build_dataset(
+     pairs_file: PathLike,
+ ) -> Tuple[NDArray[np.float32], NDArray[np.float32]]:
+     """Load and balance a dataset from a JSON file."""
+     pairs = []
+     for pair in read_pairs(pairs_file):
+         if not pair.left.schema.matchable or not pair.right.schema.matchable:
+             continue
+         if pair.left.schema.is_a("Address") or pair.right.schema.is_a("Address"):
+             continue
+         if pair.judgement == Judgement.UNSURE:
+             pair.judgement = Judgement.NEGATIVE
+         pairs.append(pair)
+     positive = [p for p in pairs if p.judgement == Judgement.POSITIVE]
+     negative = [p for p in pairs if p.judgement == Judgement.NEGATIVE]
+     log.info(
+         "Total pairs loaded: %d (%d pos/%d neg)",
+         len(pairs),
+         len(positive),
+         len(negative),
+     )
+     min_class = min(len(positive), len(negative))
+     log.info("Downsampling to %d per class", min_class)
+     if len(positive) > min_class:
+         positive = weighted_pair_sort(positive)
+         pairs = positive[:min_class] + negative
+     else:
+         negative = weighted_pair_sort(negative)
+         pairs = positive + negative[:min_class]
+     random.shuffle(pairs)
+     log.info("Training pairs after downsampling: %d", len(pairs))
+     return pairs_to_arrays(pairs)
+
+
+ def train_matcher(pairs_file: PathLike) -> None:
+     X, y = build_dataset(pairs_file)
+     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)
+     # logreg = LogisticRegression(class_weight={0: 95, 1: 1})
+     # logreg = LogisticRegression(penalty="l1", solver="liblinear")
+     logreg = LogisticRegression(penalty="l2")
+     log.info("Training model...")
+     pipe = make_pipeline(StandardScaler(), logreg)
+     pipe.fit(X_train, y_train)
+     coef = logreg.coef_[0]
+     coefficients = {
+         n.__name__: c for n, c in zip(EntityResolveRegression.FEATURES, coef)
+     }
+     EntityResolveRegression.save(pipe, coefficients)
+     print("Written to: %s" % EntityResolveRegression.MODEL_PATH.as_posix())
+     print("Coefficients:")
+     pprint(coefficients)
+     y_pred = pipe.predict(X_test)
+     cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
+     print("Confusion matrix:\n", cnf_matrix)
+     print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
+     print("Precision:", metrics.precision_score(y_test, y_pred))
+     print("Recall:", metrics.recall_score(y_test, y_pred))
+
+     y_pred_proba = pipe.predict_proba(X_test)[::, 1]
+     auc = metrics.roc_auc_score(y_test, y_pred_proba)
+     print("Area under curve:", auc)
@@ -0,0 +1,28 @@
+ from normality import ascii_text
+ from typing import Iterable, Set, Tuple
+ from rigour.text.distance import levenshtein
+ from rigour.names import tokenize_name
+
+
+ def tokenize(texts: Iterable[str]) -> Set[str]:
+     tokens: Set[str] = set()
+     for text in texts:
+         text = text.casefold()
+         for token in tokenize_name(text):
+             ascii_token = ascii_text(token)
+             if ascii_token is not None and len(ascii_token) > 2:
+                 tokens.add(ascii_token)
+     return tokens
+
+
+ def tokenize_pair(
+     pair: Tuple[Iterable[str], Iterable[str]],
+ ) -> Tuple[Set[str], Set[str]]:
+     return tokenize(pair[0]), tokenize(pair[1])
+
+
+ def compare_levenshtein(left: str, right: str) -> float:
+     distance = levenshtein(left, right)
+     base = max((1, len(left), len(right)))
+     return 1.0 - (distance / float(base))
+     # return math.sqrt(distance)
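A worked example of the scaled edit distance; the tokenize() output shown is an assumption, since it depends on rigour's tokenizer and normality's ASCII folding:

```python
# "smith" vs "smyth": one substitution over a five-character base string.
from nomenklatura.matching.erun.util import compare_levenshtein, tokenize

print(compare_levenshtein("smith", "smyth"))  # 1.0 - 1/5 = 0.8

# tokenize() casefolds, splits with rigour's tokenize_name() and keeps
# ASCII-foldable tokens longer than two characters:
print(tokenize(["Müller Trading House"]))  # e.g. {"muller", "trading", "house"}
```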
File without changes