nomenklatura-mpt 4.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118) hide show
  1. nomenklatura/__init__.py +11 -0
  2. nomenklatura/cache.py +194 -0
  3. nomenklatura/cli.py +260 -0
  4. nomenklatura/conflicting_match.py +80 -0
  5. nomenklatura/data/er-unstable.pkl +0 -0
  6. nomenklatura/data/regression-v1.pkl +0 -0
  7. nomenklatura/db.py +139 -0
  8. nomenklatura/delta.py +4 -0
  9. nomenklatura/enrich/__init__.py +94 -0
  10. nomenklatura/enrich/aleph.py +141 -0
  11. nomenklatura/enrich/common.py +219 -0
  12. nomenklatura/enrich/nominatim.py +72 -0
  13. nomenklatura/enrich/opencorporates.py +233 -0
  14. nomenklatura/enrich/openfigi.py +124 -0
  15. nomenklatura/enrich/permid.py +201 -0
  16. nomenklatura/enrich/wikidata.py +268 -0
  17. nomenklatura/enrich/yente.py +116 -0
  18. nomenklatura/exceptions.py +9 -0
  19. nomenklatura/index/__init__.py +5 -0
  20. nomenklatura/index/common.py +24 -0
  21. nomenklatura/index/entry.py +89 -0
  22. nomenklatura/index/index.py +170 -0
  23. nomenklatura/index/tokenizer.py +92 -0
  24. nomenklatura/judgement.py +21 -0
  25. nomenklatura/kv.py +40 -0
  26. nomenklatura/matching/__init__.py +47 -0
  27. nomenklatura/matching/bench.py +32 -0
  28. nomenklatura/matching/compare/__init__.py +0 -0
  29. nomenklatura/matching/compare/addresses.py +71 -0
  30. nomenklatura/matching/compare/countries.py +15 -0
  31. nomenklatura/matching/compare/dates.py +83 -0
  32. nomenklatura/matching/compare/gender.py +15 -0
  33. nomenklatura/matching/compare/identifiers.py +30 -0
  34. nomenklatura/matching/compare/names.py +157 -0
  35. nomenklatura/matching/compare/util.py +51 -0
  36. nomenklatura/matching/compat.py +66 -0
  37. nomenklatura/matching/erun/__init__.py +0 -0
  38. nomenklatura/matching/erun/countries.py +42 -0
  39. nomenklatura/matching/erun/identifiers.py +64 -0
  40. nomenklatura/matching/erun/misc.py +71 -0
  41. nomenklatura/matching/erun/model.py +110 -0
  42. nomenklatura/matching/erun/names.py +126 -0
  43. nomenklatura/matching/erun/train.py +135 -0
  44. nomenklatura/matching/erun/util.py +28 -0
  45. nomenklatura/matching/logic_v1/__init__.py +0 -0
  46. nomenklatura/matching/logic_v1/identifiers.py +104 -0
  47. nomenklatura/matching/logic_v1/model.py +76 -0
  48. nomenklatura/matching/logic_v1/multi.py +21 -0
  49. nomenklatura/matching/logic_v1/phonetic.py +142 -0
  50. nomenklatura/matching/logic_v2/__init__.py +0 -0
  51. nomenklatura/matching/logic_v2/identifiers.py +124 -0
  52. nomenklatura/matching/logic_v2/model.py +98 -0
  53. nomenklatura/matching/logic_v2/names/__init__.py +3 -0
  54. nomenklatura/matching/logic_v2/names/analysis.py +51 -0
  55. nomenklatura/matching/logic_v2/names/distance.py +181 -0
  56. nomenklatura/matching/logic_v2/names/magic.py +60 -0
  57. nomenklatura/matching/logic_v2/names/match.py +195 -0
  58. nomenklatura/matching/logic_v2/names/pairing.py +81 -0
  59. nomenklatura/matching/logic_v2/names/util.py +89 -0
  60. nomenklatura/matching/name_based/__init__.py +4 -0
  61. nomenklatura/matching/name_based/misc.py +86 -0
  62. nomenklatura/matching/name_based/model.py +59 -0
  63. nomenklatura/matching/name_based/names.py +59 -0
  64. nomenklatura/matching/pairs.py +42 -0
  65. nomenklatura/matching/regression_v1/__init__.py +0 -0
  66. nomenklatura/matching/regression_v1/misc.py +75 -0
  67. nomenklatura/matching/regression_v1/model.py +110 -0
  68. nomenklatura/matching/regression_v1/names.py +63 -0
  69. nomenklatura/matching/regression_v1/train.py +87 -0
  70. nomenklatura/matching/regression_v1/util.py +31 -0
  71. nomenklatura/matching/svm_v1/__init__.py +5 -0
  72. nomenklatura/matching/svm_v1/misc.py +94 -0
  73. nomenklatura/matching/svm_v1/model.py +168 -0
  74. nomenklatura/matching/svm_v1/names.py +81 -0
  75. nomenklatura/matching/svm_v1/train.py +186 -0
  76. nomenklatura/matching/svm_v1/util.py +30 -0
  77. nomenklatura/matching/types.py +227 -0
  78. nomenklatura/matching/util.py +62 -0
  79. nomenklatura/publish/__init__.py +0 -0
  80. nomenklatura/publish/dates.py +49 -0
  81. nomenklatura/publish/edges.py +32 -0
  82. nomenklatura/py.typed +0 -0
  83. nomenklatura/resolver/__init__.py +6 -0
  84. nomenklatura/resolver/common.py +2 -0
  85. nomenklatura/resolver/edge.py +107 -0
  86. nomenklatura/resolver/identifier.py +60 -0
  87. nomenklatura/resolver/linker.py +101 -0
  88. nomenklatura/resolver/resolver.py +565 -0
  89. nomenklatura/settings.py +17 -0
  90. nomenklatura/store/__init__.py +41 -0
  91. nomenklatura/store/base.py +130 -0
  92. nomenklatura/store/level.py +272 -0
  93. nomenklatura/store/memory.py +102 -0
  94. nomenklatura/store/redis_.py +131 -0
  95. nomenklatura/store/sql.py +219 -0
  96. nomenklatura/store/util.py +48 -0
  97. nomenklatura/store/versioned.py +371 -0
  98. nomenklatura/tui/__init__.py +17 -0
  99. nomenklatura/tui/app.py +294 -0
  100. nomenklatura/tui/app.tcss +52 -0
  101. nomenklatura/tui/comparison.py +81 -0
  102. nomenklatura/tui/util.py +35 -0
  103. nomenklatura/util.py +26 -0
  104. nomenklatura/versions.py +119 -0
  105. nomenklatura/wikidata/__init__.py +14 -0
  106. nomenklatura/wikidata/client.py +122 -0
  107. nomenklatura/wikidata/lang.py +94 -0
  108. nomenklatura/wikidata/model.py +139 -0
  109. nomenklatura/wikidata/props.py +70 -0
  110. nomenklatura/wikidata/qualified.py +49 -0
  111. nomenklatura/wikidata/query.py +66 -0
  112. nomenklatura/wikidata/value.py +87 -0
  113. nomenklatura/xref.py +125 -0
  114. nomenklatura_mpt-4.1.9.dist-info/METADATA +159 -0
  115. nomenklatura_mpt-4.1.9.dist-info/RECORD +118 -0
  116. nomenklatura_mpt-4.1.9.dist-info/WHEEL +4 -0
  117. nomenklatura_mpt-4.1.9.dist-info/entry_points.txt +3 -0
  118. nomenklatura_mpt-4.1.9.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,86 @@
1
+ from itertools import product
2
+ from typing import Iterable, Set
3
+ from prefixdate import Precision
4
+ from followthemoney.proxy import E
5
+ from followthemoney.types import registry
6
+ from rigour.text.distance import levenshtein
7
+ from rigour.ids import StrictFormat
8
+
9
+ from nomenklatura.matching.compare.util import clean_map
10
+ from nomenklatura.matching.types import FtResult, ScoringConfig
11
+ from nomenklatura.matching.util import has_schema, props_pair, type_pair
12
+
13
+
14
def _dates_precision(values: Iterable[str], precision: Precision) -> Set[str]:
    """Cut each date string down to the requested precision.

    ``precision.value`` is used as a character count: values with at least
    that many characters are truncated to exactly that length, shorter
    values are left out of the result.
    """
    return {
        value[: precision.value]
        for value in values
        if len(value) >= precision.value
    }
20
+
21
+
22
def _flip_day_month(value: str) -> str:
    """Swap the second and third dash-separated fields of a date string.

    Mixing up day and month is such a common mistake that we want to
    accommodate it. Raises ``ValueError`` if the value does not split into
    exactly three fields.
    """
    year, month, day = value.split("-", 2)
    return "-".join((year, day, month))
26
+
27
+
28
def dob_day_disjoint(query: E, result: E, config: ScoringConfig) -> FtResult:
    """The birth date of the two entities is not the same."""
    # Score semantics:
    #   0.0 - no signal: a side has no birth date, no day-precision date, or
    #         both sides share at least one full date.
    #   0.5 - the dates only agree once day and month are swapped on the
    #         query side.
    #   1.0 - both sides have day-precision dates and none of them agree.
    # `config` is accepted for signature compatibility and not used here.
    query_dates, result_dates = props_pair(query, result, ["birthDate"])
    if len(query_dates) == 0 or len(result_dates) == 0:
        return FtResult(score=0.0, detail="No birth dates provided")
    result_days = _dates_precision(result_dates, Precision.DAY)
    query_days = _dates_precision(query_dates, Precision.DAY)
    if not result_days or not query_days:
        return FtResult(score=0.0, detail="Birth days don't include day precision")
    # Sets have no stable iteration order for strings across processes, so the
    # matched values are sorted to keep the detail text reproducible.
    overlap = query_days & result_days
    if overlap:
        matched = ", ".join(sorted(overlap))
        return FtResult(score=0.0, detail=f"Birth day match: {matched}")
    query_flipped = {_flip_day_month(day) for day in query_days}
    overlap = query_flipped & result_days
    if overlap:
        matched = ", ".join(sorted(overlap))
        detail = f"Birth day flipped match: {matched}"
        return FtResult(score=0.5, detail=detail)
    return FtResult(score=1.0, detail="Birth day mis-match")
46
+
47
+
48
def dob_year_disjoint(query: E, result: E, config: ScoringConfig) -> FtResult:
    """The birth date of the two entities is not the same."""
    # Only the year portion of each birth date is compared here:
    #   0.0 - a side has no year-precision birth date, or a year is shared.
    #   1.0 - both sides have birth years and none of them agree.
    # `config` is accepted for signature compatibility and not used here.
    query_dates, result_dates = props_pair(query, result, ["birthDate"])
    query_years = _dates_precision(query_dates, Precision.YEAR)
    result_years = _dates_precision(result_dates, Precision.YEAR)
    if not query_years or not result_years:
        return FtResult(score=0.0, detail="No birth years provided")
    overlap = query_years & result_years
    if overlap:
        # Sorted so the detail text does not depend on set iteration order.
        detail = f"Birth year match: {', '.join(sorted(overlap))}"
        return FtResult(score=0.0, detail=detail)
    return FtResult(score=1.0, detail="Birth year mis-match")
60
+
61
+
62
def orgid_disjoint(query: E, result: E, config: ScoringConfig) -> FtResult:
    """Two companies or organizations have different tax identifiers or registration
    numbers."""
    # `config` is accepted for signature compatibility and not used here.
    if not has_schema(query, result, "Organization"):
        return FtResult(score=0.0, detail="Neither entity is an organization")
    query_ids_, result_ids_ = type_pair(query, result, registry.identifier)
    query_ids = clean_map(query_ids_, StrictFormat.normalize)
    result_ids = clean_map(result_ids_, StrictFormat.normalize)
    # NOTE: this branch is taken as soon as *either* side has no usable
    # identifier, even though the message says "Neither".
    if not len(query_ids) or not len(result_ids):
        return FtResult(score=0.0, detail="Neither entity has identifiers")
    common = query_ids.intersection(result_ids)
    if len(common) > 0:
        # Sorted so the detail text does not depend on set iteration order.
        shared = ", ".join(sorted(common))
        return FtResult(score=0.0, detail="Common identifiers: %s" % shared)
    # Look for the closest near-miss between any two identifiers. Only pairs
    # more than 70% similar soften the mismatch; anything less similar leaves
    # max_ratio at 0.0 and therefore the score at 1.0.
    # max_len cannot be zero here: two empty identifiers would be equal and
    # would already have been reported as common above.
    max_ratio = 0.0
    for query_id, result_id in product(query_ids, result_ids):
        distance = levenshtein(query_id, result_id)
        max_len = max(len(query_id), len(result_id))
        ratio = 1.0 - (distance / float(max_len))
        if ratio > 0.7:
            max_ratio = max(max_ratio, ratio)
    detail = "Mismatched identifiers: %s vs %s" % (
        ", ".join(sorted(query_ids)),
        ", ".join(sorted(result_ids)),
    )
    return FtResult(score=1 - max_ratio, detail=detail)
@@ -0,0 +1,59 @@
1
+ from typing import Dict
2
+
3
+ from nomenklatura.matching.types import Feature, HeuristicAlgorithm
4
+ from nomenklatura.matching.compare.countries import country_mismatch
5
+ from nomenklatura.matching.compare.gender import gender_mismatch
6
+ from nomenklatura.matching.name_based.misc import orgid_disjoint
7
+ from nomenklatura.matching.name_based.misc import dob_day_disjoint, dob_year_disjoint
8
+ from nomenklatura.matching.name_based.names import jaro_name_parts
9
+ from nomenklatura.matching.name_based.names import soundex_name_parts
10
+
11
+
12
class NameMatcher(HeuristicAlgorithm):
    """An algorithm that matches on entity name, using phonetic comparisons and edit
    distance to generate potential matches. This implementation is vaguely based on
    the behaviour proposed by the US OFAC documentation (FAQ #249)."""

    # Try to re-produce results from: https://sanctionssearch.ofac.treas.gov/
    # cf. https://ofac.treasury.gov/faqs/topic/1636

    NAME = "name-based"
    features = [
        Feature(func=jaro_name_parts, weight=0.5),
        Feature(func=soundex_name_parts, weight=0.5),
    ]

    @classmethod
    def compute_score(
        cls, scores: Dict[str, float], weights: Dict[str, float]
    ) -> float:
        """Return the weighted sum of the feature scores.

        Both mappings are looked up by feature name; a feature missing from
        either mapping contributes 0.0.
        """
        total = 0.0
        # Accumulate in declaration order with a plain loop so the float
        # result is exactly the left-to-right sum.
        for feature in cls.features:
            name = feature.name
            total += scores.get(name, 0.0) * weights.get(name, 0.0)
        return total
34
+
35
+
36
class NameQualifiedMatcher(HeuristicAlgorithm):
    """Same as the name-based algorithm, but scores will be reduced if a mis-match
    of birth dates and nationalities is found for persons, or different
    tax/registration identifiers are included for organizations and companies."""

    NAME = "name-qualified"
    # The two name features carry the positive weight; every qualifier has a
    # negative weight and so can only lower the combined score.
    features = [
        Feature(func=jaro_name_parts, weight=0.5),
        Feature(func=soundex_name_parts, weight=0.5),
        Feature(func=country_mismatch, weight=-0.1, qualifier=True),
        Feature(func=dob_year_disjoint, weight=-0.1, qualifier=True),
        Feature(func=dob_day_disjoint, weight=-0.15, qualifier=True),
        Feature(func=gender_mismatch, weight=-0.1, qualifier=True),
        Feature(func=orgid_disjoint, weight=-0.1, qualifier=True),
    ]

    @classmethod
    def compute_score(
        cls, scores: Dict[str, float], weights: Dict[str, float]
    ) -> float:
        """Return the weighted sum of the feature scores.

        Both mappings are looked up by feature name; a feature missing from
        either mapping contributes 0.0.
        """
        total = 0.0
        # Accumulate in declaration order with a plain loop so the float
        # result is exactly the left-to-right sum.
        for feature in cls.features:
            name = feature.name
            total += scores.get(name, 0.0) * weights.get(name, 0.0)
        return total
@@ -0,0 +1,59 @@
1
+ from typing import List, Optional, Tuple
2
+ from followthemoney.proxy import E
3
+ from followthemoney.types import registry
4
+ from rigour.text.distance import jaro_winkler
5
+ from rigour.text.phonetics import soundex
6
+
7
+ from nomenklatura.matching.types import FtResult, ScoringConfig
8
+ from nomenklatura.matching.util import type_pair
9
+ from nomenklatura.matching.compat import names_word_list
10
+
11
+
12
def _soundex_token(token: str) -> str:
    """Return a phonetic key for a name token, falling back to upper-casing.

    Soundex is only attempted for purely alphabetic tokens longer than one
    character. Any other token, or one for which soundex gives back an empty
    string, is returned upper-cased instead.
    """
    if len(token) > 1 and token.isalpha():
        code = soundex(token)
        # soundex doesn't handle non-ascii characters, so the code may be empty
        if code:
            return code
    return token.upper()
19
+
20
+
21
def soundex_name_parts(query: E, result: E, config: ScoringConfig) -> FtResult:
    """Compare two sets of name parts using the phonetic matching."""
    # The score is the number of shared phonetic keys divided by the size of
    # the smaller key set, so a short name fully contained in a longer one
    # scores 1.0. `config` is accepted for signature compatibility only.
    query_names_, result_names_ = type_pair(query, result, registry.name)
    query_soundex = {_soundex_token(p) for p in names_word_list(query_names_)}
    result_soundex = {_soundex_token(p) for p in names_word_list(result_names_)}
    overlap = query_soundex & result_soundex
    if not overlap:
        return FtResult(score=0.0, detail=None)
    min_len = min(len(query_soundex), len(result_soundex))
    score = len(overlap) / float(max(1.0, min_len))
    # Sorted so the detail text does not depend on set iteration order.
    detail = f"Matched {len(overlap)} tokens: {', '.join(sorted(overlap))}"
    return FtResult(score=score, detail=detail)
33
+
34
+
35
def jaro_name_parts(query: E, result: E, config: ScoringConfig) -> FtResult:
    """Compare two sets of name parts using the Jaro-Winkler string similarity
    algorithm."""
    # Each distinct query token is paired with its most similar result token.
    # Similarities of 0.5 or below count as 0.0 (no pairing). The score is the
    # mean over all query tokens. `config` is accepted for signature
    # compatibility only.
    query_names_, result_names_ = type_pair(query, result, registry.name)
    # Both token collections are de-duplicated and then sorted: iterating raw
    # sets would make tie-breaking between equally similar tokens, the float
    # summation order and the detail text depend on hash ordering.
    result_parts = sorted(set(names_word_list(result_names_)))
    query_parts = sorted(set(names_word_list(query_names_)))
    similarities: List[float] = []
    tokens: List[Tuple[str, str]] = []
    for part in query_parts:
        best = 0.0
        best_token: Optional[str] = None

        for other in result_parts:
            part_similarity = jaro_winkler(part, other)
            if part_similarity > 0.5 and part_similarity > best:
                best = part_similarity
                best_token = other

        similarities.append(best)
        if best_token is not None:
            tokens.append((part, best_token))
    if not similarities:
        return FtResult(score=0.0, detail=None)
    score = sum(similarities) / float(max(1.0, len(similarities)))
    mapping = ", ".join(f"{a} -> {b}" for a, b in tokens)
    return FtResult(score=score, detail=f"Matched {len(tokens)} tokens: {mapping}")
@@ -0,0 +1,42 @@
1
+ import json
2
+ from typing import Generator, Dict, Any
3
+ from followthemoney.proxy import EntityProxy
4
+ from followthemoney.util import PathLike
5
+
6
+ from nomenklatura.judgement import Judgement
7
+
8
+
9
class JudgedPair:
    """A pair of two entities which have been judged to be the same
    (or not) by a user."""

    __slots__ = ("left", "right", "weight", "judgement")

    def __init__(
        self, left: EntityProxy, right: EntityProxy, judgement: Judgement
    ) -> None:
        """Store both entities and the judgement; the weight starts at 0.0."""
        self.left, self.right = left, right
        self.judgement = judgement
        self.weight = 0.0

    def to_dict(self) -> Dict[str, Any]:
        """Serialise the pair into a plain dictionary.

        Entities are converted with their own ``to_dict()`` and the judgement
        is stored as its enum ``value``.
        """
        serialised: Dict[str, Any] = {}
        serialised["left"] = self.left.to_dict()
        serialised["right"] = self.right.to_dict()
        serialised["judgement"] = self.judgement.value
        serialised["weight"] = self.weight
        return serialised
30
+
31
+
32
def read_pairs(pairs_file: PathLike) -> Generator[JudgedPair, None, None]:
    """Read judgement pairs (training data) from a line-delimited JSON file.

    Each non-blank line must hold one JSON object with ``left``, ``right``
    and ``judgement`` keys. Pairs whose judgement is neither positive nor
    negative are skipped.

    Raises whatever ``json.loads``, ``EntityProxy.from_dict`` or the
    ``Judgement`` constructor raise for a malformed line.
    """
    # JSON text is UTF-8 by specification; do not rely on the locale default.
    with open(pairs_file, "r", encoding="utf-8") as fh:
        for line in fh:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing inside json.loads.
            if not line.strip():
                continue
            data = json.loads(line)
            left_entity = EntityProxy.from_dict(data["left"])
            right_entity = EntityProxy.from_dict(data["right"])
            judgement = Judgement(data["judgement"])
            if judgement not in (Judgement.POSITIVE, Judgement.NEGATIVE):
                continue
            yield JudgedPair(left_entity, right_entity, judgement)
File without changes
@@ -0,0 +1,75 @@
1
+ from followthemoney.proxy import E
2
+ from followthemoney.types import registry
3
+
4
+ from nomenklatura.matching.regression_v1.util import tokenize_pair, compare_levenshtein
5
+ from nomenklatura.matching.compare.util import has_overlap, extract_numbers, is_disjoint
6
+ from nomenklatura.matching.util import props_pair, type_pair
7
+ from nomenklatura.matching.util import max_in_sets, has_schema
8
+ from nomenklatura.matching.compat import clean_name_ascii
9
+
10
+
11
def birth_place(query: E, result: E) -> float:
    """Same place of birth."""
    # Fraction of shared birthPlace tokens relative to the smaller token set.
    # The divisor never drops below 2, so a lone shared token scores 0.5.
    query_tokens, result_tokens = tokenize_pair(
        props_pair(query, result, ["birthPlace"])
    )
    smaller = min(len(query_tokens), len(result_tokens))
    shared = len(query_tokens.intersection(result_tokens))
    return float(shared) / float(max(2.0, smaller))
16
+
17
+
18
def address_match(query: E, result: E) -> float:
    """Text similarity between addresses."""
    # Every address is cleaned to its ASCII form, then max_in_sets picks the
    # best Levenshtein-based similarity across all query/result combinations.
    query_addresses, result_addresses = type_pair(query, result, registry.address)
    query_cleaned = [clean_name_ascii(address) for address in query_addresses]
    result_cleaned = [clean_name_ascii(address) for address in result_addresses]
    return max_in_sets(query_cleaned, result_cleaned, compare_levenshtein)
24
+
25
+
26
def address_numbers(query: E, result: E) -> float:
    """Find if addresses contain numbers, score if the numbers are different."""
    # The value is (numbers present on both sides) minus (numbers present
    # only on the query side). It can therefore be negative, and it is not
    # symmetric in query/result. That behaviour is kept unchanged because the
    # stored model was trained on it.
    lv, rv = type_pair(query, result, registry.address)
    lvn = extract_numbers(lv)
    rvn = extract_numbers(rv)
    common = len(lvn.intersection(rvn))
    disjoint = len(lvn.difference(rvn))
    # Convert explicitly so the declared float return type is honoured.
    return float(common - disjoint)
34
+
35
+
36
def phone_match(query: E, result: E) -> float:
    """Matching phone numbers between the two entities."""
    # Binary feature: 1.0 as soon as the phone values overlap, else 0.0.
    query_phones, result_phones = type_pair(query, result, registry.phone)
    if has_overlap(query_phones, result_phones):
        return 1.0
    return 0.0
40
+
41
+
42
def email_match(query: E, result: E) -> float:
    """Matching email addresses between the two entities."""
    # Binary feature: 1.0 as soon as the email values overlap, else 0.0.
    query_emails, result_emails = type_pair(query, result, registry.email)
    if has_overlap(query_emails, result_emails):
        return 1.0
    return 0.0
46
+
47
+
48
def identifier_match(query: E, result: E) -> float:
    """Matching identifiers (e.g. passports, national ID cards, registration or
    tax numbers) between the two entities."""
    # Pairs for which the Organization schema check passes are scored by
    # org_identifier_match instead, so this feature stays at 0.0 for them.
    if has_schema(query, result, "Organization"):
        return 0.0
    query_ids, result_ids = type_pair(query, result, registry.identifier)
    if has_overlap(query_ids, result_ids):
        return 1.0
    return 0.0
55
+
56
+
57
def org_identifier_match(query: E, result: E) -> float:
    """Matching identifiers (e.g. registration or tax numbers) between two
    organizations or companies."""
    # Counterpart of identifier_match: it only fires when the Organization
    # schema check passes.
    if has_schema(query, result, "Organization"):
        query_ids, result_ids = type_pair(query, result, registry.identifier)
        if has_overlap(query_ids, result_ids):
            return 1.0
    return 0.0
64
+
65
+
66
def gender_mismatch(query: E, result: E) -> float:
    """Both entities have a different gender associated with them."""
    # Binary feature: 1.0 when is_disjoint reports the gender values as
    # disjoint, else 0.0.
    query_genders, result_genders = props_pair(query, result, ["gender"])
    if is_disjoint(query_genders, result_genders):
        return 1.0
    return 0.0
70
+
71
+
72
def country_mismatch(query: E, result: E) -> float:
    """Both entities are linked to different countries."""
    # Binary feature: 1.0 when is_disjoint reports the country values as
    # disjoint, else 0.0.
    query_countries, result_countries = type_pair(query, result, registry.country)
    if is_disjoint(query_countries, result_countries):
        return 1.0
    return 0.0
@@ -0,0 +1,110 @@
1
+ import pickle
2
+ import numpy as np
3
+ from typing import List, Dict, Tuple, cast
4
+ from functools import cache
5
+ from sklearn.pipeline import Pipeline # type: ignore
6
+ from followthemoney.proxy import E
7
+
8
+ from nomenklatura.matching.regression_v1.names import first_name_match
9
+ from nomenklatura.matching.regression_v1.names import family_name_match
10
+ from nomenklatura.matching.regression_v1.names import name_levenshtein, name_match
11
+ from nomenklatura.matching.regression_v1.names import name_token_overlap, name_numbers
12
+ from nomenklatura.matching.regression_v1.misc import phone_match, email_match
13
+ from nomenklatura.matching.regression_v1.misc import address_match, address_numbers
14
+ from nomenklatura.matching.regression_v1.misc import identifier_match, birth_place
15
+ from nomenklatura.matching.regression_v1.misc import org_identifier_match
16
+ from nomenklatura.matching.regression_v1.misc import gender_mismatch
17
+ from nomenklatura.matching.regression_v1.misc import country_mismatch
18
+ from nomenklatura.matching.compare.dates import dob_matches, dob_year_matches
19
+ from nomenklatura.matching.compare.dates import dob_year_disjoint
20
+ from nomenklatura.matching.types import (
21
+ FeatureDocs,
22
+ FeatureDoc,
23
+ MatchingResult,
24
+ ScoringConfig,
25
+ )
26
+ from nomenklatura.matching.types import CompareFunction, FtResult
27
+ from nomenklatura.matching.types import Encoded, ScoringAlgorithm
28
+ from nomenklatura.matching.util import make_github_url
29
+ from nomenklatura.util import DATA_PATH
30
+
31
+
32
class RegressionV1(ScoringAlgorithm):
    """A simple matching algorithm based on a regression model."""

    NAME = "regression-v1"
    MODEL_PATH = DATA_PATH.joinpath(f"{NAME}.pkl")
    # The order of this list is significant: it defines the column order of
    # the encoded feature vector, and load() requires the stored coefficient
    # names to appear in exactly this order.
    FEATURES: List[CompareFunction] = [
        name_match,
        name_token_overlap,
        name_numbers,
        name_levenshtein,
        phone_match,
        email_match,
        identifier_match,
        dob_matches,
        dob_year_matches,
        FtResult.unwrap(dob_year_disjoint),
        first_name_match,
        family_name_match,
        birth_place,
        gender_mismatch,
        country_mismatch,
        org_identifier_match,
        address_match,
        address_numbers,
    ]

    @classmethod
    def save(cls, pipe: Pipeline, coefficients: Dict[str, float]) -> None:
        """Store a classification pipeline after training."""
        # Serialise before opening the file, so a pickling failure cannot
        # leave a truncated model behind.
        blob = pickle.dumps({"pipe": pipe, "coefficients": coefficients})
        with open(cls.MODEL_PATH, "wb") as fh:
            fh.write(blob)
        # Drop the memoised model so the next load() reads the new file.
        cls.load.cache_clear()

    @classmethod
    @cache
    def load(cls) -> Tuple[Pipeline, Dict[str, float]]:
        """Load a pre-trained classification pipeline for ad-hoc use."""
        # NOTE: pickle can execute arbitrary code while loading; MODEL_PATH
        # must only ever point at a trusted model file.
        with open(cls.MODEL_PATH, "rb") as fh:
            stored = pickle.loads(fh.read())
        pipe = cast(Pipeline, stored["pipe"])
        coefficients = cast(Dict[str, float], stored["coefficients"])
        expected = [func.__name__ for func in cls.FEATURES]
        if list(coefficients) != expected:
            raise RuntimeError("Model was not trained on identical features!")
        return pipe, coefficients

    @classmethod
    def get_feature_docs(cls) -> FeatureDocs:
        """Return an explanation of the features and their coefficients."""
        _, coefficients = cls.load()
        docs: FeatureDocs = {
            func.__name__: FeatureDoc(
                description=func.__doc__,
                coefficient=float(coefficients[func.__name__]),
                url=make_github_url(func),
            )
            for func in cls.FEATURES
        }
        return docs

    @classmethod
    def compare(cls, query: E, result: E, config: ScoringConfig) -> MatchingResult:
        """Use a regression model to compare two entities."""
        pipe, _ = cls.load()
        encoded = cls.encode_pair(query, result)
        # predict_proba expects a 2-D array: a single row holding this pair.
        probabilities = pipe.predict_proba(np.array([encoded]))
        # Column 1 is read as the probability of the positive (match) class.
        score = cast(float, probabilities[0][1])
        # Explanations report the raw feature values, not model coefficients.
        explanations: Dict[str, FtResult] = {
            feature.__name__: FtResult(score=float(value), detail=None)
            for feature, value in zip(cls.FEATURES, encoded)
        }
        return MatchingResult.make(score=score, explanations=explanations)

    @classmethod
    def encode_pair(cls, left: E, right: E) -> Encoded:
        """Encode the comparison between two entities as a set of feature values."""
        encoded: Encoded = []
        for func in cls.FEATURES:
            encoded.append(func(left, right))
        return encoded
@@ -0,0 +1,63 @@
1
+ from typing import Iterable, Set
2
+ from followthemoney.proxy import E
3
+ from followthemoney.types import registry
4
+
5
+ from nomenklatura.matching.regression_v1.util import tokenize_pair, compare_levenshtein
6
+ from nomenklatura.matching.compare.util import is_disjoint, has_overlap, extract_numbers
7
+ from nomenklatura.matching.util import props_pair, type_pair
8
+ from nomenklatura.matching.util import max_in_sets
9
+ from nomenklatura.matching.compat import fingerprint_name
10
+
11
+
12
def normalize_names(raws: Iterable[str]) -> Set[str]:
    """Fingerprint each raw name and collect the distinct results.

    Names for which ``fingerprint_name`` returns ``None`` are dropped; the
    remaining fingerprints are capped at 128 characters.
    """
    fingerprints = (fingerprint_name(raw) for raw in raws)
    return {fp[:128] for fp in fingerprints if fp is not None}
19
+
20
+
21
def name_levenshtein(left: E, right: E) -> float:
    """Consider the edit distance (as a fraction of name length) between the two most
    similar names linked to both entities."""
    # Names are normalised first; max_in_sets then selects the best
    # compare_levenshtein result across all left/right combinations.
    left_names, right_names = type_pair(left, right, registry.name)
    left_normalized = normalize_names(left_names)
    right_normalized = normalize_names(right_names)
    return max_in_sets(left_normalized, right_normalized, compare_levenshtein)
27
+
28
+
29
def first_name_match(left: E, right: E) -> float:
    """Matching first/given name between the two entities."""
    # Binary feature: 1.0 when the tokenised firstName values overlap.
    left_tokens, right_tokens = tokenize_pair(
        props_pair(left, right, ["firstName"])
    )
    if has_overlap(left_tokens, right_tokens):
        return 1.0
    return 0.0
33
+
34
+
35
def family_name_match(left: E, right: E) -> float:
    """Matching family name between the two entities."""
    # Binary feature: 1.0 when the tokenised lastName values overlap.
    left_tokens, right_tokens = tokenize_pair(
        props_pair(left, right, ["lastName"])
    )
    if has_overlap(left_tokens, right_tokens):
        return 1.0
    return 0.0
39
+
40
+
41
def name_match(left: E, right: E) -> float:
    """Check for exact name matches between the two entities."""
    # The value is not a 0/1 flag: it is the character length of the longest
    # normalised name present on both sides, or 0.0 when none is shared.
    left_names, right_names = type_pair(left, right, registry.name)
    left_normalized = normalize_names(left_names)
    right_normalized = normalize_names(right_names)
    shared = left_normalized.intersection(right_normalized)
    return float(max(map(len, shared), default=0))
50
+
51
+
52
def name_token_overlap(left: E, right: E) -> float:
    """Evaluate the proportion of identical words in each name."""
    # Shared tokens divided by the size of the smaller token set; the divisor
    # never drops below 2, so a lone shared token scores 0.5.
    left_tokens, right_tokens = tokenize_pair(type_pair(left, right, registry.name))
    shared = len(left_tokens.intersection(right_tokens))
    smaller = min(len(left_tokens), len(right_tokens))
    return float(shared) / float(max(2.0, smaller))
58
+
59
+
60
def name_numbers(left: E, right: E) -> float:
    """Find if names contain numbers, score if the numbers are different."""
    # Binary feature: 1.0 when is_disjoint reports the numbers extracted from
    # each side's names as disjoint, else 0.0.
    left_names, right_names = type_pair(left, right, registry.name)
    left_numbers = extract_numbers(left_names)
    right_numbers = extract_numbers(right_names)
    if is_disjoint(left_numbers, right_numbers):
        return 1.0
    return 0.0
@@ -0,0 +1,87 @@
1
+ import logging
2
+ import numpy as np
3
+ import multiprocessing
4
+ from typing import Iterable, List, Tuple
5
+ from pprint import pprint
6
+ from numpy.typing import NDArray
7
+ from sklearn.pipeline import make_pipeline # type: ignore
8
+ from sklearn.preprocessing import StandardScaler # type: ignore
9
+ from sklearn.model_selection import train_test_split # type: ignore
10
+ from sklearn.linear_model import LogisticRegression # type: ignore
11
+ from sklearn import metrics # type: ignore
12
+ from concurrent.futures import ThreadPoolExecutor
13
+ from followthemoney.util import PathLike
14
+
15
+ from nomenklatura.judgement import Judgement
16
+ from nomenklatura.matching.pairs import read_pairs, JudgedPair
17
+ from nomenklatura.matching.regression_v1.model import RegressionV1
18
+
19
+ log = logging.getLogger(__name__)
20
+
21
+
22
def pair_convert(pair: JudgedPair) -> Tuple[List[float], int]:
    """Turn one judged pair into a (feature vector, target label) tuple.

    The label is 1 for a positive judgement and 0 for anything else.
    """
    if pair.judgement == Judgement.POSITIVE:
        target = 1
    else:
        target = 0
    return RegressionV1.encode_pair(pair.left, pair.right), target
27
+
28
+
29
def pairs_to_arrays(
    pairs: Iterable[JudgedPair],
) -> Tuple[NDArray[np.float32], NDArray[np.float32]]:
    """Compute features for all training pairs using a thread pool.

    Returns two arrays built with ``np.array``: one row of feature values
    per pair, and the matching target labels. Results keep the input order
    because ``Executor.map`` yields in submission order.
    """
    feature_rows = []
    targets = []
    workers = multiprocessing.cpu_count()
    log.info("Compute threads: %d", workers)
    with ThreadPoolExecutor(max_workers=workers) as pool:
        converted = pool.map(pair_convert, pairs)
        for count, (row, target) in enumerate(converted):
            # Progress message every 10,000 pairs (not for the first one).
            if count > 0 and count % 10000 == 0:
                log.info("Computing features: %s....", count)
            feature_rows.append(row)
            targets.append(target)

    return np.array(feature_rows), np.array(targets)
46
+
47
+
48
def train_matcher(pairs_file: PathLike) -> None:
    """Train the regression-v1 model from a pairs file and store it.

    Steps: load the judged pairs, encode them as feature arrays, hold out
    30% of the rows for evaluation, fit a scaler + L2 logistic regression
    pipeline, save it through ``RegressionV1.save`` and print evaluation
    metrics for the held-out rows.
    """
    pairs = []
    for pair in read_pairs(pairs_file):
        # Unsure judgements are treated as negatives.
        # NOTE(review): read_pairs appears to yield only positive/negative
        # pairs already, so this may never trigger - confirm before removing.
        if pair.judgement == Judgement.UNSURE:
            pair.judgement = Judgement.NEGATIVE
        pairs.append(pair)
    positive = sum(1 for p in pairs if p.judgement == Judgement.POSITIVE)
    negative = sum(1 for p in pairs if p.judgement == Judgement.NEGATIVE)
    log.info("Total pairs loaded: %d (%d pos/%d neg)", len(pairs), positive, negative)

    X, y = pairs_to_arrays(pairs)
    # No random_state is passed, so the split differs from run to run.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)
    logreg = LogisticRegression(penalty="l2")
    log.info("Training model...")
    pipe = make_pipeline(StandardScaler(), logreg)
    pipe.fit(X_train, y_train)

    # Pair every learned coefficient with the name of its feature function;
    # RegressionV1.load() later checks these names against FEATURES.
    learned = logreg.coef_[0]
    coefficients = {
        func.__name__: weight for func, weight in zip(RegressionV1.FEATURES, learned)
    }
    RegressionV1.save(pipe, coefficients)
    print("Written to: %s" % RegressionV1.MODEL_PATH.as_posix())
    print("Coefficients:")
    pprint(coefficients)

    # Evaluate on the held-out rows.
    y_pred = pipe.predict(X_test)
    cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
    print("Confusion matrix:\n", cnf_matrix)
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
    print("Precision:", metrics.precision_score(y_test, y_pred))
    print("Recall:", metrics.recall_score(y_test, y_pred))

    y_pred_proba = pipe.predict_proba(X_test)[:, 1]
    auc = metrics.roc_auc_score(y_test, y_pred_proba)
    print("Area under curve:", auc)
@@ -0,0 +1,31 @@
1
+ from normality.constants import WS
2
+ from typing import Iterable, Set, Tuple
3
+ from rigour.text.distance import levenshtein
4
+
5
+ from nomenklatura.matching.compat import clean_name_ascii
6
+
7
+
8
def tokenize(texts: Iterable[str]) -> Set[str]:
    """Split texts into a set of distinct tokens.

    Each text is cleaned with ``clean_name_ascii`` (texts that clean to
    ``None`` are skipped), split on ``WS`` and stripped; only tokens longer
    than two characters are kept.
    """
    found: Set[str] = set()
    for text in texts:
        cleaned = clean_name_ascii(text)
        if cleaned is not None:
            pieces = (piece.strip() for piece in cleaned.split(WS))
            found.update(piece for piece in pieces if len(piece) > 2)
    return found
19
+
20
+
21
def tokenize_pair(
    pair: Tuple[Iterable[str], Iterable[str]],
) -> Tuple[Set[str], Set[str]]:
    """Apply ``tokenize`` to both members of a pair, keeping their order."""
    first_tokens = tokenize(pair[0])
    second_tokens = tokenize(pair[1])
    return first_tokens, second_tokens
25
+
26
+
27
def compare_levenshtein(left: str, right: str) -> float:
    """Return a similarity of 1.0 minus the length-normalised edit distance.

    The distance is divided by the longer string's length, with a floor of 1
    so that two empty strings do not divide by zero.
    """
    edits = levenshtein(left, right)
    longest = max(1, len(left), len(right))
    return 1.0 - (edits / float(longest))
@@ -0,0 +1,5 @@
1
+ """Support Vector Machine matching algorithm for nomenklatura."""
2
+
3
+ from .model import SVMV1
4
+
5
+ __all__ = ["SVMV1"]