nomenklatura-mpt 4.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. nomenklatura/__init__.py +11 -0
  2. nomenklatura/cache.py +194 -0
  3. nomenklatura/cli.py +260 -0
  4. nomenklatura/conflicting_match.py +80 -0
  5. nomenklatura/data/er-unstable.pkl +0 -0
  6. nomenklatura/data/regression-v1.pkl +0 -0
  7. nomenklatura/db.py +139 -0
  8. nomenklatura/delta.py +4 -0
  9. nomenklatura/enrich/__init__.py +94 -0
  10. nomenklatura/enrich/aleph.py +141 -0
  11. nomenklatura/enrich/common.py +219 -0
  12. nomenklatura/enrich/nominatim.py +72 -0
  13. nomenklatura/enrich/opencorporates.py +233 -0
  14. nomenklatura/enrich/openfigi.py +124 -0
  15. nomenklatura/enrich/permid.py +201 -0
  16. nomenklatura/enrich/wikidata.py +268 -0
  17. nomenklatura/enrich/yente.py +116 -0
  18. nomenklatura/exceptions.py +9 -0
  19. nomenklatura/index/__init__.py +5 -0
  20. nomenklatura/index/common.py +24 -0
  21. nomenklatura/index/entry.py +89 -0
  22. nomenklatura/index/index.py +170 -0
  23. nomenklatura/index/tokenizer.py +92 -0
  24. nomenklatura/judgement.py +21 -0
  25. nomenklatura/kv.py +40 -0
  26. nomenklatura/matching/__init__.py +47 -0
  27. nomenklatura/matching/bench.py +32 -0
  28. nomenklatura/matching/compare/__init__.py +0 -0
  29. nomenklatura/matching/compare/addresses.py +71 -0
  30. nomenklatura/matching/compare/countries.py +15 -0
  31. nomenklatura/matching/compare/dates.py +83 -0
  32. nomenklatura/matching/compare/gender.py +15 -0
  33. nomenklatura/matching/compare/identifiers.py +30 -0
  34. nomenklatura/matching/compare/names.py +157 -0
  35. nomenklatura/matching/compare/util.py +51 -0
  36. nomenklatura/matching/compat.py +66 -0
  37. nomenklatura/matching/erun/__init__.py +0 -0
  38. nomenklatura/matching/erun/countries.py +42 -0
  39. nomenklatura/matching/erun/identifiers.py +64 -0
  40. nomenklatura/matching/erun/misc.py +71 -0
  41. nomenklatura/matching/erun/model.py +110 -0
  42. nomenklatura/matching/erun/names.py +126 -0
  43. nomenklatura/matching/erun/train.py +135 -0
  44. nomenklatura/matching/erun/util.py +28 -0
  45. nomenklatura/matching/logic_v1/__init__.py +0 -0
  46. nomenklatura/matching/logic_v1/identifiers.py +104 -0
  47. nomenklatura/matching/logic_v1/model.py +76 -0
  48. nomenklatura/matching/logic_v1/multi.py +21 -0
  49. nomenklatura/matching/logic_v1/phonetic.py +142 -0
  50. nomenklatura/matching/logic_v2/__init__.py +0 -0
  51. nomenklatura/matching/logic_v2/identifiers.py +124 -0
  52. nomenklatura/matching/logic_v2/model.py +98 -0
  53. nomenklatura/matching/logic_v2/names/__init__.py +3 -0
  54. nomenklatura/matching/logic_v2/names/analysis.py +51 -0
  55. nomenklatura/matching/logic_v2/names/distance.py +181 -0
  56. nomenklatura/matching/logic_v2/names/magic.py +60 -0
  57. nomenklatura/matching/logic_v2/names/match.py +195 -0
  58. nomenklatura/matching/logic_v2/names/pairing.py +81 -0
  59. nomenklatura/matching/logic_v2/names/util.py +89 -0
  60. nomenklatura/matching/name_based/__init__.py +4 -0
  61. nomenklatura/matching/name_based/misc.py +86 -0
  62. nomenklatura/matching/name_based/model.py +59 -0
  63. nomenklatura/matching/name_based/names.py +59 -0
  64. nomenklatura/matching/pairs.py +42 -0
  65. nomenklatura/matching/regression_v1/__init__.py +0 -0
  66. nomenklatura/matching/regression_v1/misc.py +75 -0
  67. nomenklatura/matching/regression_v1/model.py +110 -0
  68. nomenklatura/matching/regression_v1/names.py +63 -0
  69. nomenklatura/matching/regression_v1/train.py +87 -0
  70. nomenklatura/matching/regression_v1/util.py +31 -0
  71. nomenklatura/matching/svm_v1/__init__.py +5 -0
  72. nomenklatura/matching/svm_v1/misc.py +94 -0
  73. nomenklatura/matching/svm_v1/model.py +168 -0
  74. nomenklatura/matching/svm_v1/names.py +81 -0
  75. nomenklatura/matching/svm_v1/train.py +186 -0
  76. nomenklatura/matching/svm_v1/util.py +30 -0
  77. nomenklatura/matching/types.py +227 -0
  78. nomenklatura/matching/util.py +62 -0
  79. nomenklatura/publish/__init__.py +0 -0
  80. nomenklatura/publish/dates.py +49 -0
  81. nomenklatura/publish/edges.py +32 -0
  82. nomenklatura/py.typed +0 -0
  83. nomenklatura/resolver/__init__.py +6 -0
  84. nomenklatura/resolver/common.py +2 -0
  85. nomenklatura/resolver/edge.py +107 -0
  86. nomenklatura/resolver/identifier.py +60 -0
  87. nomenklatura/resolver/linker.py +101 -0
  88. nomenklatura/resolver/resolver.py +565 -0
  89. nomenklatura/settings.py +17 -0
  90. nomenklatura/store/__init__.py +41 -0
  91. nomenklatura/store/base.py +130 -0
  92. nomenklatura/store/level.py +272 -0
  93. nomenklatura/store/memory.py +102 -0
  94. nomenklatura/store/redis_.py +131 -0
  95. nomenklatura/store/sql.py +219 -0
  96. nomenklatura/store/util.py +48 -0
  97. nomenklatura/store/versioned.py +371 -0
  98. nomenklatura/tui/__init__.py +17 -0
  99. nomenklatura/tui/app.py +294 -0
  100. nomenklatura/tui/app.tcss +52 -0
  101. nomenklatura/tui/comparison.py +81 -0
  102. nomenklatura/tui/util.py +35 -0
  103. nomenklatura/util.py +26 -0
  104. nomenklatura/versions.py +119 -0
  105. nomenklatura/wikidata/__init__.py +14 -0
  106. nomenklatura/wikidata/client.py +122 -0
  107. nomenklatura/wikidata/lang.py +94 -0
  108. nomenklatura/wikidata/model.py +139 -0
  109. nomenklatura/wikidata/props.py +70 -0
  110. nomenklatura/wikidata/qualified.py +49 -0
  111. nomenklatura/wikidata/query.py +66 -0
  112. nomenklatura/wikidata/value.py +87 -0
  113. nomenklatura/xref.py +125 -0
  114. nomenklatura_mpt-4.1.9.dist-info/METADATA +159 -0
  115. nomenklatura_mpt-4.1.9.dist-info/RECORD +118 -0
  116. nomenklatura_mpt-4.1.9.dist-info/WHEEL +4 -0
  117. nomenklatura_mpt-4.1.9.dist-info/entry_points.txt +3 -0
  118. nomenklatura_mpt-4.1.9.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,181 @@
1
+ import math
2
+ from functools import lru_cache
3
+ from collections import defaultdict
4
+ from itertools import zip_longest
5
+ from typing import Dict, List, Optional, Tuple
6
+ from rapidfuzz.distance import Levenshtein, Opcodes
7
+ from rigour.names import NamePart, is_stopword
8
+ from rigour.text.distance import levenshtein
9
+
10
+ from nomenklatura.matching.logic_v2.names.util import Match
11
+ from nomenklatura.matching.types import ScoringConfig
12
+ from nomenklatura.util import unroll
13
+
14
# Separator used when name parts are joined into a single comparison string.
SEP = " "

# Character pairs that receive a reduced edit cost in `_edit_cost` when one is
# substituted for the other. They are listed in one direction only here.
SIMILAR_PAIRS = [
    ("0", "o"),
    ("1", "i"),
    ("g", "9"),
    ("q", "9"),
    ("b", "6"),
    ("5", "s"),
    ("e", "i"),
    ("1", "l"),
    ("o", "u"),
    ("i", "j"),
    ("i", "y"),
    ("c", "k"),
    ("n", "h"),
]
# Append the mirrored pairs so that lookups succeed in either direction.
SIMILAR_PAIRS += [(right, left) for left, right in SIMILAR_PAIRS]
31
+
32
+
33
@lru_cache(maxsize=512)
def strict_levenshtein(left: str, right: str, max_rate: int = 4) -> float:
    """Score the similarity of two strings with a hard cap on allowed edits.

    Identical strings score 1.0. Otherwise one edit is permitted for every
    `max_rate` characters of the longer string; strings too short to earn a
    single edit, or pairs that need more edits than permitted, score 0.0.
    Results are memoised by the surrounding `lru_cache`.
    """
    if left == right:
        return 1.0
    longest = max(len(left), len(right))
    allowed_edits = longest // max_rate
    # Equality was handled above, so a zero edit budget can never match.
    if allowed_edits < 1:
        return 0.0
    edits = levenshtein(left, right, max_edits=longest)
    if edits > allowed_edits:
        return 0.0
    # Raising to the edit budget penalises longer strings more steeply.
    base_similarity = 1 - (edits / longest)
    return base_similarity**allowed_edits
46
+
47
+
48
+ def _edit_cost(op: str, qc: Optional[str], rc: Optional[str]) -> float:
49
+ """Calculate the cost of a pair of characters."""
50
+ if op == "equal":
51
+ return 0.0
52
+ if qc == SEP and rc is None:
53
+ return 0.2
54
+ if rc == SEP and qc is None:
55
+ return 0.2
56
+ if (qc, rc) in SIMILAR_PAIRS:
57
+ return 0.7
58
+ if qc is not None and qc.isdigit():
59
+ return 1.5
60
+ if rc is not None and rc.isdigit():
61
+ return 1.5
62
+ return 1.0
63
+
64
+
65
+ def _costs_similarity(costs: List[float], max_cost_bias: float = 1.0) -> float:
66
+ """Calculate a similarity score based on a list of costs."""
67
+ if len(costs) == 0:
68
+ return 0.0
69
+ # max_cost defines how many edits we allow for a given length.
70
+ # We use a log here because for very long names, we don't want an anything goes
71
+ # policy for very long name strings (~hundreds of characters).
72
+ # The log-base is a bit of a magic number. We adjusted it so that for
73
+ # len 8 it allows ~2 edits. That seems reasonable, but is also entirely arbitrary.
74
+ # We use log(x-2) to disable fuzzy-matching completely for very short
75
+ # names (often Chinese names in practice).
76
+ max_cost = math.log(max(len(costs) - 2, 1), 2.35) * max_cost_bias
77
+ total_cost = sum(costs)
78
+ if total_cost == 0:
79
+ return 1.0
80
+ if total_cost > max_cost:
81
+ return 0.0
82
+ # Normalize the score to be between 0 and 1
83
+ return 1 - (total_cost / len(costs))
84
+
85
+
86
@lru_cache(maxsize=512)
def _opcodes(qry_text: str, res_text: str) -> Opcodes:
    """Return the rapidfuzz Levenshtein opcodes that turn `qry_text` into `res_text`.

    The result is memoised per (query, result) string pair by `lru_cache`.
    """
    alignment = Levenshtein.opcodes(qry_text, res_text)
    return alignment
90
+
91
+
92
def weighted_edit_similarity(
    qry_parts: List[NamePart], res_parts: List[NamePart], config: ScoringConfig
) -> List[Match]:
    """Calculate a weighted similarity score between two sets of name parts.

    This function implements custom frills within the context of a simple
    Levenshtein distance calculation. For example:

    * The result is returned as a list of Match objects, which contain a score,
      but also a weight.
    * Removals of full tokens are penalized more lightly than intra-token edits.
    * Some edits inside of words are considered more similar than others,
      e.g. "o" and "0".

    Both part lists are joined with SEP and aligned character by character.
    Parts that share enough equal characters are grouped into one Match; every
    remaining part is returned as a one-sided Match with the default score.
    `config` supplies the `nm_fuzzy_cutoff_factor` bias for the cost budget.
    """
    if len(qry_parts) == 0 and len(res_parts) == 0:
        return []
    qry_text = SEP.join(p.comparable for p in qry_parts)
    res_text = SEP.join(p.comparable for p in res_parts)

    # Keep track of which name parts overlap and how many characters they share
    # in the alignment produced by rapidfuzz Levenshtein opcodes.
    overlaps: Dict[Tuple[NamePart, NamePart], int] = defaultdict(int)

    # Keep track of the costs for each name part, so we can calculate a
    # similarity score later.
    costs: Dict[NamePart, List[float]] = defaultdict(list)

    if len(qry_parts) and len(res_parts):
        # Positional cursors into the part lists. They advance on every
        # separator and are clamped to the last part, so they can never run
        # past the end and never depend on part equality.
        qry_idx = 0
        res_idx = 0
        qry_last = len(qry_parts) - 1
        res_last = len(res_parts) - 1
        for op in _opcodes(qry_text, res_text):
            qry_span = qry_text[op.src_start : op.src_end]
            res_span = res_text[op.dest_start : op.dest_end]
            for qc, rc in zip_longest(qry_span, res_span, fillvalue=None):
                qry_cur = qry_parts[qry_idx]
                res_cur = res_parts[res_idx]
                if op.tag == "equal":
                    if qc not in (None, SEP) and rc not in (None, SEP):
                        # TODO: should this also include "replace"?
                        overlaps[(qry_cur, res_cur)] += 1
                cost = _edit_cost(op.tag, qc, rc)
                if qc is not None:
                    # The separator's own cost is booked on the part it ends.
                    costs[qry_cur].append(cost)
                    if qc == SEP and qry_idx < qry_last:
                        qry_idx += 1
                if rc is not None:
                    costs[res_cur].append(cost)
                    if rc == SEP and res_idx < res_last:
                        res_idx += 1

    # Use the overlaps to create matches between query and result parts.
    part_matches: Dict[NamePart, Match] = {}
    for (qp, rp), overlap in overlaps.items():
        min_len = min(len(qp.comparable), len(rp.comparable))
        if overlap / min_len > 0.51:
            # Reuse a match either part already belongs to, so that chains of
            # overlapping parts end up in one multi-part match.
            match = part_matches.get(qp, part_matches.get(rp, Match()))
            if qp not in match.qps:
                match.qps.append(qp)
            if rp not in match.rps:
                match.rps.append(rp)
            part_matches[rp] = match
            part_matches[qp] = match

    # Compute the scores where an overlap was applied
    bias = config.get_float("nm_fuzzy_cutoff_factor")
    matches = set(part_matches.values())
    for match in matches:
        # Score down stopwords:
        if len(match.qps) == 1 and len(match.rps) == 1:
            if is_stopword(match.qps[0].form):
                match.weight = 0.7

        qcosts = unroll(costs.get(p, [1.0]) for p in match.qps)
        rcosts = unroll(costs.get(p, [1.0]) for p in match.rps)
        match.score = _costs_similarity(qcosts, max_cost_bias=bias) * _costs_similarity(
            rcosts, max_cost_bias=bias
        )

    # Non-matched query parts: this penalizes scenarios where name parts in the
    # query are not matched to any name part in the result. Increasing this
    # penalty will require queries to always be matched in full.
    for qp in qry_parts:
        if qp not in part_matches:
            matches.add(Match(qps=[qp]))

    # Non-matched result parts
    for rp in res_parts:
        if rp not in part_matches:
            matches.add(Match(rps=[rp]))

    return list(matches)
@@ -0,0 +1,60 @@
1
+ from typing import List
2
+ from rigour.names import is_stopword
3
+ from rigour.names import Name, NamePart, Symbol
4
+
5
+
6
# Short private alias for the symbol category enum used as key in the tables below.
_CAT = Symbol.Category

# Used when a match is two-sided (e.g. international~intl), to modify the
# importance of the match in the context of a set of matches.
SYM_WEIGHTS = {
    _CAT.ORG_CLASS: 0.7,
    _CAT.INITIAL: 0.5,
    _CAT.NICK: 0.8,
    # In "A B International" and "X International", we don't want to give too
    # much weight to the symbol.
    _CAT.SYMBOL: 0.3,
    # Vessel 1 vs. Vessel 2 are very different.
    _CAT.NUMERIC: 1.3,
    _CAT.LOCATION: 0.8,
}

# Used when a match is one-sided (e.g. "international" in the query but not the
# result), to modify the impact of the extra name part on the score.
# Categories not listed here get a weight of 1.0 (see weight_extra_match below).
EXTRAS_WEIGHTS = {
    # Siemens AG vs. Siemens: sometimes the org class is omitted.
    _CAT.ORG_CLASS: 0.7,
    _CAT.SYMBOL: 0.7,
    # PE Fund 1 vs. PE Fund: investment funds are often numbered and that is
    # quite important.
    _CAT.NUMERIC: 1.3,
    # Siemens Russia vs. Siemens: we don't care that much because in a local
    # context it's common to omit the suffix of the local subsidiary.
    _CAT.LOCATION: 0.8,
}

# Score assigned to a symbolic match, by symbol category.
SYM_SCORES = {
    _CAT.ORG_CLASS: 0.8,
    _CAT.INITIAL: 0.9,
    _CAT.NAME: 0.9,
    _CAT.NICK: 0.6,
    _CAT.SYMBOL: 0.9,
    _CAT.NUMERIC: 0.9,
    _CAT.LOCATION: 0.9,
}
42
+
43
+
44
def weight_extra_match(parts: List[NamePart], name: Name) -> float:
    """Compute the weight bias for name parts that remained unmatched.

    A single stopword part always yields 0.5. Otherwise the bias starts at 1.0
    and is multiplied by the EXTRAS_WEIGHTS entry (default 1.0) of every span
    on `name` whose parts are exactly `parts`.
    """
    if len(parts) == 1 and is_stopword(parts[0].form):
        return 0.5
    # Compare the part sequences themselves, not their hashes, so that a hash
    # collision can never be mistaken for a matching span.
    wanted = tuple(parts)
    weight = 1.0
    for span in name.spans:
        category = span.symbol.category
        if category == Symbol.Category.NUMERIC:
            part = span.parts[0]
            # Ignore numeric symbols attached to a lone, non-numeric part whose
            # comparable form is shorter than two characters.
            if len(span.parts) == 1 and not part.numeric and len(part.comparable) < 2:
                continue
        if wanted == tuple(span.parts):
            # NOTE(review): the weight is applied once per matching span, so two
            # spans of the same category on the same parts compound. Confirm
            # whether once-per-category was intended.
            weight *= EXTRAS_WEIGHTS.get(category, 1.0)
    return weight
@@ -0,0 +1,195 @@
1
+ from typing import Dict, List, Set
2
+ from rigour.names import NameTypeTag, Name, NamePart, Span, Symbol
3
+ from rigour.names import align_person_name_order, normalize_name
4
+ from rigour.names import remove_obj_prefixes
5
+ from followthemoney.proxy import E, EntityProxy
6
+ from followthemoney import model
7
+ from followthemoney.types import registry
8
+ from followthemoney.names import schema_type_tag
9
+
10
+ from nomenklatura.matching.logic_v2.names.analysis import entity_names
11
+ from nomenklatura.matching.logic_v2.names.magic import weight_extra_match
12
+ from nomenklatura.matching.logic_v2.names.pairing import Pairing
13
+ from nomenklatura.matching.logic_v2.names.distance import weighted_edit_similarity
14
+ from nomenklatura.matching.logic_v2.names.distance import strict_levenshtein
15
+ from nomenklatura.matching.logic_v2.names.util import Match, numbers_mismatch
16
+ from nomenklatura.matching.types import FtResult, ScoringConfig
17
+
18
+
19
+ # Step 1: Generate all Matches based on symbols
20
+ # Step 2: Generate the most highly-scored sequences of matches
21
+ # Step 3: Pick the best sequence
22
+
23
+
24
def match_name_symbolic(query: Name, result: Name, config: ScoringConfig) -> FtResult:
    """Score one query name against one result name.

    Stage 1 enumerates pairings of spans that carry the same symbol on both
    names. Stage 2 scores each pairing by adding fuzzy matches for the parts
    that were not paired symbolically, adjusting weights, and taking the
    weighted average of the match scores. The best-scoring pairing is returned;
    if none scores above zero, the detail names both comparable forms.
    """
    # Stage 1: We create a set of pairings between the symbols that have been
    # annotated as spans on both names. This will try to determine the maximum,
    # non-overlapping set of name parts that can be explained using pre-defined
    # symbols.
    query_symbols: Set[Symbol] = set(span.symbol for span in query.spans)
    pairings = [Pairing.empty()]
    result_map: Dict[Symbol, List[Span]] = {}
    for span in result.spans:
        if span.symbol not in query_symbols:
            continue
        if span.symbol not in result_map:
            result_map[span.symbol] = []
        result_map[span.symbol].append(span)
    seen: Set[int] = set()
    for part in query.parts:
        next_pairings: List[Pairing] = []
        for qspan in query.spans:
            if qspan.symbol not in result_map:
                continue
            if part not in qspan.parts:
                continue
            for rspan in result_map.get(qspan.symbol, []):
                # This assumes that these are the only factors for weighting
                # the resulting match:
                key = hash((qspan.parts, rspan.parts, qspan.symbol.category))
                if key in seen:
                    continue
                for pairing in pairings:
                    if pairing.can_pair(qspan, rspan):
                        seen.add(key)
                        next_pairing = pairing.add(qspan, rspan)
                        next_pairings.append(next_pairing)
        if len(next_pairings):
            pairings = next_pairings

    # Stage 2: We compute the score for each pairing, which is a combination of
    # the symbolic match (some types of symbols are considered less strong
    # matches than others) and the fuzzy match of the remaining name parts.
    # Special scoring is also applied for extra name parts that are not matched
    # to the other name during name alignment.
    extra_query_weight = config.get_float("nm_extra_query_name")
    extra_result_weight = config.get_float("nm_extra_result_name")
    family_name_weight = config.get_float("nm_family_name_weight")
    retval = FtResult(score=0.0, detail=None)
    for pairing in pairings:
        # Pairings derived from a common ancestor share their Match objects.
        # The adjustments below mutate matches in place, so work on private
        # copies; otherwise a shared match would be re-weighted once for every
        # pairing that contains it.
        matches: List[Match] = [
            Match(
                qps=shared.qps,
                rps=shared.rps,
                symbol=shared.symbol,
                score=shared.score,
                weight=shared.weight,
            )
            for shared in pairing.matches
        ]

        # Name parts that have not been tagged with a symbol:
        query_rem = [part for part in query.parts if part not in pairing.query_used]
        result_rem = [part for part in result.parts if part not in pairing.result_used]

        if len(query_rem) > 0 or len(result_rem) > 0:
            if query.tag == NameTypeTag.PER:
                query_rem, result_rem = align_person_name_order(query_rem, result_rem)
            else:
                query_rem = NamePart.tag_sort(query_rem)
                result_rem = NamePart.tag_sort(result_rem)

            matches.extend(weighted_edit_similarity(query_rem, result_rem, config))

        # Apply additional weight and score normalisation to the generated
        # matches based on contextual clues.
        for match in matches:
            # Matches with one side empty, i.e. unmatched parts.
            if len(match.qps) == 0:
                # unmatched result part
                bias = weight_extra_match(match.rps, result)
                match.weight = extra_result_weight * bias
            elif len(match.rps) == 0:
                # unmatched query part
                bias = weight_extra_match(match.qps, query)
                match.weight = extra_query_weight * bias
            # We fall through here to apply the family-name boost to unmatched
            # parts too.

            # We have types of symbol matches where we never score 1.0, but for
            # literal matches, we always want to score 1.0.
            if match.score < 1.0 and match.qstr == match.rstr:
                match.score = 1.0
            # We treat family name matches as more important (but configurable)
            # because they're just globally less murky and changeable than
            # given names.
            if match.is_family_name():
                match.weight *= family_name_weight

        # Weighted average of the match scores for this pairing.
        total_weight = sum(match.weight for match in matches)
        total_score = sum(match.weighted_score for match in matches)
        score = total_score / total_weight if total_weight > 0 else 0.0
        if score > retval.score:
            detail = " ".join(str(m) for m in matches)
            retval = FtResult(score=score, detail=detail)
    if retval.detail is None:
        retval.detail = f"{query.comparable!r}≉{result.comparable!r}"
    return retval
118
+
119
+
120
def _get_object_names(entity: EntityProxy) -> Set[str]:
    """Collect the normalized, matchable name values of an object entity.

    Used for entities such as vessels or assets. Values that `normalize_name`
    maps to None are dropped.
    """
    cleaned: Set[str] = set()
    raw_names = entity.get_type_values(registry.name, matchable=True)
    if not raw_names:
        return cleaned
    for raw_name in raw_names:
        normalized = normalize_name(raw_name)
        if normalized is not None:
            cleaned.add(normalized)
    return cleaned
127
+
128
+
129
def match_object_names(query: E, result: E, config: ScoringConfig) -> FtResult:
    """Match the names of two objects, such as vessels or assets.

    Every query name is compared with every result name using
    `strict_levenshtein` after object prefixes have been removed. If the query
    name contains a number the result name lacks, the score is multiplied by
    one minus the configured `nm_number_mismatch` value. The best-scoring
    comparison is returned; with no names on either side the score is 0.0 and
    the detail is None.
    """
    # Strip prefixes from the result names once, not once per query name.
    result_names = [remove_obj_prefixes(name) for name in _get_object_names(result)]
    mismatch_penalty = 1 - config.get_float("nm_number_mismatch")
    best_result = FtResult(score=0.0, detail=None)
    for query_name in _get_object_names(query):
        query_name = remove_obj_prefixes(query_name)
        for result_name in result_names:
            score = strict_levenshtein(query_name, result_name, max_rate=5)
            if score == 1.0:
                detail = f"[{result_name!r} literalMatch]"
            else:
                detail = f"[{query_name!r}≈{result_name!r}, fuzzyMatch: {score:.2f}]"
            if numbers_mismatch(query_name, result_name):
                score = score * mismatch_penalty
                detail = "Number mismatch"
            if score > best_result.score:
                best_result = FtResult(score=score, detail=detail)
    return best_result
149
+
150
+
151
def name_match(query: E, result: E, config: ScoringConfig) -> FtResult:
    """Match two entities by analyzing and comparing their names.

    The common schema of both entities selects the strategy: unsupported name
    types return a zero score with an explanation, object types are delegated
    to `match_object_names`, and all other types go through literal and then
    symbolic name matching.
    """
    common_schema = model.common_schema(query.schema, result.schema)
    type_tag = schema_type_tag(common_schema)
    best = FtResult(score=0.0, detail=None)
    if type_tag == NameTypeTag.UNK:
        # Name matching is not supported for entities that are not listed
        # as a person, organization, or a thing.
        best.detail = "Unsuited for name matching: %s" % common_schema.name
        return best
    if type_tag == NameTypeTag.OBJ:
        return match_object_names(query, result, config)

    query_names = entity_names(type_tag, query, is_query=True)
    result_names = entity_names(type_tag, result)

    # For literal matches, return early instead of performing all the magic.
    # This addresses a user surprise where literal matches can score below 1.0
    # after name de-duplication has only left a superset name on one side.
    query_by_text = {name.comparable: name for name in query_names}
    result_by_text = {name.comparable: name for name in result_names}
    shared_texts = set(query_by_text).intersection(result_by_text)
    if shared_texts:
        longest_text = max(shared_texts, key=len)
        literal = Match(
            qps=query_by_text[longest_text].parts,
            rps=result_by_text[longest_text].parts,
            score=1.0,
        )
        return FtResult(score=literal.score, detail=str(literal))

    # Remove short names that are contained in longer names. This prevents a
    # scenario where a short version of a name ("John Smith") is matched to a
    # query ("John K Smith"), where a longer version ("John K Smith" !=
    # "John R Smith") would have disqualified the match.
    consolidated_query = Name.consolidate_names(query_names)
    consolidated_result = Name.consolidate_names(result_names)

    for query_name in consolidated_query:
        for result_name in consolidated_result:
            candidate = match_name_symbolic(query_name, result_name, config)
            # Ties go to the later candidate, so any comparison replaces the
            # empty default.
            if candidate.score >= best.score:
                best = candidate
    if best.detail is None:
        best.detail = "No names available for matching"
    return best
@@ -0,0 +1,81 @@
1
+ from typing import List, Set
2
+
3
+ from rigour.names import NamePart, Symbol, Span
4
+
5
+ from nomenklatura.matching.logic_v2.names.magic import SYM_SCORES, SYM_WEIGHTS
6
+ from nomenklatura.matching.logic_v2.names.util import Match
7
+
8
+
9
class Pairing:
    """A set of symbolic span matches between a query name and a result name.

    Tracks which name parts are already consumed on each side, plus the Match
    objects created so far. `add` returns a new Pairing and leaves the
    receiver untouched.
    """

    __slots__ = [
        "query_used",
        "result_used",
        "matches",
        "_hash",
    ]

    def __init__(
        self,
        query_used: Set[NamePart],
        result_used: Set[NamePart],
        matches: List[Match],
    ) -> None:
        self.query_used = query_used
        self.result_used = result_used
        self.matches = matches

    @classmethod
    def empty(cls) -> "Pairing":
        """Build a pairing that has consumed no parts and holds no matches."""
        return cls(query_used=set(), result_used=set(), matches=[])

    def can_pair(self, query_span: Span, result_span: Span) -> bool:
        """Decide whether the two spans may be added to this pairing."""
        # Neither span may reuse a part that an earlier match already consumed.
        if not self.query_used.isdisjoint(query_span.parts):
            return False
        if not self.result_used.isdisjoint(result_span.parts):
            return False

        # Spans with identical text are deliberately NOT rejected here: doing
        # so would knock out the stopword-like functionality of symbolic
        # matching.

        category = query_span.symbol.category

        # For initials, at least one of the two leading parts has to be a
        # single character.
        if category == Symbol.Category.INITIAL:
            query_is_long = len(query_span.parts[0]) > 1
            if query_is_long and len(result_span.parts[0]) > 1:
                return False

        if category in (Symbol.Category.NAME, Symbol.Category.NICK):
            # This may not be correct for many tokens since it's expected that
            # the bulk of filtered items will be single-part spans.
            part_pairs = zip(query_span.parts, result_span.parts)
            if not all(qp.can_match(rp) for qp, rp in part_pairs):
                return False

        return True

    def add(self, query_span: Span, result_span: Span) -> "Pairing":
        """Return a new pairing extended by a match between the two spans."""
        symbol = query_span.symbol
        category = symbol.category
        # Some types of symbols effectively also work as soft stopwords,
        # reducing the relevance of the match. For example, "Ltd." in an
        # organization name is not as informative as a person's first name.
        # That's why a low weight is assigned, even for literal matches.
        symbolic = Match(
            qps=query_span.parts,
            rps=result_span.parts,
            symbol=symbol,
            score=SYM_SCORES.get(category, 1.0),
            weight=SYM_WEIGHTS.get(category, 1.0),
        )
        query_used = self.query_used.union(query_span.parts)
        result_used = self.result_used.union(result_span.parts)
        return Pairing(query_used, result_used, [*self.matches, symbolic])
@@ -0,0 +1,89 @@
1
+ import re
2
+
3
+ from typing import Optional, Any, Sequence
4
+ from rigour.names import NamePart, Symbol, NamePartTag
5
+
6
+
7
class Match:
    """A Match combines query and result name parts, along with a score and weight.

    It is one part of the matching result, which is eventually aggregated into
    a final score. Identity (equality and hash) is defined by the symbol and
    the two part sequences only; score and weight do not take part.
    """

    __slots__ = ["qps", "rps", "symbol", "score", "weight"]

    def __init__(
        self,
        qps: Sequence[NamePart] = (),
        rps: Sequence[NamePart] = (),
        symbol: Optional[Symbol] = None,
        score: float = 0.0,
        weight: float = 1.0,
    ) -> None:
        """Initialize the Match with query and result parts.

        Both part sequences are copied into fresh lists, so the match never
        aliases the caller's containers.
        """
        self.qps = list(qps)
        self.rps = list(rps)
        self.symbol = symbol
        self.score = score
        self.weight = weight

    @property
    def weighted_score(self) -> float:
        """The score multiplied by the weight."""
        return self.score * self.weight

    @property
    def qstr(self) -> str:
        """The comparable forms of the query parts, joined by spaces."""
        return " ".join([part.comparable for part in self.qps])

    @property
    def rstr(self) -> str:
        """The comparable forms of the result parts, joined by spaces."""
        return " ".join([part.comparable for part in self.rps])

    def is_family_name(self) -> bool:
        """Check if any query or result part is tagged as a family name."""
        return any(np.tag == NamePartTag.FAMILY for np in [*self.qps, *self.rps])

    def __hash__(self) -> int:
        """Hash the Match based on symbol, query parts and result parts."""
        # NOTE(review): qps and rps are mutable lists, so the hash changes if
        # they are modified after the match was put into a set or dict.
        return hash((self.symbol, tuple(self.qps), tuple(self.rps)))

    def __eq__(self, other: Any) -> bool:
        """Check equality based on symbol, query parts and result parts."""
        # Compare the fields themselves, not the hashes: hash equality would
        # also hold for unrelated objects (e.g. the integer hash value) and
        # for colliding matches.
        if not isinstance(other, Match):
            return NotImplemented
        return (
            self.symbol == other.symbol
            and self.qps == other.qps
            and self.rps == other.rps
        )

    def __repr__(self) -> str:
        """String representation of the Match object."""
        return f"<Match({str(self)})>"

    def __str__(self) -> str:
        """Explain the match for debugging and result details."""
        qps_str = self.qstr
        rps_str = self.rstr
        if self.symbol is not None:
            explanation = f"{qps_str!r}≈{rps_str!r} symbolMatch {self.symbol}"
        elif not len(qps_str):
            explanation = f"{rps_str!r} extraResultPart"
        elif not len(rps_str):
            explanation = f"{qps_str!r} extraQueryPart"
        elif qps_str == rps_str:
            explanation = f"{rps_str!r} literalMatch"
        else:
            explanation = f"{qps_str!r}≈{rps_str!r} fuzzyMatch"
        return f"[{explanation}: {self.score:.2f}, weight {self.weight:.2f}]"
80
+
81
+
82
# Matches every run of one or more digits.
NUMERIC = re.compile(r"\d{1,}")


def numbers_mismatch(query: str, result: str) -> bool:
    """Check if the query contains a number that is missing from the result.

    Numbers are runs of digits, compared as strings. The check is one-sided:
    extra numbers that appear only in the result do not count as a mismatch.
    """
    wanted = set(NUMERIC.findall(query))
    available = set(NUMERIC.findall(result))
    return not wanted.issubset(available)
@@ -0,0 +1,4 @@
1
+ from nomenklatura.matching.name_based.model import NameMatcher
2
+ from nomenklatura.matching.name_based.model import NameQualifiedMatcher
3
+
4
+ __all__ = ["NameMatcher", "NameQualifiedMatcher"]