nomenklatura-mpt 4.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nomenklatura/__init__.py +11 -0
- nomenklatura/cache.py +194 -0
- nomenklatura/cli.py +260 -0
- nomenklatura/conflicting_match.py +80 -0
- nomenklatura/data/er-unstable.pkl +0 -0
- nomenklatura/data/regression-v1.pkl +0 -0
- nomenklatura/db.py +139 -0
- nomenklatura/delta.py +4 -0
- nomenklatura/enrich/__init__.py +94 -0
- nomenklatura/enrich/aleph.py +141 -0
- nomenklatura/enrich/common.py +219 -0
- nomenklatura/enrich/nominatim.py +72 -0
- nomenklatura/enrich/opencorporates.py +233 -0
- nomenklatura/enrich/openfigi.py +124 -0
- nomenklatura/enrich/permid.py +201 -0
- nomenklatura/enrich/wikidata.py +268 -0
- nomenklatura/enrich/yente.py +116 -0
- nomenklatura/exceptions.py +9 -0
- nomenklatura/index/__init__.py +5 -0
- nomenklatura/index/common.py +24 -0
- nomenklatura/index/entry.py +89 -0
- nomenklatura/index/index.py +170 -0
- nomenklatura/index/tokenizer.py +92 -0
- nomenklatura/judgement.py +21 -0
- nomenklatura/kv.py +40 -0
- nomenklatura/matching/__init__.py +47 -0
- nomenklatura/matching/bench.py +32 -0
- nomenklatura/matching/compare/__init__.py +0 -0
- nomenklatura/matching/compare/addresses.py +71 -0
- nomenklatura/matching/compare/countries.py +15 -0
- nomenklatura/matching/compare/dates.py +83 -0
- nomenklatura/matching/compare/gender.py +15 -0
- nomenklatura/matching/compare/identifiers.py +30 -0
- nomenklatura/matching/compare/names.py +157 -0
- nomenklatura/matching/compare/util.py +51 -0
- nomenklatura/matching/compat.py +66 -0
- nomenklatura/matching/erun/__init__.py +0 -0
- nomenklatura/matching/erun/countries.py +42 -0
- nomenklatura/matching/erun/identifiers.py +64 -0
- nomenklatura/matching/erun/misc.py +71 -0
- nomenklatura/matching/erun/model.py +110 -0
- nomenklatura/matching/erun/names.py +126 -0
- nomenklatura/matching/erun/train.py +135 -0
- nomenklatura/matching/erun/util.py +28 -0
- nomenklatura/matching/logic_v1/__init__.py +0 -0
- nomenklatura/matching/logic_v1/identifiers.py +104 -0
- nomenklatura/matching/logic_v1/model.py +76 -0
- nomenklatura/matching/logic_v1/multi.py +21 -0
- nomenklatura/matching/logic_v1/phonetic.py +142 -0
- nomenklatura/matching/logic_v2/__init__.py +0 -0
- nomenklatura/matching/logic_v2/identifiers.py +124 -0
- nomenklatura/matching/logic_v2/model.py +98 -0
- nomenklatura/matching/logic_v2/names/__init__.py +3 -0
- nomenklatura/matching/logic_v2/names/analysis.py +51 -0
- nomenklatura/matching/logic_v2/names/distance.py +181 -0
- nomenklatura/matching/logic_v2/names/magic.py +60 -0
- nomenklatura/matching/logic_v2/names/match.py +195 -0
- nomenklatura/matching/logic_v2/names/pairing.py +81 -0
- nomenklatura/matching/logic_v2/names/util.py +89 -0
- nomenklatura/matching/name_based/__init__.py +4 -0
- nomenklatura/matching/name_based/misc.py +86 -0
- nomenklatura/matching/name_based/model.py +59 -0
- nomenklatura/matching/name_based/names.py +59 -0
- nomenklatura/matching/pairs.py +42 -0
- nomenklatura/matching/regression_v1/__init__.py +0 -0
- nomenklatura/matching/regression_v1/misc.py +75 -0
- nomenklatura/matching/regression_v1/model.py +110 -0
- nomenklatura/matching/regression_v1/names.py +63 -0
- nomenklatura/matching/regression_v1/train.py +87 -0
- nomenklatura/matching/regression_v1/util.py +31 -0
- nomenklatura/matching/svm_v1/__init__.py +5 -0
- nomenklatura/matching/svm_v1/misc.py +94 -0
- nomenklatura/matching/svm_v1/model.py +168 -0
- nomenklatura/matching/svm_v1/names.py +81 -0
- nomenklatura/matching/svm_v1/train.py +186 -0
- nomenklatura/matching/svm_v1/util.py +30 -0
- nomenklatura/matching/types.py +227 -0
- nomenklatura/matching/util.py +62 -0
- nomenklatura/publish/__init__.py +0 -0
- nomenklatura/publish/dates.py +49 -0
- nomenklatura/publish/edges.py +32 -0
- nomenklatura/py.typed +0 -0
- nomenklatura/resolver/__init__.py +6 -0
- nomenklatura/resolver/common.py +2 -0
- nomenklatura/resolver/edge.py +107 -0
- nomenklatura/resolver/identifier.py +60 -0
- nomenklatura/resolver/linker.py +101 -0
- nomenklatura/resolver/resolver.py +565 -0
- nomenklatura/settings.py +17 -0
- nomenklatura/store/__init__.py +41 -0
- nomenklatura/store/base.py +130 -0
- nomenklatura/store/level.py +272 -0
- nomenklatura/store/memory.py +102 -0
- nomenklatura/store/redis_.py +131 -0
- nomenklatura/store/sql.py +219 -0
- nomenklatura/store/util.py +48 -0
- nomenklatura/store/versioned.py +371 -0
- nomenklatura/tui/__init__.py +17 -0
- nomenklatura/tui/app.py +294 -0
- nomenklatura/tui/app.tcss +52 -0
- nomenklatura/tui/comparison.py +81 -0
- nomenklatura/tui/util.py +35 -0
- nomenklatura/util.py +26 -0
- nomenklatura/versions.py +119 -0
- nomenklatura/wikidata/__init__.py +14 -0
- nomenklatura/wikidata/client.py +122 -0
- nomenklatura/wikidata/lang.py +94 -0
- nomenklatura/wikidata/model.py +139 -0
- nomenklatura/wikidata/props.py +70 -0
- nomenklatura/wikidata/qualified.py +49 -0
- nomenklatura/wikidata/query.py +66 -0
- nomenklatura/wikidata/value.py +87 -0
- nomenklatura/xref.py +125 -0
- nomenklatura_mpt-4.1.9.dist-info/METADATA +159 -0
- nomenklatura_mpt-4.1.9.dist-info/RECORD +118 -0
- nomenklatura_mpt-4.1.9.dist-info/WHEEL +4 -0
- nomenklatura_mpt-4.1.9.dist-info/entry_points.txt +3 -0
- nomenklatura_mpt-4.1.9.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,181 @@
|
|
1
|
+
import math
|
2
|
+
from functools import lru_cache
|
3
|
+
from collections import defaultdict
|
4
|
+
from itertools import zip_longest
|
5
|
+
from typing import Dict, List, Optional, Tuple
|
6
|
+
from rapidfuzz.distance import Levenshtein, Opcodes
|
7
|
+
from rigour.names import NamePart, is_stopword
|
8
|
+
from rigour.text.distance import levenshtein
|
9
|
+
|
10
|
+
from nomenklatura.matching.logic_v2.names.util import Match
|
11
|
+
from nomenklatura.matching.types import ScoringConfig
|
12
|
+
from nomenklatura.util import unroll
|
13
|
+
|
14
|
+
# Separator placed between the comparable forms of name parts when they are
# joined into a single string for alignment.
SEP = " "

# Characters that are easily mistaken for one another. `_edit_cost` charges
# less for swapping the two members of a pair than for an arbitrary edit.
SIMILAR_PAIRS = [
    ("0", "o"),
    ("1", "i"),
    ("g", "9"),
    ("q", "9"),
    ("b", "6"),
    ("5", "s"),
    ("e", "i"),
    ("1", "l"),
    ("o", "u"),
    ("i", "j"),
    ("i", "y"),
    ("c", "k"),
    ("n", "h"),
]
# Append the mirror image of every pair so membership tests work in both
# directions. The comprehension is fully built before the list is extended.
SIMILAR_PAIRS += [(right, left) for left, right in SIMILAR_PAIRS]
|
31
|
+
|
32
|
+
|
33
|
+
@lru_cache(maxsize=512)
def strict_levenshtein(left: str, right: str, max_rate: int = 4) -> float:
    """Score the similarity of two strings under a strict edit budget.

    Despite the name, the return value is a similarity and not a distance:
    identical strings score 1.0, and strings that differ by more edits than
    the budget allows score 0.0.

    The budget is one edit per `max_rate` characters of the longer string
    (integer division). Strings shorter than `max_rate` characters therefore
    only match when they are identical.
    """
    if left == right:
        return 1.0
    longest = max(len(left), len(right))
    budget = longest // max_rate
    if budget < 1:
        # The strings are known to differ and not a single edit is allowed.
        return 0.0
    edits = levenshtein(left, right, max_edits=longest)
    if edits > budget:
        return 0.0
    # The exponent grows with the budget, so the same share of edited
    # characters is penalised more heavily in longer strings.
    return (1 - (edits / longest)) ** budget
|
46
|
+
|
47
|
+
|
48
|
+
def _edit_cost(op: str, qc: Optional[str], rc: Optional[str]) -> float:
|
49
|
+
"""Calculate the cost of a pair of characters."""
|
50
|
+
if op == "equal":
|
51
|
+
return 0.0
|
52
|
+
if qc == SEP and rc is None:
|
53
|
+
return 0.2
|
54
|
+
if rc == SEP and qc is None:
|
55
|
+
return 0.2
|
56
|
+
if (qc, rc) in SIMILAR_PAIRS:
|
57
|
+
return 0.7
|
58
|
+
if qc is not None and qc.isdigit():
|
59
|
+
return 1.5
|
60
|
+
if rc is not None and rc.isdigit():
|
61
|
+
return 1.5
|
62
|
+
return 1.0
|
63
|
+
|
64
|
+
|
65
|
+
def _costs_similarity(costs: List[float], max_cost_bias: float = 1.0) -> float:
|
66
|
+
"""Calculate a similarity score based on a list of costs."""
|
67
|
+
if len(costs) == 0:
|
68
|
+
return 0.0
|
69
|
+
# max_cost defines how many edits we allow for a given length.
|
70
|
+
# We use a log here because for very long names, we don't want an anything goes
|
71
|
+
# policy for very long name strings (~hundreds of characters).
|
72
|
+
# The log-base is a bit of a magic number. We adjusted it so that for
|
73
|
+
# len 8 it allows ~2 edits. That seems reasonable, but is also entirely arbitrary.
|
74
|
+
# We use log(x-2) to disable fuzzy-matching completely for very short
|
75
|
+
# names (often Chinese names in practice).
|
76
|
+
max_cost = math.log(max(len(costs) - 2, 1), 2.35) * max_cost_bias
|
77
|
+
total_cost = sum(costs)
|
78
|
+
if total_cost == 0:
|
79
|
+
return 1.0
|
80
|
+
if total_cost > max_cost:
|
81
|
+
return 0.0
|
82
|
+
# Normalize the score to be between 0 and 1
|
83
|
+
return 1 - (total_cost / len(costs))
|
84
|
+
|
85
|
+
|
86
|
+
@lru_cache(maxsize=512)
def _opcodes(qry_text: str, res_text: str) -> Opcodes:
    """Compute the Levenshtein edit operations between two strings.

    Results are memoised, so repeated comparisons of the same pair of strings
    do not recompute the alignment.
    """
    alignment = Levenshtein.opcodes(qry_text, res_text)
    return alignment
|
90
|
+
|
91
|
+
|
92
|
+
def weighted_edit_similarity(
    qry_parts: List[NamePart], res_parts: List[NamePart], config: ScoringConfig
) -> List[Match]:
    """Calculate a weighted similarity score between two sets of name parts. This function implements custom
    frills within the context of a simple Levenshtein distance calculation. For example:

    * The result is returned as a list of Match objects, which contain a score, but also a weight.
    * Removals of full tokens are penalized more lightly than intra-token edits.
    * Some edits inside of words are considered more similar than others, e.g. "o" and "0".

    Name parts are joined with `SEP`, aligned character by character, and grouped into
    matches where a query part and a result part share enough aligned characters. Parts
    that end up in no group are returned as one-sided matches, which keep the default
    `Match` score. The config option `nm_fuzzy_cutoff_factor` scales the edit budget.

    Returns an empty list when both inputs are empty. The order of the returned matches
    is not defined, since they are collected in a set.
    """
    if len(qry_parts) == 0 and len(res_parts) == 0:
        return []
    qry_text = SEP.join(p.comparable for p in qry_parts)
    res_text = SEP.join(p.comparable for p in res_parts)

    # Keep track of which name parts overlap and how many characters they share in the alignment
    # produced by rapidfuzz Levenshtein opcodes.
    overlaps: Dict[Tuple[NamePart, NamePart], int] = defaultdict(int)

    # Keep track of the costs for each name part, so we can calculate a similarity score later.
    costs: Dict[NamePart, List[float]] = defaultdict(list)

    if len(qry_parts) and len(res_parts):
        # Track the current part on each side by position. Looking it up with
        # `list.index()` was linear, resolved to the first equal part, and its
        # bounds check (`>=`) could not prevent an IndexError on the last part.
        qry_idx = 0
        res_idx = 0
        qry_last = len(qry_parts) - 1
        res_last = len(res_parts) - 1
        for op in _opcodes(qry_text, res_text):
            qry_span = qry_text[op.src_start : op.src_end]
            res_span = res_text[op.dest_start : op.dest_end]
            for qc, rc in zip_longest(qry_span, res_span, fillvalue=None):
                qry_cur = qry_parts[qry_idx]
                res_cur = res_parts[res_idx]
                if op.tag == "equal":
                    if qc not in (None, SEP) and rc not in (None, SEP):
                        # TODO: should this also include "replace"?
                        overlaps[(qry_cur, res_cur)] += 1
                cost = _edit_cost(op.tag, qc, rc)
                if qc is not None:
                    # A separator is charged to the part it follows; after it,
                    # the next part (if there is one) becomes current.
                    costs[qry_cur].append(cost)
                    if qc == SEP and qry_idx < qry_last:
                        qry_idx += 1
                if rc is not None:
                    costs[res_cur].append(cost)
                    if rc == SEP and res_idx < res_last:
                        res_idx += 1

    # Use the overlaps to create matches between query and result parts.
    part_matches: Dict[NamePart, Match] = {}
    for (qp, rp), overlap in overlaps.items():
        min_len = min(len(qp.comparable), len(rp.comparable))
        # Only pair two parts if more than half of the shorter one is aligned.
        if overlap / min_len > 0.51:
            match = part_matches.get(qp, part_matches.get(rp, Match()))
            if qp not in match.qps:
                match.qps.append(qp)
            if rp not in match.rps:
                match.rps.append(rp)
            part_matches[rp] = match
            part_matches[qp] = match

    # Compute the scores where an overlap was applied
    bias = config.get_float("nm_fuzzy_cutoff_factor")
    matches = set(part_matches.values())
    for match in matches:
        # Score down stopwords:
        if len(match.qps) == 1 and len(match.rps) == 1:
            if is_stopword(match.qps[0].form):
                match.weight = 0.7

        # The match score is the product of the similarity seen from each side.
        qcosts = unroll(costs.get(p, [1.0]) for p in match.qps)
        rcosts = unroll(costs.get(p, [1.0]) for p in match.rps)
        match.score = _costs_similarity(qcosts, max_cost_bias=bias) * _costs_similarity(
            rcosts, max_cost_bias=bias
        )

    # Non-matched query parts: this penalizes scenarios where name parts in the query are
    # not matched to any name part in the result. Increasing this penalty will require queries
    # to always be matched in full.
    for qp in qry_parts:
        if qp not in part_matches:
            matches.add(Match(qps=[qp]))

    # Non-matched result parts
    for rp in res_parts:
        if rp not in part_matches:
            matches.add(Match(rps=[rp]))

    return list(matches)
|
@@ -0,0 +1,60 @@
|
|
1
|
+
from typing import List
|
2
|
+
from rigour.names import is_stopword
|
3
|
+
from rigour.names import Name, NamePart, Symbol
|
4
|
+
|
5
|
+
|
6
|
+
# Used when a match is two-sided (e.g. international~intl), to modify the importance of the match
# in the context of a set of matches.
# Categories not listed here get a weight of 1.0 (see the `.get(..., 1.0)` lookup in `Pairing.add`).
SYM_WEIGHTS = {
    Symbol.Category.ORG_CLASS: 0.7,
    Symbol.Category.INITIAL: 0.5,
    Symbol.Category.NICK: 0.8,
    # in "A B International" and "X International", we don't want to give too much weight to the symbol
    Symbol.Category.SYMBOL: 0.3,
    # Vessel 1 vs. Vessel 2 are very different.
    Symbol.Category.NUMERIC: 1.3,
    Symbol.Category.LOCATION: 0.8,
}

# Used when a match is one-sided (e.g. "international" in the query but not the result), to modify
# the impact of the extra name part on the score.
# For the categories not listed here, we give a weight of 1.0 (see weight_extra_match below)
EXTRAS_WEIGHTS = {
    # Siemens AG vs. Siemens, sometimes the org class is omitted
    Symbol.Category.ORG_CLASS: 0.7,
    Symbol.Category.SYMBOL: 0.7,
    # PE Fund 1 vs. PE Fund, often investments funds are numbered and that's quite important
    Symbol.Category.NUMERIC: 1.3,
    # Siemens Russia vs. Siemens: we don't care that much because in a local context,
    # it's common to omit the suffix of the local subsidiary.
    Symbol.Category.LOCATION: 0.8,
}

# Score given to a two-sided symbolic match, by symbol category. `Pairing.add` reads this
# table and falls back to 1.0 for categories not listed here.
SYM_SCORES = {
    Symbol.Category.ORG_CLASS: 0.8,
    Symbol.Category.INITIAL: 0.9,
    Symbol.Category.NAME: 0.9,
    Symbol.Category.NICK: 0.6,
    Symbol.Category.SYMBOL: 0.9,
    Symbol.Category.NUMERIC: 0.9,
    Symbol.Category.LOCATION: 0.9,
}
|
42
|
+
|
43
|
+
|
44
|
+
def weight_extra_match(parts: List[NamePart], name: Name) -> float:
    """Compute a weight for name parts which remained unmatched in the other name.

    A single stopword gets a fixed weight of 0.5. Otherwise the weight starts at 1.0
    and is multiplied by the `EXTRAS_WEIGHTS` entry of every span in `name` that covers
    exactly the given parts; categories without an entry leave the weight unchanged.
    """
    if len(parts) == 1 and is_stopword(parts[0].form):
        return 0.5
    sparts = hash(tuple(parts))
    weight = 1.0
    for span in name.spans:
        category = span.symbol.category
        # Ignore a numeric symbol attached to a single part that is neither
        # flagged as numeric nor at least two characters long.
        if category == Symbol.Category.NUMERIC and len(span.parts) == 1:
            part = span.parts[0]
            if not part.numeric and len(part.comparable) < 2:
                continue
        if sparts == hash(tuple(span.parts)):
            weight = weight * EXTRAS_WEIGHTS.get(category, 1.0)
    return weight
|
@@ -0,0 +1,195 @@
|
|
1
|
+
from typing import Dict, List, Set
|
2
|
+
from rigour.names import NameTypeTag, Name, NamePart, Span, Symbol
|
3
|
+
from rigour.names import align_person_name_order, normalize_name
|
4
|
+
from rigour.names import remove_obj_prefixes
|
5
|
+
from followthemoney.proxy import E, EntityProxy
|
6
|
+
from followthemoney import model
|
7
|
+
from followthemoney.types import registry
|
8
|
+
from followthemoney.names import schema_type_tag
|
9
|
+
|
10
|
+
from nomenklatura.matching.logic_v2.names.analysis import entity_names
|
11
|
+
from nomenklatura.matching.logic_v2.names.magic import weight_extra_match
|
12
|
+
from nomenklatura.matching.logic_v2.names.pairing import Pairing
|
13
|
+
from nomenklatura.matching.logic_v2.names.distance import weighted_edit_similarity
|
14
|
+
from nomenklatura.matching.logic_v2.names.distance import strict_levenshtein
|
15
|
+
from nomenklatura.matching.logic_v2.names.util import Match, numbers_mismatch
|
16
|
+
from nomenklatura.matching.types import FtResult, ScoringConfig
|
17
|
+
|
18
|
+
|
19
|
+
# Step 1: Generate all Matches based on symbols
|
20
|
+
# Step 2: Generate the most highly-scored sequences of matches
|
21
|
+
# Step 3: Pick the best sequence
|
22
|
+
|
23
|
+
|
24
|
+
def match_name_symbolic(query: Name, result: Name, config: ScoringConfig) -> FtResult:
    """Score how well two analysed names match, combining symbolic and fuzzy matching.

    Spans that carry the same symbol on both names are paired first. The name parts
    left over are compared with `weighted_edit_similarity`. Each candidate pairing is
    scored as the weighted average of its match scores, and the best one is returned.

    The config options read here are `nm_extra_query_name`, `nm_extra_result_name`
    and `nm_family_name_weight`. If no pairing scores above 0.0, the result has a
    score of 0.0 and a detail string stating that the two names do not match.
    """
    # Stage 1: We create a set of pairings between the symbols that have been annotated as spans
    # on both names. This will try to determine the maximum, non-overlapping set of name
    # parts that can be explained using pre-defined symbols.
    query_symbols: Set[Symbol] = set(span.symbol for span in query.spans)
    pairings = [Pairing.empty()]
    result_map: Dict[Symbol, List[Span]] = {}
    for span in result.spans:
        if span.symbol not in query_symbols:
            continue
        if span.symbol not in result_map:
            result_map[span.symbol] = []
        result_map[span.symbol].append(span)
    seen: Set[int] = set()
    for part in query.parts:
        next_pairings: List[Pairing] = []
        for qspan in query.spans:
            if qspan.symbol not in result_map:
                continue
            if part not in qspan.parts:
                continue
            for rspan in result_map.get(qspan.symbol, []):
                # This assumes that these are the only factors for weighting the
                # resulting match:
                key = hash((qspan.parts, rspan.parts, qspan.symbol.category))
                if key in seen:
                    continue
                for pairing in pairings:
                    if pairing.can_pair(qspan, rspan):
                        seen.add(key)
                        next_pairing = pairing.add(qspan, rspan)
                        next_pairings.append(next_pairing)
        if len(next_pairings):
            pairings = next_pairings

    # Stage 2: We compute the score for each pairing, which is a combination of the
    # symbolic match (some types of symbols are considered less strong matches than others) and
    # the fuzzy match of the remaining name parts. Special scoring is also applied for extra
    # name parts that are not matched to the other name during name alignment.
    extra_query_weight = config.get_float("nm_extra_query_name")
    extra_result_weight = config.get_float("nm_extra_result_name")
    family_name_weight = config.get_float("nm_family_name_weight")
    retval = FtResult(score=0.0, detail=None)
    for pairing in pairings:
        # Work on copies: pairings derived from the same parent share their Match
        # objects, and the adjustments below change score and weight in place.
        # Without copying, the family-name factor would be applied to a shared
        # match once per sibling pairing.
        matches: List[Match] = [
            Match(qps=m.qps, rps=m.rps, symbol=m.symbol, score=m.score, weight=m.weight)
            for m in pairing.matches
        ]

        # Name parts that have not been tagged with a symbol:
        query_rem = [part for part in query.parts if part not in pairing.query_used]
        result_rem = [part for part in result.parts if part not in pairing.result_used]

        if len(query_rem) > 0 or len(result_rem) > 0:
            if query.tag == NameTypeTag.PER:
                query_rem, result_rem = align_person_name_order(query_rem, result_rem)
            else:
                query_rem = NamePart.tag_sort(query_rem)
                result_rem = NamePart.tag_sort(result_rem)

            matches.extend(weighted_edit_similarity(query_rem, result_rem, config))

        # Apply additional weight and score normalisation to the generated matches based
        # on contextual clues.
        for match in matches:
            # Matches with one side empty, i.e. unmatched parts
            # unmatched result part
            if len(match.qps) == 0:
                bias = weight_extra_match(match.rps, result)
                match.weight = extra_result_weight * bias
            # unmatched query part
            elif len(match.rps) == 0:
                bias = weight_extra_match(match.qps, query)
                match.weight = extra_query_weight * bias
            # We fall through here to apply the family-name boost to unmatched parts too.

            # We have types of symbol matches and where we never score 1.0, but for
            # literal matches, we always want to score 1.0
            if match.score < 1.0 and match.qstr == match.rstr:
                match.score = 1.0
            # We treat family names matches as more important (but configurable) because
            # they're just globally less murky and changeable than given names.
            if match.is_family_name():
                match.weight *= family_name_weight

        # The pairing score is the weighted average of the match scores.
        total_weight = sum(match.weight for match in matches)
        total_score = sum(match.weighted_score for match in matches)
        score = total_score / total_weight if total_weight > 0 else 0.0
        if score > retval.score:
            detail = " ".join(str(m) for m in matches)
            retval = FtResult(score=score, detail=detail)
    if retval.detail is None:
        retval.detail = f"{query.comparable!r}≉{result.comparable!r}"
    return retval
|
118
|
+
|
119
|
+
|
120
|
+
def _get_object_names(entity: EntityProxy) -> Set[str]:
    """Collect the normalized, matchable names of an object entity, such as a vessel or asset.

    Names for which `normalize_name` returns None are dropped. An entity without
    names yields an empty set.
    """
    raw_names = entity.get_type_values(registry.name, matchable=True)
    if not raw_names:
        return set()
    return {norm for norm in map(normalize_name, raw_names) if norm is not None}
|
127
|
+
|
128
|
+
|
129
|
+
def match_object_names(query: E, result: E, config: ScoringConfig) -> FtResult:
    """Match the names of two objects, such as vessels or assets.

    Every query name is compared with every result name using `strict_levenshtein`
    after object prefixes have been removed from both. When the query name contains
    a numeral that the result name lacks, the score is multiplied by
    `1 - nm_number_mismatch`. The best-scoring comparison is returned; if nothing
    scores above 0.0, the result has a score of 0.0 and no detail.
    """
    # Prefix removal does not depend on the query name, so do it once up front
    # instead of once per query name.
    result_names = [remove_obj_prefixes(name) for name in _get_object_names(result)]
    mismatch_penalty = 1 - config.get_float("nm_number_mismatch")
    best_result = FtResult(score=0.0, detail=None)
    for query_name in _get_object_names(query):
        query_name = remove_obj_prefixes(query_name)
        for result_name in result_names:
            score = strict_levenshtein(query_name, result_name, max_rate=5)
            if score == 1.0:
                detail = f"[{result_name!r} literalMatch]"
            else:
                detail = f"[{query_name!r}≈{result_name!r}, fuzzyMatch: {score:.2f}]"
            if numbers_mismatch(query_name, result_name):
                score = score * mismatch_penalty
                detail = "Number mismatch"
            if score > best_result.score:
                best_result = FtResult(score=score, detail=detail)
    return best_result
|
149
|
+
|
150
|
+
|
151
|
+
def name_match(query: E, result: E, config: ScoringConfig) -> FtResult:
    """Match two entities by analyzing and comparing their names.

    The common schema of both entities decides the strategy: schemata without a
    known name type are rejected with a score of 0.0, objects are handled by
    `match_object_names`, and all others are compared name by name with
    `match_name_symbolic`, keeping the best result.
    """
    schema = model.common_schema(query.schema, result.schema)
    type_tag = schema_type_tag(schema)
    if type_tag == NameTypeTag.UNK:
        # Name matching is not supported for entities that are not listed
        # as a person, organization, or a thing.
        return FtResult(
            score=0.0,
            detail="Unsuited for name matching: %s" % schema.name,
        )
    if type_tag == NameTypeTag.OBJ:
        return match_object_names(query, result, config)

    query_names = entity_names(type_tag, query, is_query=True)
    result_names = entity_names(type_tag, result)

    # For literal matches, return early instead of performing all the magic. This addresses
    # a user surprise where literal matches can score below 1.0 after name de-duplication has
    # only left a superset name on one side.
    query_by_text = {name.comparable: name for name in query_names}
    result_by_text = {name.comparable: name for name in result_names}
    shared = set(query_by_text).intersection(result_by_text)
    if len(shared) > 0:
        longest = max(shared, key=len)
        literal = Match(
            qps=query_by_text[longest].parts,
            rps=result_by_text[longest].parts,
            score=1.0,
        )
        return FtResult(score=literal.score, detail=str(literal))

    # Remove short names that are contained in longer names.
    # This prevents a scenario where a short version of a name ("John
    # Smith") is matched to a query ("John K Smith"), where a longer version
    # ("John K Smith" != "John R Smith") would have disqualified the match.
    query_names = Name.consolidate_names(query_names)
    result_names = Name.consolidate_names(result_names)

    best = FtResult(score=0.0, detail=None)
    for query_name in query_names:
        for result_name in result_names:
            candidate = match_name_symbolic(query_name, result_name, config)
            # On a tie the later comparison wins, so a zero-score comparison
            # still replaces the empty placeholder.
            if candidate.score >= best.score:
                best = candidate
    if best.detail is None:
        best.detail = "No names available for matching"
    return best
|
@@ -0,0 +1,81 @@
|
|
1
|
+
from typing import List, Set
|
2
|
+
|
3
|
+
from rigour.names import NamePart, Symbol, Span
|
4
|
+
|
5
|
+
from nomenklatura.matching.logic_v2.names.magic import SYM_SCORES, SYM_WEIGHTS
|
6
|
+
from nomenklatura.matching.logic_v2.names.util import Match
|
7
|
+
|
8
|
+
|
9
|
+
class Pairing:
    """A candidate set of symbol-based matches between a query and a result name.

    It records which name parts are already consumed on each side. `add` never
    modifies the receiver; it returns a new pairing.
    """

    __slots__ = [
        "query_used",
        "result_used",
        "matches",
        "_hash",
    ]

    def __init__(
        self,
        query_used: Set[NamePart],
        result_used: Set[NamePart],
        matches: List[Match],
    ) -> None:
        self.query_used = query_used
        self.result_used = result_used
        self.matches = matches

    @classmethod
    def empty(cls) -> "Pairing":
        """Create a pairing that has no matches and no consumed parts."""
        return cls(set(), set(), [])

    def can_pair(self, query_span: Span, result_span: Span) -> bool:
        """Check if the two spans may be added to this pairing as a match."""
        # A name part can be consumed by one match only.
        if not self.query_used.isdisjoint(query_span.parts):
            return False
        if not self.result_used.isdisjoint(result_span.parts):
            return False

        # Spans with identical text are deliberately still paired: rejecting them
        # knocks out the stopword-like functionality of symbolic matching.

        category = query_span.symbol.category
        if category == Symbol.Category.INITIAL:
            # At least one of the two leading parts must have a length of one.
            both_long = len(query_span.parts[0]) > 1 and len(result_span.parts[0]) > 1
            if both_long:
                return False

        if category in (Symbol.Category.NAME, Symbol.Category.NICK):
            # This may not be correct for many tokens since it's expected that the bulk
            # of filtered items will be single-part span.
            return all(
                qp.can_match(rp) for qp, rp in zip(query_span.parts, result_span.parts)
            )

        return True

    def add(self, query_span: Span, result_span: Span) -> "Pairing":
        """Return a new pairing extended by a match between the two spans."""
        category = query_span.symbol.category
        # Some types of symbols effectively also work as soft stopwords, reducing the relevance
        # of the match. For example, "Ltd." in an organization name is not as informative as a
        # person's first name. That's why we're assigning a low weight, even for literal matches.
        paired = Match(
            qps=query_span.parts,
            rps=result_span.parts,
            symbol=query_span.symbol,
            score=SYM_SCORES.get(category, 1.0),
            weight=SYM_WEIGHTS.get(category, 1.0),
        )
        return Pairing(
            self.query_used.union(query_span.parts),
            self.result_used.union(result_span.parts),
            self.matches + [paired],
        )
|
@@ -0,0 +1,89 @@
|
|
1
|
+
import re
|
2
|
+
|
3
|
+
from typing import Optional, Any, Sequence
|
4
|
+
from rigour.names import NamePart, Symbol, NamePartTag
|
5
|
+
|
6
|
+
|
7
|
+
class Match:
    """A Match combines query and result name parts, along with a score and weight. It is one
    part of the matching result, which is eventually aggregated into a final score."""

    __slots__ = ["qps", "rps", "symbol", "score", "weight"]

    def __init__(
        self,
        qps: Sequence["NamePart"] = (),
        rps: Sequence["NamePart"] = (),
        symbol: Optional["Symbol"] = None,
        score: float = 0.0,
        weight: float = 1.0,
    ) -> None:
        """Initialize the Match object with query and result parts.

        Both part sequences are copied into new lists, so the match can be
        extended without affecting the sequences passed in.
        """
        self.qps = list(qps)
        self.rps = list(rps)
        self.symbol = symbol
        self.score = score
        self.weight = weight

    @property
    def weighted_score(self) -> float:
        """Calculate the weighted score."""
        return self.score * self.weight

    @property
    def qstr(self) -> str:
        """Get the query string representation."""
        return " ".join([part.comparable for part in self.qps])

    @property
    def rstr(self) -> str:
        """Get the result string representation."""
        return " ".join([part.comparable for part in self.rps])

    def is_family_name(self) -> bool:
        """Check if any part on either side of the match is tagged as a family name."""
        return any(part.tag == NamePartTag.FAMILY for part in self.qps + self.rps)

    def __hash__(self) -> int:
        """Hash the Match object based on its symbol, query parts and result parts."""
        return hash((self.symbol, tuple(self.qps), tuple(self.rps)))

    def __eq__(self, other: Any) -> bool:
        """Check equality of two Match objects based on symbol, query and result parts.

        Score and weight are not part of the comparison. Objects of other types
        never compare equal; without the type check, hashing an unhashable
        operand raised a TypeError.
        """
        if not isinstance(other, Match):
            return NotImplemented
        return hash(self) == hash(other)

    def __repr__(self) -> str:
        """String representation of the Match object."""
        return f"<Match({str(self)})>"

    def __str__(self) -> str:
        """String representation of the Match object for debugging."""
        qps_str = self.qstr
        rps_str = self.rstr
        if self.symbol is not None:
            explanation = f"{qps_str!r}≈{rps_str!r} symbolMatch {self.symbol}"
        elif not len(qps_str):
            explanation = f"{rps_str!r} extraResultPart"
        elif not len(rps_str):
            explanation = f"{qps_str!r} extraQueryPart"
        elif qps_str == rps_str:
            explanation = f"{rps_str!r} literalMatch"
        else:
            explanation = f"{qps_str!r}≈{rps_str!r} fuzzyMatch"
        return f"[{explanation}: {self.score:.2f}, weight {self.weight:.2f}]"
|
80
|
+
|
81
|
+
|
82
|
+
# Matches a run of one or more consecutive digits.
NUMERIC = re.compile(r"\d{1,}")


def numbers_mismatch(query: str, result: str) -> bool:
    """Check if the query contains a numeral that the result does not.

    The check is one-directional: numerals that occur only in `result` do not
    count as a mismatch. Numerals are compared as whole digit runs.
    """
    wanted = set(NUMERIC.findall(query))
    present = NUMERIC.findall(result)
    return bool(wanted.difference(present))
|