nomenklatura-mpt 4.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nomenklatura/__init__.py +11 -0
- nomenklatura/cache.py +194 -0
- nomenklatura/cli.py +260 -0
- nomenklatura/conflicting_match.py +80 -0
- nomenklatura/data/er-unstable.pkl +0 -0
- nomenklatura/data/regression-v1.pkl +0 -0
- nomenklatura/db.py +139 -0
- nomenklatura/delta.py +4 -0
- nomenklatura/enrich/__init__.py +94 -0
- nomenklatura/enrich/aleph.py +141 -0
- nomenklatura/enrich/common.py +219 -0
- nomenklatura/enrich/nominatim.py +72 -0
- nomenklatura/enrich/opencorporates.py +233 -0
- nomenklatura/enrich/openfigi.py +124 -0
- nomenklatura/enrich/permid.py +201 -0
- nomenklatura/enrich/wikidata.py +268 -0
- nomenklatura/enrich/yente.py +116 -0
- nomenklatura/exceptions.py +9 -0
- nomenklatura/index/__init__.py +5 -0
- nomenklatura/index/common.py +24 -0
- nomenklatura/index/entry.py +89 -0
- nomenklatura/index/index.py +170 -0
- nomenklatura/index/tokenizer.py +92 -0
- nomenklatura/judgement.py +21 -0
- nomenklatura/kv.py +40 -0
- nomenklatura/matching/__init__.py +47 -0
- nomenklatura/matching/bench.py +32 -0
- nomenklatura/matching/compare/__init__.py +0 -0
- nomenklatura/matching/compare/addresses.py +71 -0
- nomenklatura/matching/compare/countries.py +15 -0
- nomenklatura/matching/compare/dates.py +83 -0
- nomenklatura/matching/compare/gender.py +15 -0
- nomenklatura/matching/compare/identifiers.py +30 -0
- nomenklatura/matching/compare/names.py +157 -0
- nomenklatura/matching/compare/util.py +51 -0
- nomenklatura/matching/compat.py +66 -0
- nomenklatura/matching/erun/__init__.py +0 -0
- nomenklatura/matching/erun/countries.py +42 -0
- nomenklatura/matching/erun/identifiers.py +64 -0
- nomenklatura/matching/erun/misc.py +71 -0
- nomenklatura/matching/erun/model.py +110 -0
- nomenklatura/matching/erun/names.py +126 -0
- nomenklatura/matching/erun/train.py +135 -0
- nomenklatura/matching/erun/util.py +28 -0
- nomenklatura/matching/logic_v1/__init__.py +0 -0
- nomenklatura/matching/logic_v1/identifiers.py +104 -0
- nomenklatura/matching/logic_v1/model.py +76 -0
- nomenklatura/matching/logic_v1/multi.py +21 -0
- nomenklatura/matching/logic_v1/phonetic.py +142 -0
- nomenklatura/matching/logic_v2/__init__.py +0 -0
- nomenklatura/matching/logic_v2/identifiers.py +124 -0
- nomenklatura/matching/logic_v2/model.py +98 -0
- nomenklatura/matching/logic_v2/names/__init__.py +3 -0
- nomenklatura/matching/logic_v2/names/analysis.py +51 -0
- nomenklatura/matching/logic_v2/names/distance.py +181 -0
- nomenklatura/matching/logic_v2/names/magic.py +60 -0
- nomenklatura/matching/logic_v2/names/match.py +195 -0
- nomenklatura/matching/logic_v2/names/pairing.py +81 -0
- nomenklatura/matching/logic_v2/names/util.py +89 -0
- nomenklatura/matching/name_based/__init__.py +4 -0
- nomenklatura/matching/name_based/misc.py +86 -0
- nomenklatura/matching/name_based/model.py +59 -0
- nomenklatura/matching/name_based/names.py +59 -0
- nomenklatura/matching/pairs.py +42 -0
- nomenklatura/matching/regression_v1/__init__.py +0 -0
- nomenklatura/matching/regression_v1/misc.py +75 -0
- nomenklatura/matching/regression_v1/model.py +110 -0
- nomenklatura/matching/regression_v1/names.py +63 -0
- nomenklatura/matching/regression_v1/train.py +87 -0
- nomenklatura/matching/regression_v1/util.py +31 -0
- nomenklatura/matching/svm_v1/__init__.py +5 -0
- nomenklatura/matching/svm_v1/misc.py +94 -0
- nomenklatura/matching/svm_v1/model.py +168 -0
- nomenklatura/matching/svm_v1/names.py +81 -0
- nomenklatura/matching/svm_v1/train.py +186 -0
- nomenklatura/matching/svm_v1/util.py +30 -0
- nomenklatura/matching/types.py +227 -0
- nomenklatura/matching/util.py +62 -0
- nomenklatura/publish/__init__.py +0 -0
- nomenklatura/publish/dates.py +49 -0
- nomenklatura/publish/edges.py +32 -0
- nomenklatura/py.typed +0 -0
- nomenklatura/resolver/__init__.py +6 -0
- nomenklatura/resolver/common.py +2 -0
- nomenklatura/resolver/edge.py +107 -0
- nomenklatura/resolver/identifier.py +60 -0
- nomenklatura/resolver/linker.py +101 -0
- nomenklatura/resolver/resolver.py +565 -0
- nomenklatura/settings.py +17 -0
- nomenklatura/store/__init__.py +41 -0
- nomenklatura/store/base.py +130 -0
- nomenklatura/store/level.py +272 -0
- nomenklatura/store/memory.py +102 -0
- nomenklatura/store/redis_.py +131 -0
- nomenklatura/store/sql.py +219 -0
- nomenklatura/store/util.py +48 -0
- nomenklatura/store/versioned.py +371 -0
- nomenklatura/tui/__init__.py +17 -0
- nomenklatura/tui/app.py +294 -0
- nomenklatura/tui/app.tcss +52 -0
- nomenklatura/tui/comparison.py +81 -0
- nomenklatura/tui/util.py +35 -0
- nomenklatura/util.py +26 -0
- nomenklatura/versions.py +119 -0
- nomenklatura/wikidata/__init__.py +14 -0
- nomenklatura/wikidata/client.py +122 -0
- nomenklatura/wikidata/lang.py +94 -0
- nomenklatura/wikidata/model.py +139 -0
- nomenklatura/wikidata/props.py +70 -0
- nomenklatura/wikidata/qualified.py +49 -0
- nomenklatura/wikidata/query.py +66 -0
- nomenklatura/wikidata/value.py +87 -0
- nomenklatura/xref.py +125 -0
- nomenklatura_mpt-4.1.9.dist-info/METADATA +159 -0
- nomenklatura_mpt-4.1.9.dist-info/RECORD +118 -0
- nomenklatura_mpt-4.1.9.dist-info/WHEEL +4 -0
- nomenklatura_mpt-4.1.9.dist-info/entry_points.txt +3 -0
- nomenklatura_mpt-4.1.9.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,227 @@
|
|
1
|
+
from enum import Enum
|
2
|
+
from pydantic import BaseModel
|
3
|
+
from typing import List, Dict, Optional, Callable, Union, cast
|
4
|
+
from followthemoney.proxy import E, EntityProxy
|
5
|
+
|
6
|
+
from nomenklatura.matching.util import make_github_url, FNUL
|
7
|
+
|
8
|
+
Encoded = List[float]
|
9
|
+
CompareFunction = Callable[[EntityProxy, EntityProxy], float]
|
10
|
+
FeatureCompareFunction = Callable[[EntityProxy, EntityProxy], "FtResult"]
|
11
|
+
FeatureCompareConfigured = Callable[
|
12
|
+
[EntityProxy, EntityProxy, "ScoringConfig"], "FtResult"
|
13
|
+
]
|
14
|
+
|
15
|
+
|
16
|
+
class FeatureDoc(BaseModel):
    """Documentation for a particular feature in the matching API model."""

    # Human-readable explanation; may be None but has no default value here.
    description: Optional[str]
    # Weight reported for the feature (``Feature.doc`` passes ``Feature.weight``).
    coefficient: float
    # Link to the feature's source (``Feature.doc`` uses ``make_github_url``).
    url: str
|
22
|
+
|
23
|
+
|
24
|
+
FeatureDocs = Dict[str, FeatureDoc]
|
25
|
+
|
26
|
+
|
27
|
+
class ConfigVarType(str, Enum):
    """The type of a configuration variable."""

    # Members are plain strings as well as enum values (``str`` mix-in).
    INTEGER = "integer"
    FLOAT = "float"
    BOOLEAN = "boolean"
|
33
|
+
|
34
|
+
|
35
|
+
class ConfigVar(BaseModel):
    """A configuration variable for a scoring algorithm."""

    # Declared value type; floats are assumed unless stated otherwise.
    type: ConfigVarType = ConfigVarType.FLOAT
    # Optional explanation of what the variable controls.
    description: Optional[str] = None
    # Value used when a scoring configuration does not set the variable.
    default: Union[str, int, float, bool] = 0
|
41
|
+
|
42
|
+
|
43
|
+
class AlgorithmDocs(BaseModel):
    """Documentation for a scoring algorithm."""

    # Algorithm identifier (``ScoringAlgorithm.get_docs`` passes ``cls.NAME``).
    name: str
    # Free-text description (``get_docs`` passes the class docstring).
    description: Optional[str] = None
    # Configuration variables keyed by variable name.
    config: Dict[str, ConfigVar]
    # Per-feature documentation keyed by feature name.
    features: FeatureDocs
|
50
|
+
|
51
|
+
|
52
|
+
class FtResult(BaseModel):
    """A explained score for a particular feature result."""

    # Optional human-readable explanation of how the score came about.
    detail: Optional[str]
    # Numeric outcome of the feature comparison.
    score: float

    def empty(self) -> bool:
        """Tell whether the result has neither an explanation nor a score."""
        if self.detail is not None:
            return False
        return self.score == FNUL

    @classmethod
    def wrap(cls, func: CompareFunction) -> FeatureCompareFunction:
        """Adapt a float-returning comparator so it returns an ``FtResult``.

        The adapter takes over the name and docstring of ``func``; results it
        produces carry no detail text.
        """

        def wrapper(query: E, result: E) -> "FtResult":
            value = func(query, result)
            return cls(score=value, detail=None)

        wrapper.__doc__ = func.__doc__
        wrapper.__name__ = func.__name__
        return wrapper

    @classmethod
    def unwrap(cls, func: FeatureCompareConfigured) -> CompareFunction:
        """Adapt a configured feature comparator so it returns a bare score.

        The comparator is always called with the default scoring
        configuration, which is created once when the adapter is built.
        """
        default_config = ScoringConfig.defaults()

        def wrapper(query: E, result: E) -> float:
            outcome = func(query, result, default_config)
            return outcome.score

        wrapper.__doc__ = func.__doc__
        wrapper.__name__ = func.__name__
        return wrapper

    def __repr__(self) -> str:
        """Render the score and detail in a compact debugging form."""
        return f"<FtR({self.score}, {self.detail!r})>"
|
88
|
+
|
89
|
+
|
90
|
+
class MatchingResult(BaseModel):
    """Score and feature comparison results for matching comparison."""

    # Overall score of the comparison.
    score: float
    # Scores of the retained features, excluding those equal to FNUL.
    features: Dict[str, float]
    # Retained feature results (with detail), keyed by feature name.
    explanations: Dict[str, FtResult]

    @classmethod
    def make(cls, score: float, explanations: Dict[str, FtResult]) -> "MatchingResult":
        """Build a result, dropping empty feature results.

        Args:
            score: the overall score for the comparison.
            explanations: feature results keyed by feature name.

        Returns:
            A result whose ``explanations`` hold every non-empty feature
            result and whose ``features`` hold the non-FNUL scores among them.
        """
        kept: Dict[str, FtResult] = {}
        feature_scores: Dict[str, float] = {}
        for name, outcome in explanations.items():
            if outcome.empty():
                continue
            kept[name] = outcome
            # A result can be non-empty only because of its detail text; its
            # zero score is then left out of the feature scores.
            if outcome.score != FNUL:
                feature_scores[name] = outcome.score
        return cls(score=score, features=feature_scores, explanations=kept)

    def __repr__(self) -> str:
        """Render the score and explanations in a compact debugging form."""
        return f"<MR({self.score}, expl={self.explanations})>"
|
107
|
+
|
108
|
+
|
109
|
+
class ScoringConfig(BaseModel):
    """Configuration for a scoring algorithm."""

    # Per-feature weight overrides, keyed by feature name.
    weights: Dict[str, float]
    # Values of configuration variables, keyed by variable name.
    config: Dict[str, Union[str, int, float, bool]]

    @classmethod
    def defaults(cls) -> "ScoringConfig":
        """Create a configuration with no weight overrides and no variables."""
        return cls(weights={}, config={})

    def get_float(self, key: str) -> float:
        """Look up a configuration variable and convert it to ``float``.

        Raises:
            KeyError: if ``key`` is not present in ``config``.
        """
        raw = self.config[key]
        return float(raw)
|
123
|
+
|
124
|
+
|
125
|
+
class ScoringAlgorithm(object):
    """An implementation of a scoring system that compares two entities."""

    # Identifier reported in the algorithm documentation.
    NAME = "algorithm_name"
    # Configuration variables understood by the algorithm, keyed by name.
    CONFIG: Dict[str, ConfigVar] = {}

    @classmethod
    def compare(cls, query: E, result: E, config: ScoringConfig) -> MatchingResult:
        """Score ``query`` against ``result``; must be provided by subclasses."""
        raise NotImplementedError

    @classmethod
    def get_feature_docs(cls) -> FeatureDocs:
        """Describe the features and coefficients; must be provided by subclasses."""
        raise NotImplementedError

    @classmethod
    def get_docs(cls) -> AlgorithmDocs:
        """Assemble the documentation for the algorithm and its features."""
        feature_docs = cls.get_feature_docs()
        return AlgorithmDocs(
            name=cls.NAME,
            description=cls.__doc__,
            config=cls.CONFIG,
            features=feature_docs,
        )

    @classmethod
    def default_config(cls) -> ScoringConfig:
        """Provide the configuration used when the caller supplies none."""
        return ScoringConfig.defaults()
|
155
|
+
|
156
|
+
|
157
|
+
class Feature(BaseModel):
    # A weighted comparison function used by a heuristic scoring algorithm.

    # Comparator taking (query, result) or (query, result, config).
    func: Union[FeatureCompareFunction, FeatureCompareConfigured]
    # Default weight; callers can override it per feature name via config.
    weight: float
    qualifier: bool = False

    @property
    def name(self) -> str:
        # The feature is identified by the name of its comparator function.
        return self.func.__name__

    @property
    def doc(self) -> FeatureDoc:
        # Comparators must carry a docstring, it becomes the description.
        text = self.func.__doc__
        assert text is not None, self.func.__name__
        source_url = make_github_url(self.func)
        return FeatureDoc(description=text, coefficient=self.weight, url=source_url)

    def invoke(self, query: E, result: E, config: ScoringConfig) -> FtResult:
        """Call the comparator, passing ``config`` only if it accepts it."""
        # A comparator declaring exactly three positional parameters is
        # treated as the configured variant.
        accepts_config = self.func.__code__.co_argcount == 3
        if not accepts_config:
            plain = cast(FeatureCompareFunction, self.func)
            return plain(query, result)
        configured = cast(FeatureCompareConfigured, self.func)
        return configured(query, result, config)
|
184
|
+
|
185
|
+
|
186
|
+
class HeuristicAlgorithm(ScoringAlgorithm):
    # Scoring algorithm driven by a list of weighted features; subclasses
    # supply ``features`` and ``compute_score``.
    features: List[Feature]

    @classmethod
    def compute_score(
        cls, scores: Dict[str, float], weights: Dict[str, float]
    ) -> float:
        # Combine per-feature scores and weights; provided by subclasses.
        raise NotImplementedError

    @classmethod
    def get_feature_docs(cls) -> FeatureDocs:
        # Collect the documentation of every feature, keyed by feature name.
        docs: FeatureDocs = {}
        for feature in cls.features:
            docs[feature.name] = feature.doc
        return docs

    @classmethod
    def default_config(cls) -> ScoringConfig:
        """Build a configuration holding the default of every CONFIG variable."""
        defaults = ScoringConfig.defaults()
        for key, variable in cls.CONFIG.items():
            defaults.config[key] = variable.default
        return defaults

    @classmethod
    def compare(cls, query: E, result: E, config: ScoringConfig) -> MatchingResult:
        # Schemata that cannot match and are not identical by name get a
        # zero result without running any feature.
        if (
            not query.schema.can_match(result.schema)
            and query.schema.name != result.schema.name
        ):
            return MatchingResult.make(FNUL, {})

        # Fill in defaults for unset variables. NOTE: this writes into the
        # ``config`` object passed in by the caller.
        for key, variable in cls.CONFIG.items():
            if config.config.get(key) is None:
                config.config[key] = variable.default

        explanations: Dict[str, FtResult] = {}
        scores: Dict[str, float] = {}
        weights: Dict[str, float] = {}
        for feature in cls.features:
            name = feature.name
            weight = config.weights.get(name, feature.weight)
            weights[name] = weight
            # Features weighted at zero are recorded but never invoked.
            if weight != FNUL:
                outcome = feature.invoke(query, result, config)
                explanations[name] = outcome
                scores[name] = outcome.score

        raw_score = cls.compute_score(scores, weights)
        # Clamp the combined score into the range [FNUL, 1.0].
        bounded = min(1.0, max(FNUL, raw_score))
        return MatchingResult.make(score=bounded, explanations=explanations)
|
@@ -0,0 +1,62 @@
|
|
1
|
+
from pathlib import Path
|
2
|
+
from itertools import product
|
3
|
+
from typing import List, Set, TypeVar, Tuple, Iterable, Optional, Callable, Any
|
4
|
+
from followthemoney.proxy import E
|
5
|
+
from followthemoney.types.common import PropertyType
|
6
|
+
|
7
|
+
from nomenklatura import __version__
|
8
|
+
from nomenklatura.util import DATA_PATH
|
9
|
+
|
10
|
+
V = TypeVar("V")
|
11
|
+
BASE_URL = "https://github.com/opensanctions/nomenklatura/blob/%s/nomenklatura/%s#L%s"
|
12
|
+
CODE_PATH = DATA_PATH.joinpath("..").resolve()
|
13
|
+
FNUL = 0.0
|
14
|
+
|
15
|
+
|
16
|
+
def has_schema(left: E, right: E, schema: str) -> bool:
    """Tell whether at least one of the two entities is of the given schema."""
    return bool(left.schema.is_a(schema) or right.schema.is_a(schema))
|
21
|
+
|
22
|
+
|
23
|
+
def props_pair(left: E, right: E, props: List[str]) -> Tuple[Set[str], Set[str]]:
    """Collect the values of the named properties from both entities.

    Returns:
        Two sets: all values found on ``left`` and all values found on
        ``right`` across ``props``. Properties are read with ``quiet=True``.
    """
    from_left: Set[str] = set()
    from_right: Set[str] = set()
    for name in props:
        from_left |= set(left.get(name, quiet=True))
        from_right |= set(right.get(name, quiet=True))
    return from_left, from_right
|
30
|
+
|
31
|
+
|
32
|
+
def type_pair(left: E, right: E, type_: PropertyType) -> Tuple[List[str], List[str]]:
    """Fetch the matchable values of one property type from both entities."""
    from_left = left.get_type_values(type_, matchable=True)
    from_right = right.get_type_values(type_, matchable=True)
    return (from_left, from_right)
|
36
|
+
|
37
|
+
|
38
|
+
def max_in_sets(
    left: Iterable[Optional[V]],
    right: Iterable[Optional[V]],
    compare_func: Callable[[V, V], float],
    max_res: float = 1.0,
) -> float:
    """Score every left/right combination and keep the best score.

    Args:
        left: candidate values from one side; ``None`` entries are skipped.
        right: candidate values from the other side; ``None`` entries are skipped.
        compare_func: scores one pair of values.
        max_res: as soon as the best score reaches this value the search stops.

    Returns:
        The highest score seen, or 0.0 if no pair scored above zero.
    """
    best: float = 0.0
    for lhs, rhs in product(left, right):
        if lhs is None or rhs is None:
            continue
        score = compare_func(lhs, rhs)
        # Written as a negated "<=" on purpose, to keep the exact comparison
        # semantics for unordered values such as NaN.
        if not (score <= best):
            best = score
            if best >= max_res:
                break
    return best
|
56
|
+
|
57
|
+
|
58
|
+
def make_github_url(func: Callable[..., Any]) -> str:
    """Build the URL of a matching function's source code on GitHub.

    The URL is derived from the function's code object: its file path,
    taken relative to ``CODE_PATH``, and its first line number, combined
    with the package version through ``BASE_URL``.
    """
    code = func.__code__
    relative_path = Path(code.co_filename).relative_to(CODE_PATH)
    return BASE_URL % (__version__, relative_path, code.co_firstlineno)
|
File without changes
|
@@ -0,0 +1,49 @@
|
|
1
|
+
from functools import cache
|
2
|
+
from typing import Iterable, List, Tuple
|
3
|
+
from followthemoney import registry, SE
|
4
|
+
|
5
|
+
|
6
|
+
PROV_MIN_DATES = ("createdAt", "authoredAt", "publishedAt")
|
7
|
+
PROV_MAX_DATES = ("modifiedAt", "retrievedAt")
|
8
|
+
|
9
|
+
|
10
|
+
def simplify_dates(entity: SE) -> SE:
    """Drop date values that are merely prefixes of more precise ones.

    For example, if a Person has both a birthDate of 1990 and of 1990-05-01,
    the statement carrying 1990 is removed. Properties listed in
    ``PROV_MAX_DATES`` keep only their greatest value and those listed in
    ``PROV_MIN_DATES`` only their smallest. The entity is modified in place
    and returned.
    """
    for prop in entity.iterprops():
        if prop.type == registry.date:
            # Deliberately written out long-hand for speed: this runs a lot
            # during data exports. Don't re-use this code in contexts that
            # are less performance-critical.
            prop_stmts = entity._statements[prop.name]
            if len(prop_stmts) < 2:
                continue
            distinct = tuple({stmt.value for stmt in prop_stmts})
            if len(distinct) < 2:
                continue
            keep = remove_prefix_date_values(distinct)
            if prop.name in PROV_MAX_DATES:
                keep = (max(keep),)
            elif prop.name in PROV_MIN_DATES:
                keep = (min(keep),)

            # Iterate over a copy, the underlying collection shrinks as we go.
            for stmt in list(prop_stmts):
                if stmt.value not in keep:
                    entity._statements[prop.name].remove(stmt)
    return entity
|
36
|
+
|
37
|
+
|
38
|
+
@cache
def remove_prefix_date_values(values: Tuple[str, ...]) -> Iterable[str]:
    """Drop every value that is a prefix of another, more precise value.

    Helper for ``simplify_dates``. Results are memoised per input tuple, so
    they are returned as an immutable tuple: a caller can then never alter
    the cached result that later calls with the same input receive.

    Args:
        values: date strings (hashable, because they form the cache key).

    Returns:
        The retained values in descending lexical order.
    """
    ordered = sorted(values, reverse=True)
    kept: List[str] = []
    for index, value in enumerate(ordered):
        # In descending order a value that is a prefix of any other value is
        # also a prefix of its direct predecessor, so one look back suffices.
        if index > 0 and ordered[index - 1].startswith(value):
            continue
        kept.append(value)
    return tuple(kept)
|
@@ -0,0 +1,32 @@
|
|
1
|
+
from followthemoney import SE
|
2
|
+
from nomenklatura.resolver import Identifier
|
3
|
+
|
4
|
+
|
5
|
+
def simplify_undirected(entity: SE) -> SE:
    """Untangle undirected edges that list the same two entities on both ends.

    Only applies to undirected edge schemata when exactly two entity IDs are
    shared between the source and target properties: the greater identifier
    is then kept as the source only and the lesser as the target only. The
    entity is modified in place and returned.
    """
    schema = entity.schema
    undirected_edge = schema.edge and not schema.edge_directed
    if not undirected_edge or not schema.edge_source or not schema.edge_target:
        return entity

    source_stmts = entity.get_statements(schema.edge_source)
    target_stmts = entity.get_statements(schema.edge_target)
    shared = {stmt.value for stmt in source_stmts} & {
        stmt.value for stmt in target_stmts
    }
    if len(shared) != 2:
        return entity

    identifiers = [Identifier.get(value) for value in shared]
    source_id = max(identifiers)
    target_id = min(identifiers)
    # Remove the target's ID from the source end and vice versa.
    for stmt in source_stmts:
        if stmt.value == target_id:
            entity._statements[entity.schema.edge_source].remove(stmt)
    for stmt in target_stmts:
        if stmt.value == source_id:
            entity._statements[entity.schema.edge_target].remove(stmt)
    return entity
|
nomenklatura/py.typed
ADDED
File without changes
|
@@ -0,0 +1,6 @@
|
|
1
|
+
from nomenklatura.resolver.identifier import Identifier, StrIdent, Pair
|
2
|
+
from nomenklatura.resolver.edge import Edge
|
3
|
+
from nomenklatura.resolver.linker import Linker
|
4
|
+
from nomenklatura.resolver.resolver import Resolver
|
5
|
+
|
6
|
+
__all__ = ["Identifier", "StrIdent", "Pair", "Edge", "Linker", "Resolver"]
|
@@ -0,0 +1,107 @@
|
|
1
|
+
import json
|
2
|
+
from typing import Any, Dict, Optional, Union
|
3
|
+
|
4
|
+
from sqlalchemy.engine import RowMapping
|
5
|
+
|
6
|
+
from nomenklatura.judgement import Judgement
|
7
|
+
from nomenklatura.resolver.identifier import Identifier, StrIdent
|
8
|
+
|
9
|
+
|
10
|
+
class Edge(object):
    """A judgement recorded between a pair of identifiers.

    The two identifiers are normalised through ``Identifier.pair``; the first
    element of that pair is stored as ``target`` and the second as ``source``.
    """

    __slots__ = (
        "key",
        "source",
        "target",
        "judgement",
        "score",
        "user",
        "created_at",
        "deleted_at",
    )

    def __init__(
        self,
        left_id: StrIdent,
        right_id: StrIdent,
        judgement: Judgement = Judgement.NO_JUDGEMENT,
        score: Optional[float] = None,
        user: Optional[str] = None,
        created_at: Optional[str] = None,
        deleted_at: Optional[str] = None,
    ):
        pair = Identifier.pair(left_id, right_id)
        self.key = pair
        self.target, self.source = pair
        self.judgement = judgement
        self.score = score
        self.user = user
        self.created_at = created_at
        self.deleted_at = deleted_at

    def other(self, cur: Identifier) -> Identifier:
        """Return the opposite end of the edge as seen from ``cur``."""
        return self.source if cur == self.target else self.target

    def to_dict(self) -> Dict[str, Any]:
        """Serialise the edge, deletion timestamp included, into a dict."""
        return dict(
            target=self.target.id,
            source=self.source.id,
            judgement=self.judgement.value,
            score=self.score,
            user=self.user,
            created_at=self.created_at,
            deleted_at=self.deleted_at,
        )

    def to_line(self) -> str:
        """Serialise the edge as one JSON array line.

        The array holds target, source, judgement, score, user and
        created_at; ``deleted_at`` is not written.
        """
        fields = [
            self.target.id,
            self.source.id,
            self.judgement.value,
            self.score,
            self.user,
            self.created_at,
        ]
        return f"{json.dumps(fields)}\n"

    def __str__(self) -> str:
        return self.to_line()

    def __hash__(self) -> int:
        return hash(self.key)

    def __eq__(self, other: Any) -> bool:
        # Equality is defined purely through the hash values.
        return hash(self) == hash(other)

    def __lt__(self, other: Any) -> bool:
        return bool(self.key < other.key)

    def __repr__(self) -> str:
        return f"<E({self.target.id}, {self.source.id}, {self.judgement.value})>"

    @classmethod
    def from_line(cls, line: str) -> "Edge":
        """Parse an edge from a JSON array line as produced by ``to_line``.

        A seventh array element, when present, is read as ``deleted_at``.
        """
        fields = json.loads(line)
        parsed = cls(
            fields[0],
            fields[1],
            judgement=Judgement(fields[2]),
            score=fields[3],
            user=fields[4],
            created_at=fields[5],
        )
        if len(fields) > 6:
            parsed.deleted_at = fields[6]
        return parsed

    @classmethod
    def from_dict(cls, data: Union[RowMapping, Dict[str, Any]]) -> "Edge":
        """Build an edge from a mapping with the keys used by ``to_dict``.

        ``created_at`` and ``deleted_at`` may be absent from the mapping.
        """
        judgement = Judgement(data["judgement"])
        return cls(
            left_id=data["target"],
            right_id=data["source"],
            judgement=judgement,
            score=data["score"],
            user=data["user"],
            created_at=data.get("created_at"),
            deleted_at=data.get("deleted_at"),
        )
|
@@ -0,0 +1,60 @@
|
|
1
|
+
import shortuuid
|
2
|
+
from typing import Any, Optional, Tuple, Union
|
3
|
+
from rigour.ids.wikidata import is_qid
|
4
|
+
|
5
|
+
from nomenklatura.resolver.common import ResolverLogicError
|
6
|
+
|
7
|
+
StrIdent = Union[str, "Identifier"]
|
8
|
+
Pair = Tuple["Identifier", "Identifier"]
|
9
|
+
|
10
|
+
|
11
|
+
class Identifier(object):
    """An entity ID with a weight that ranks it against other IDs.

    IDs starting with ``PREFIX`` weigh 2, IDs accepted by ``is_qid`` weigh 3
    and all others weigh 1. Any ID weighing more than 1 counts as canonical.
    """

    PREFIX = "NK-"

    __slots__ = ("id", "canonical", "weight")

    def __init__(self, id: str):
        self.id = id
        if id.startswith(self.PREFIX):
            weight = 2
        elif is_qid(id):
            weight = 3
        else:
            weight = 1
        self.weight: int = weight
        self.canonical = weight > 1

    def __eq__(self, other: Any) -> bool:
        # Compares against the string form, so plain strings can match too.
        return self.id == str(other)

    def __lt__(self, other: Any) -> bool:
        # Order by weight first and by the ID text second.
        mine = (self.weight, self.id)
        theirs = (other.weight, other.id)
        return mine < theirs

    def __str__(self) -> str:
        return self.id

    def __hash__(self) -> int:
        return hash(self.id)

    def __len__(self) -> int:
        return len(self.id)

    def __repr__(self) -> str:
        return f"<I({self.id})>"

    @classmethod
    def get(cls, id: StrIdent) -> "Identifier":
        """Coerce a string into an identifier; other values pass through as-is."""
        if not isinstance(id, str):
            return id
        return cls(id)

    @classmethod
    def pair(cls, left_id: StrIdent, right_id: StrIdent) -> Pair:
        """Order two distinct identifiers as (greater, lesser).

        Raises:
            ResolverLogicError: if both arguments denote the same identifier.
        """
        left = cls.get(left_id)
        right = cls.get(right_id)
        if left == right:
            raise ResolverLogicError("%s/%s" % (left, right))
        greater = max(left, right)
        lesser = min(left, right)
        return (greater, lesser)

    @classmethod
    def make(cls, value: Optional[str] = None) -> "Identifier":
        """Create a ``PREFIX``-ed identifier from ``value`` or a fresh short UUID."""
        suffix = value or shortuuid.uuid()
        return cls.get(f"{cls.PREFIX}{suffix}")
|
@@ -0,0 +1,101 @@
|
|
1
|
+
from functools import lru_cache
|
2
|
+
from typing import Dict, Generator, Generic, Optional, Set
|
3
|
+
from followthemoney import registry, ValueEntity, Statement, SE
|
4
|
+
|
5
|
+
from nomenklatura.resolver.identifier import Identifier, StrIdent
|
6
|
+
|
7
|
+
|
8
|
+
class Linker(Generic[SE]):
    """A class to manage the canonicalisation of entities. This stores only the positive
    merges of entities and is used as a lightweight way to apply the harmonisation
    post de-duplication."""

    def __init__(
        self, entities: Optional[Dict[Identifier, Set[Identifier]]] = None
    ) -> None:
        """
        Args:
            entities: an entry for each entity with its connected set of entities.
                When omitted, the linker starts out with an empty mapping of
                its own.
        """
        # A literal ``{}`` default is created once and would be shared by every
        # linker built without arguments, hence the ``None`` marker.
        self._entities: Dict[Identifier, Set[Identifier]] = (
            {} if entities is None else entities
        )

    def connected(self, node: Identifier) -> Set[Identifier]:
        """Return the set of identifiers connected to ``node``.

        An unknown node is reported as connected only to itself.
        """
        return self._entities.get(node, set([node]))

    # NOTE: the cache belongs to the function, not to one linker: it is shared
    # by all instances (keyed on the instance and the ID) and holds up to 1024
    # results. Cached answers are not refreshed if ``_entities`` changes later.
    @lru_cache(maxsize=1024)
    def get_canonical(self, entity_id: StrIdent) -> str:
        """Return the canonical identifier for the given entity ID.

        This is the greatest identifier connected to the entity if that one
        is canonical; otherwise the entity's own ID is returned.
        """
        node = Identifier.get(entity_id)
        best = max(self.connected(node))
        if best.canonical:
            return best.id
        return node.id

    def canonicals(self) -> Generator[Identifier, None, None]:
        """Return all the canonical cluster identifiers."""
        for node in self._entities.keys():
            if not node.canonical:
                continue
            # Only yield the node that represents its own cluster.
            canonical = self.get_canonical(node)
            if canonical == node.id:
                yield node

    def get_referents(
        self, canonical_id: StrIdent, canonicals: bool = True
    ) -> Set[str]:
        """Get the identifiers of all other entities connected to a given
        canonical identifier.

        Args:
            canonical_id: the identifier whose connected entities are wanted.
            canonicals: if false, connected canonical identifiers are left out.
        """
        node = Identifier.get(canonical_id)
        referents: Set[str] = set()
        for connected in self.connected(node):
            if not canonicals and connected.canonical:
                continue
            if connected == node:
                continue
            referents.add(connected.id)
        return referents

    def apply(self, proxy: SE) -> SE:
        """Replace all entity references in a given proxy with their canonical
        identifiers. This is essentially the harmonisation post de-dupe."""
        if proxy.id is None:
            return proxy
        proxy.id = self.get_canonical(proxy.id)
        return self.apply_properties(proxy)

    def apply_stream(self, proxy: ValueEntity) -> ValueEntity:
        """Canonicalise the ID and all entity-typed values of a value entity.

        Proxies without an ID are returned unchanged.
        """
        if proxy.id is None:
            return proxy
        proxy.id = self.get_canonical(proxy.id)
        for prop in proxy.iterprops():
            if prop.type == registry.entity:
                # Take the current values out and add their canonical forms back.
                values = proxy.pop(prop)
                for value in values:
                    proxy.unsafe_add(prop, self.get_canonical(value), cleaned=True)
        return proxy

    def apply_properties(self, proxy: SE) -> SE:
        """Canonicalise the statements of a proxy in place.

        Each statement gets the proxy's ID as its canonical ID (if the proxy
        has one), and entity-typed values are replaced by their canonical ID.
        """
        for stmt in proxy._iter_stmt():
            if proxy.id is not None:
                stmt.canonical_id = proxy.id
            if stmt.prop_type == registry.entity.name:
                canon_value = self.get_canonical(stmt.value)
                if canon_value != stmt.value:
                    # Remember the first pre-rewrite value only.
                    if stmt.original_value is None:
                        stmt.original_value = stmt.value
                    # NOTE: this means the key is out of whack here now
                    stmt.value = canon_value
        return proxy

    def apply_statement(self, stmt: Statement) -> Statement:
        """Canonicalise a single statement in place and return it.

        The canonical ID is derived from the statement's entity ID, and an
        entity-typed value is replaced by its canonical ID.
        """
        if stmt.entity_id is not None:
            stmt.canonical_id = self.get_canonical(stmt.entity_id)
        if stmt.prop_type == registry.entity.name:
            canon_value = self.get_canonical(stmt.value)
            if canon_value != stmt.value:
                # Remember the first pre-rewrite value only.
                if stmt.original_value is None:
                    stmt.original_value = stmt.value
                # NOTE: this means the key is out of whack here now
                stmt.value = canon_value
        return stmt

    def __repr__(self) -> str:
        return f"<Merger({len(self._entities)})>"
|