nomenklatura-mpt 4.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118) hide show
  1. nomenklatura/__init__.py +11 -0
  2. nomenklatura/cache.py +194 -0
  3. nomenklatura/cli.py +260 -0
  4. nomenklatura/conflicting_match.py +80 -0
  5. nomenklatura/data/er-unstable.pkl +0 -0
  6. nomenklatura/data/regression-v1.pkl +0 -0
  7. nomenklatura/db.py +139 -0
  8. nomenklatura/delta.py +4 -0
  9. nomenklatura/enrich/__init__.py +94 -0
  10. nomenklatura/enrich/aleph.py +141 -0
  11. nomenklatura/enrich/common.py +219 -0
  12. nomenklatura/enrich/nominatim.py +72 -0
  13. nomenklatura/enrich/opencorporates.py +233 -0
  14. nomenklatura/enrich/openfigi.py +124 -0
  15. nomenklatura/enrich/permid.py +201 -0
  16. nomenklatura/enrich/wikidata.py +268 -0
  17. nomenklatura/enrich/yente.py +116 -0
  18. nomenklatura/exceptions.py +9 -0
  19. nomenklatura/index/__init__.py +5 -0
  20. nomenklatura/index/common.py +24 -0
  21. nomenklatura/index/entry.py +89 -0
  22. nomenklatura/index/index.py +170 -0
  23. nomenklatura/index/tokenizer.py +92 -0
  24. nomenklatura/judgement.py +21 -0
  25. nomenklatura/kv.py +40 -0
  26. nomenklatura/matching/__init__.py +47 -0
  27. nomenklatura/matching/bench.py +32 -0
  28. nomenklatura/matching/compare/__init__.py +0 -0
  29. nomenklatura/matching/compare/addresses.py +71 -0
  30. nomenklatura/matching/compare/countries.py +15 -0
  31. nomenklatura/matching/compare/dates.py +83 -0
  32. nomenklatura/matching/compare/gender.py +15 -0
  33. nomenklatura/matching/compare/identifiers.py +30 -0
  34. nomenklatura/matching/compare/names.py +157 -0
  35. nomenklatura/matching/compare/util.py +51 -0
  36. nomenklatura/matching/compat.py +66 -0
  37. nomenklatura/matching/erun/__init__.py +0 -0
  38. nomenklatura/matching/erun/countries.py +42 -0
  39. nomenklatura/matching/erun/identifiers.py +64 -0
  40. nomenklatura/matching/erun/misc.py +71 -0
  41. nomenklatura/matching/erun/model.py +110 -0
  42. nomenklatura/matching/erun/names.py +126 -0
  43. nomenklatura/matching/erun/train.py +135 -0
  44. nomenklatura/matching/erun/util.py +28 -0
  45. nomenklatura/matching/logic_v1/__init__.py +0 -0
  46. nomenklatura/matching/logic_v1/identifiers.py +104 -0
  47. nomenklatura/matching/logic_v1/model.py +76 -0
  48. nomenklatura/matching/logic_v1/multi.py +21 -0
  49. nomenklatura/matching/logic_v1/phonetic.py +142 -0
  50. nomenklatura/matching/logic_v2/__init__.py +0 -0
  51. nomenklatura/matching/logic_v2/identifiers.py +124 -0
  52. nomenklatura/matching/logic_v2/model.py +98 -0
  53. nomenklatura/matching/logic_v2/names/__init__.py +3 -0
  54. nomenklatura/matching/logic_v2/names/analysis.py +51 -0
  55. nomenklatura/matching/logic_v2/names/distance.py +181 -0
  56. nomenklatura/matching/logic_v2/names/magic.py +60 -0
  57. nomenklatura/matching/logic_v2/names/match.py +195 -0
  58. nomenklatura/matching/logic_v2/names/pairing.py +81 -0
  59. nomenklatura/matching/logic_v2/names/util.py +89 -0
  60. nomenklatura/matching/name_based/__init__.py +4 -0
  61. nomenklatura/matching/name_based/misc.py +86 -0
  62. nomenklatura/matching/name_based/model.py +59 -0
  63. nomenklatura/matching/name_based/names.py +59 -0
  64. nomenklatura/matching/pairs.py +42 -0
  65. nomenklatura/matching/regression_v1/__init__.py +0 -0
  66. nomenklatura/matching/regression_v1/misc.py +75 -0
  67. nomenklatura/matching/regression_v1/model.py +110 -0
  68. nomenklatura/matching/regression_v1/names.py +63 -0
  69. nomenklatura/matching/regression_v1/train.py +87 -0
  70. nomenklatura/matching/regression_v1/util.py +31 -0
  71. nomenklatura/matching/svm_v1/__init__.py +5 -0
  72. nomenklatura/matching/svm_v1/misc.py +94 -0
  73. nomenklatura/matching/svm_v1/model.py +168 -0
  74. nomenklatura/matching/svm_v1/names.py +81 -0
  75. nomenklatura/matching/svm_v1/train.py +186 -0
  76. nomenklatura/matching/svm_v1/util.py +30 -0
  77. nomenklatura/matching/types.py +227 -0
  78. nomenklatura/matching/util.py +62 -0
  79. nomenklatura/publish/__init__.py +0 -0
  80. nomenklatura/publish/dates.py +49 -0
  81. nomenklatura/publish/edges.py +32 -0
  82. nomenklatura/py.typed +0 -0
  83. nomenklatura/resolver/__init__.py +6 -0
  84. nomenklatura/resolver/common.py +2 -0
  85. nomenklatura/resolver/edge.py +107 -0
  86. nomenklatura/resolver/identifier.py +60 -0
  87. nomenklatura/resolver/linker.py +101 -0
  88. nomenklatura/resolver/resolver.py +565 -0
  89. nomenklatura/settings.py +17 -0
  90. nomenklatura/store/__init__.py +41 -0
  91. nomenklatura/store/base.py +130 -0
  92. nomenklatura/store/level.py +272 -0
  93. nomenklatura/store/memory.py +102 -0
  94. nomenklatura/store/redis_.py +131 -0
  95. nomenklatura/store/sql.py +219 -0
  96. nomenklatura/store/util.py +48 -0
  97. nomenklatura/store/versioned.py +371 -0
  98. nomenklatura/tui/__init__.py +17 -0
  99. nomenklatura/tui/app.py +294 -0
  100. nomenklatura/tui/app.tcss +52 -0
  101. nomenklatura/tui/comparison.py +81 -0
  102. nomenklatura/tui/util.py +35 -0
  103. nomenklatura/util.py +26 -0
  104. nomenklatura/versions.py +119 -0
  105. nomenklatura/wikidata/__init__.py +14 -0
  106. nomenklatura/wikidata/client.py +122 -0
  107. nomenklatura/wikidata/lang.py +94 -0
  108. nomenklatura/wikidata/model.py +139 -0
  109. nomenklatura/wikidata/props.py +70 -0
  110. nomenklatura/wikidata/qualified.py +49 -0
  111. nomenklatura/wikidata/query.py +66 -0
  112. nomenklatura/wikidata/value.py +87 -0
  113. nomenklatura/xref.py +125 -0
  114. nomenklatura_mpt-4.1.9.dist-info/METADATA +159 -0
  115. nomenklatura_mpt-4.1.9.dist-info/RECORD +118 -0
  116. nomenklatura_mpt-4.1.9.dist-info/WHEEL +4 -0
  117. nomenklatura_mpt-4.1.9.dist-info/entry_points.txt +3 -0
  118. nomenklatura_mpt-4.1.9.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,227 @@
1
+ from enum import Enum
2
+ from pydantic import BaseModel
3
+ from typing import List, Dict, Optional, Callable, Union, cast
4
+ from followthemoney.proxy import E, EntityProxy
5
+
6
+ from nomenklatura.matching.util import make_github_url, FNUL
7
+
8
# A flat list of floats. NOTE(review): presumably the numeric feature vector
# computed for an entity pair -- confirm against the model code that uses it.
Encoded = List[float]
# A comparator taking (query, result) entities and returning a bare score.
CompareFunction = Callable[[EntityProxy, EntityProxy], float]
# A comparator returning an explained ``FtResult`` instead of a bare float.
FeatureCompareFunction = Callable[[EntityProxy, EntityProxy], "FtResult"]
# Like ``FeatureCompareFunction``, but it also receives the ``ScoringConfig``
# as a third positional argument.
FeatureCompareConfigured = Callable[
    [EntityProxy, EntityProxy, "ScoringConfig"], "FtResult"
]
14
+
15
+
16
class FeatureDoc(BaseModel):
    """Documentation for a particular feature in the matching API model."""

    # Human-readable explanation; ``Feature.doc`` fills this from the feature
    # function's docstring. No default is declared for this field.
    description: Optional[str]
    # Weight of the feature; ``Feature.doc`` passes the feature's ``weight``.
    coefficient: float
    # Link to the implementing function; ``Feature.doc`` builds it with
    # ``make_github_url``.
    url: str
22
+
23
+
24
# Maps a feature name to the documentation entry of that feature.
FeatureDocs = Dict[str, FeatureDoc]
25
+
26
+
27
class ConfigVarType(str, Enum):
    """The type of a configuration variable."""

    # The ``str`` mixin makes every member compare equal to its string value.
    INTEGER = "integer"
    FLOAT = "float"
    BOOLEAN = "boolean"
33
+
34
+
35
class ConfigVar(BaseModel):
    """A configuration variable for a scoring algorithm."""

    # Declared value type of the variable; floats unless stated otherwise.
    type: ConfigVarType = ConfigVarType.FLOAT
    # Optional human-readable explanation of the variable.
    description: Optional[str] = None
    # Value that ``HeuristicAlgorithm`` copies into a scoring config when the
    # variable has not been set there.
    default: Union[str, int, float, bool] = 0
41
+
42
+
43
class AlgorithmDocs(BaseModel):
    """Documentation for a scoring algorithm."""

    # ``ScoringAlgorithm.get_docs`` fills the fields below from the algorithm
    # class: ``NAME``, the class docstring, ``CONFIG`` and
    # ``get_feature_docs()`` respectively.
    name: str
    description: Optional[str] = None
    config: Dict[str, ConfigVar]
    features: FeatureDocs
50
+
51
+
52
class FtResult(BaseModel):
    """A explained score for a particular feature result."""

    # Optional human-readable explanation attached to the score.
    detail: Optional[str]
    # Numeric outcome of the feature comparison.
    score: float

    def empty(self) -> bool:
        """Tell whether the result has no detail and a null (``FNUL``) score."""
        if self.detail is not None:
            return False
        return self.score == FNUL

    @classmethod
    def wrap(cls, func: CompareFunction) -> FeatureCompareFunction:
        """Adapt a float-returning comparator so it returns a ``FtResult``.

        The produced result never carries a detail. The adapter takes over
        the ``__name__`` and ``__doc__`` of ``func``.
        """

        def wrapper(query: E, result: E) -> "FtResult":
            value = func(query, result)
            return cls(detail=None, score=value)

        wrapper.__doc__ = func.__doc__
        wrapper.__name__ = func.__name__
        return wrapper

    @classmethod
    def unwrap(cls, func: FeatureCompareConfigured) -> CompareFunction:
        """Adapt a configured comparator so it returns only the bare score.

        The default ``ScoringConfig`` is created once, when ``unwrap`` is
        called, and the same object is handed to ``func`` on every invocation.
        The adapter takes over the ``__name__`` and ``__doc__`` of ``func``.
        """
        defaults = ScoringConfig.defaults()

        def wrapper(query: E, result: E) -> float:
            outcome = func(query, result, defaults)
            return outcome.score

        wrapper.__doc__ = func.__doc__
        wrapper.__name__ = func.__name__
        return wrapper

    def __repr__(self) -> str:
        """Render the score and detail in a compact debugging form."""
        return f"<FtR({self.score}, {self.detail!r})>"
88
+
89
+
90
class MatchingResult(BaseModel):
    """Score and feature comparison results for matching comparison."""

    # Overall score of the comparison.
    score: float
    # Per-feature scores; ``make`` leaves out features scoring ``FNUL``.
    features: Dict[str, float]
    # Per-feature explained results; ``make`` leaves out empty results.
    explanations: Dict[str, FtResult]

    @classmethod
    def make(cls, score: float, explanations: Dict[str, FtResult]) -> "MatchingResult":
        """Build a result, dropping empty explanations and null feature scores.

        Args:
            score: the overall score to report.
            explanations: explained results keyed by feature name.
        """
        kept: Dict[str, FtResult] = {}
        feature_scores: Dict[str, float] = {}
        for name, outcome in explanations.items():
            if outcome.empty():
                continue
            kept[name] = outcome
            # A result with a detail but a null score is explained, yet it is
            # not reported as a feature score.
            if outcome.score != FNUL:
                feature_scores[name] = outcome.score
        return cls(score=score, features=feature_scores, explanations=kept)

    def __repr__(self) -> str:
        """Render the score and explanations in a compact debugging form."""
        return f"<MR({self.score}, expl={self.explanations})>"
107
+
108
+
109
class ScoringConfig(BaseModel):
    """Configuration for a scoring algorithm."""

    # Per-feature weight overrides, keyed by feature name.
    weights: Dict[str, float]
    # Values of the algorithm's configuration variables, keyed by name.
    config: Dict[str, Union[str, int, float, bool]]

    @classmethod
    def defaults(cls) -> "ScoringConfig":
        """Create a configuration with no weight overrides and no variables."""
        return cls(config={}, weights={})

    def get_float(self, key: str) -> float:
        """Look up a configuration variable and convert it to ``float``.

        Raises:
            KeyError: if ``key`` is not present in the configuration.
        """
        raw = self.config[key]
        return float(raw)
123
+
124
+
125
class ScoringAlgorithm:
    """An implementation of a scoring system that compares two entities."""

    # Name reported in the documentation produced by ``get_docs``.
    NAME = "algorithm_name"
    # Configuration variables understood by the algorithm, keyed by name.
    CONFIG: Dict[str, ConfigVar] = {}

    @classmethod
    def compare(cls, query: E, result: E, config: ScoringConfig) -> MatchingResult:
        """Score ``result`` against ``query``; must be provided by subclasses."""
        raise NotImplementedError

    @classmethod
    def get_feature_docs(cls) -> FeatureDocs:
        """Describe the features and coefficients; must be provided by subclasses."""
        raise NotImplementedError

    @classmethod
    def get_docs(cls) -> AlgorithmDocs:
        """Assemble the documentation of the algorithm and its features.

        The description is the docstring of the algorithm class itself.
        """
        feature_docs = cls.get_feature_docs()
        return AlgorithmDocs(
            name=cls.NAME,
            description=cls.__doc__,
            config=cls.CONFIG,
            features=feature_docs,
        )

    @classmethod
    def default_config(cls) -> ScoringConfig:
        """Produce the default configuration used by the algorithm."""
        defaults = ScoringConfig.defaults()
        return defaults
155
+
156
+
157
class Feature(BaseModel):
    # A weighted feature function used by ``HeuristicAlgorithm``.

    # The comparator; it takes either (query, result) or
    # (query, result, config), see ``invoke``.
    func: Union[FeatureCompareFunction, FeatureCompareConfigured]
    # Default weight; a scoring config may override it by feature name.
    weight: float
    # NOTE(review): not read anywhere in this module -- presumably consumed by
    # the concrete algorithms; confirm there.
    qualifier: bool = False

    @property
    def name(self) -> str:
        """The feature name, which is the name of the wrapped function."""
        return self.func.__name__

    @property
    def doc(self) -> FeatureDoc:
        """Build the documentation entry from the function's docstring.

        Raises:
            AssertionError: if the function has no docstring.
        """
        text = self.func.__doc__
        assert text is not None, self.func.__name__
        source_url = make_github_url(self.func)
        return FeatureDoc(
            description=text,
            coefficient=self.weight,
            url=source_url,
        )

    def invoke(self, query: E, result: E, config: ScoringConfig) -> FtResult:
        """Call the feature function, passing ``config`` only if it takes it.

        The decision is made on the positional argument count of the
        function's code object: exactly three means a configured comparator.
        """
        if self.func.__code__.co_argcount != 3:
            plain = cast(FeatureCompareFunction, self.func)
            return plain(query, result)
        configured = cast(FeatureCompareConfigured, self.func)
        return configured(query, result, config)
184
+
185
+
186
class HeuristicAlgorithm(ScoringAlgorithm):
    # Scoring algorithm base that evaluates a list of weighted features and
    # delegates the combination of their scores to ``compute_score``.

    # The weighted features; not assigned here, so concrete algorithms are
    # expected to provide the list.
    features: List[Feature]

    @classmethod
    def compute_score(
        cls, scores: Dict[str, float], weights: Dict[str, float]
    ) -> float:
        """Combine per-feature scores and weights; must be provided by subclasses."""
        raise NotImplementedError

    @classmethod
    def get_feature_docs(cls) -> FeatureDocs:
        """Collect the documentation of every feature, keyed by feature name."""
        docs: FeatureDocs = {}
        for feature in cls.features:
            docs[feature.name] = feature.doc
        return docs

    @classmethod
    def default_config(cls) -> ScoringConfig:
        """Produce a configuration holding the default of every ``CONFIG`` variable."""
        settings = ScoringConfig.defaults()
        settings.config.update({key: var.default for key, var in cls.CONFIG.items()})
        return settings

    @classmethod
    def compare(cls, query: E, result: E, config: ScoringConfig) -> MatchingResult:
        """Score ``result`` against ``query`` using the weighted features.

        Entities whose schemata can neither match nor share a name score
        ``FNUL`` without any feature being evaluated. Otherwise every feature
        with a non-null effective weight is invoked and the combined score is
        clamped to the range ``FNUL`` .. 1.0.

        Note that ``config`` is modified in place: configuration variables
        that are missing (or ``None``) are filled with their defaults.
        """
        if (
            not query.schema.can_match(result.schema)
            and query.schema.name != result.schema.name
        ):
            return MatchingResult.make(FNUL, {})

        for key, var in cls.CONFIG.items():
            if config.config.get(key) is None:
                config.config[key] = var.default

        explanations: Dict[str, FtResult] = {}
        scores: Dict[str, float] = {}
        weights: Dict[str, float] = {}
        for feature in cls.features:
            fname = feature.name
            # A weight given in the config wins over the feature's own weight.
            weight = config.weights.get(fname, feature.weight)
            weights[fname] = weight
            if weight == FNUL:
                # Null-weighted features are recorded in ``weights`` only.
                continue
            outcome = feature.invoke(query, result, config)
            explanations[fname] = outcome
            scores[fname] = outcome.score
        total = cls.compute_score(scores, weights)
        clamped = min(1.0, max(FNUL, total))
        return MatchingResult.make(score=clamped, explanations=explanations)
@@ -0,0 +1,62 @@
1
+ from pathlib import Path
2
+ from itertools import product
3
+ from typing import List, Set, TypeVar, Tuple, Iterable, Optional, Callable, Any
4
+ from followthemoney.proxy import E
5
+ from followthemoney.types.common import PropertyType
6
+
7
+ from nomenklatura import __version__
8
+ from nomenklatura.util import DATA_PATH
9
+
10
# Generic type of the values compared pair-wise by ``max_in_sets``.
V = TypeVar("V")
# Template for source links; ``make_github_url`` fills in the package version,
# the file path and the line number, in that order.
BASE_URL = "https://github.com/opensanctions/nomenklatura/blob/%s/nomenklatura/%s#L%s"
# Resolved parent directory of ``DATA_PATH``; ``make_github_url`` expresses the
# file of a function relative to this directory.
CODE_PATH = DATA_PATH.joinpath("..").resolve()
# The null score: also the starting value of ``max_in_sets``.
FNUL = 0.0
14
+
15
+
16
def has_schema(left: E, right: E, schema: str) -> bool:
    """Tell whether at least one of the two entities is of the given schema."""
    for entity in (left, right):
        if entity.schema.is_a(schema):
            return True
    return False
21
+
22
+
23
def props_pair(left: E, right: E, props: List[str]) -> Tuple[Set[str], Set[str]]:
    """Gather the values of the named properties from both entities.

    Returns a pair of sets: the combined values found on ``left`` and the
    combined values found on ``right``. Properties are looked up quietly.
    """
    lefts: Set[str] = set()
    rights: Set[str] = set()
    for name in props:
        lefts |= set(left.get(name, quiet=True))
        rights |= set(right.get(name, quiet=True))
    return lefts, rights
30
+
31
+
32
def type_pair(left: E, right: E, type_: PropertyType) -> Tuple[List[str], List[str]]:
    """Fetch the matchable values of a property type from both entities."""
    return (
        left.get_type_values(type_, matchable=True),
        right.get_type_values(type_, matchable=True),
    )
36
+
37
+
38
def max_in_sets(
    left: Iterable[Optional[V]],
    right: Iterable[Optional[V]],
    compare_func: Callable[[V, V], float],
    max_res: float = 1.0,
) -> float:
    """Score every left/right combination and report the best score seen.

    Pairs in which either side is ``None`` are skipped. The search stops as
    soon as a score reaches ``max_res``. Without any scored pair, or when no
    score exceeds it, the result is 0.0.
    """
    best: float = 0.0
    for lval, rval in product(left, right):
        if lval is not None and rval is not None:
            score = compare_func(lval, rval)
            # Spelled as a negated ``<=`` to keep the exact comparison
            # semantics of the skip condition (relevant for NaN scores).
            if not score <= best:
                best = score
                if best >= max_res:
                    break
    return best
56
+
57
+
58
def make_github_url(func: Callable[..., Any]) -> str:
    """Build the link to the source of a matching function.

    The link is made from the package version, the path of the function's
    file relative to ``CODE_PATH`` and the first line of the function.
    """
    code = func.__code__
    relative_path = Path(code.co_filename).relative_to(CODE_PATH)
    return BASE_URL % (__version__, relative_path, code.co_firstlineno)
File without changes
@@ -0,0 +1,49 @@
1
+ from functools import cache
2
+ from typing import Iterable, List, Tuple
3
+ from followthemoney import registry, SE
4
+
5
+
6
# Date properties for which ``simplify_dates`` keeps only the smallest value.
PROV_MIN_DATES = ("createdAt", "authoredAt", "publishedAt")
# Date properties for which ``simplify_dates`` keeps only the largest value.
PROV_MAX_DATES = ("modifiedAt", "retrievedAt")
8
+
9
+
10
def simplify_dates(entity: SE) -> SE:
    """Drop redundant values from the date properties of an entity.

    A date value that is a prefix of another value of the same property is
    removed: with a birthDate of both 1990 and 1990-05-01, only 1990-05-01
    stays. Properties listed in ``PROV_MAX_DATES`` / ``PROV_MIN_DATES`` are
    further reduced to their largest / smallest remaining value. The entity
    is modified in place and returned.
    """
    for prop in entity.iterprops():
        if not prop.type == registry.date:
            continue
        # Deliberately written out flat and operating on the statement
        # container directly: this runs very often during data exports. Do
        # not copy this approach into code that is less performance critical.
        stmts = entity._statements[prop.name]
        if len(stmts) < 2:
            continue
        distinct = tuple({stmt.value for stmt in stmts})
        if len(distinct) < 2:
            continue
        keep = remove_prefix_date_values(distinct)
        if prop.name in PROV_MAX_DATES:
            keep = (max(keep),)
        elif prop.name in PROV_MIN_DATES:
            keep = (min(keep),)

        # Iterate over a copy, since statements are removed along the way.
        for stmt in list(stmts):
            if stmt.value not in keep:
                entity._statements[prop.name].remove(stmt)
    return entity
36
+
37
+
38
@cache
def remove_prefix_date_values(values: Tuple[str, ...]) -> Iterable[str]:
    """Drop every value that is a prefix of another value. See ``simplify_dates``.

    Args:
        values: date strings, given as a tuple so that the call can be cached.

    Returns:
        The values that are not a prefix of another value, in descending
        order. The list is cached and therefore shared between calls with the
        same arguments: callers must not modify it.
    """
    # NOTE: the cache is unbounded; every distinct tuple of values seen stays
    # in memory for the lifetime of the process.
    ordered = sorted(values, reverse=True)
    kept: List[str] = []
    for index, value in enumerate(ordered):
        # In descending order, any longer value starting with ``value`` sorts
        # before it, and the direct predecessor is one of them if any exists.
        if index > 0 and ordered[index - 1].startswith(value):
            continue
        kept.append(value)
    return kept
@@ -0,0 +1,32 @@
1
+ from followthemoney import SE
2
+ from nomenklatura.resolver import Identifier
3
+
4
+
5
def simplify_undirected(entity: SE) -> SE:
    """Untangle an undirected edge that names both entities on both ends.

    This only acts on undirected edge schemata when exactly two entity IDs
    appear as source AND as target. The larger identifier is then kept as
    the only source and the smaller one as the only target. The entity is
    modified in place and returned; any other entity is returned untouched.
    """
    schema = entity.schema
    if not schema.edge or schema.edge_directed:
        return entity
    if not schema.edge_source or not schema.edge_target:
        return entity
    sources = entity.get_statements(schema.edge_source)
    targets = entity.get_statements(schema.edge_target)
    shared = {s.value for s in sources} & {t.value for t in targets}
    if len(shared) != 2:
        return entity
    idents = [Identifier.get(value) for value in shared]
    source_id, target_id = max(idents), min(idents)
    # Remove the designated target from the sources, and vice versa.
    for stmt in sources:
        if stmt.value == target_id:
            entity._statements[schema.edge_source].remove(stmt)
    for stmt in targets:
        if stmt.value == source_id:
            entity._statements[schema.edge_target].remove(stmt)
    return entity
nomenklatura/py.typed ADDED
File without changes
@@ -0,0 +1,6 @@
1
+ from nomenklatura.resolver.identifier import Identifier, StrIdent, Pair
2
+ from nomenklatura.resolver.edge import Edge
3
+ from nomenklatura.resolver.linker import Linker
4
+ from nomenklatura.resolver.resolver import Resolver
5
+
6
+ __all__ = ["Identifier", "StrIdent", "Pair", "Edge", "Linker", "Resolver"]
@@ -0,0 +1,2 @@
1
class ResolverLogicError(Exception):
    """Signals a logically invalid resolver operation.

    ``Identifier.pair`` raises it when both sides are the same identifier.
    """
@@ -0,0 +1,107 @@
1
+ import json
2
+ from typing import Any, Dict, Optional, Union
3
+
4
+ from sqlalchemy.engine import RowMapping
5
+
6
+ from nomenklatura.judgement import Judgement
7
+ from nomenklatura.resolver.identifier import Identifier, StrIdent
8
+
9
+
10
class Edge(object):
    """A judgement about whether two identifiers refer to the same thing.

    The two identifiers are ordered by ``Identifier.pair``: ``target`` is the
    larger one, ``source`` the smaller one. The ordered pair is the ``key``
    that determines hashing, equality and ordering of edges.
    """

    __slots__ = (
        "key",
        "source",
        "target",
        "judgement",
        "score",
        "user",
        "created_at",
        "deleted_at",
    )

    def __init__(
        self,
        left_id: StrIdent,
        right_id: StrIdent,
        judgement: Judgement = Judgement.NO_JUDGEMENT,
        score: Optional[float] = None,
        user: Optional[str] = None,
        created_at: Optional[str] = None,
        deleted_at: Optional[str] = None,
    ):
        """Create an edge between two identifiers, given in any order.

        Raises:
            ResolverLogicError: if both identifiers are the same.
        """
        self.key = Identifier.pair(left_id, right_id)
        self.target, self.source = self.key
        self.judgement = judgement
        self.score = score
        self.user = user
        self.created_at = created_at
        self.deleted_at = deleted_at

    def other(self, cur: Identifier) -> Identifier:
        """Return the opposite end of the edge, seen from ``cur``.

        Anything that is not the target yields the target.
        """
        if cur == self.target:
            return self.source
        return self.target

    def to_dict(self) -> Dict[str, Any]:
        """Serialise the edge, including ``deleted_at``, as a dictionary."""
        return {
            "target": self.target.id,
            "source": self.source.id,
            "judgement": self.judgement.value,
            "score": self.score,
            "user": self.user,
            "created_at": self.created_at,
            "deleted_at": self.deleted_at,
        }

    def to_line(self) -> str:
        """Serialise the edge as one line holding a JSON array.

        The array has six items; ``deleted_at`` is not part of it, although
        ``from_line`` accepts it as an optional seventh item.
        """
        row = [
            self.target.id,
            self.source.id,
            self.judgement.value,
            self.score,
            self.user,
            self.created_at,
        ]
        return json.dumps(row) + "\n"

    def __str__(self) -> str:
        return self.to_line()

    def __hash__(self) -> int:
        return hash(self.key)

    def __eq__(self, other: Any) -> bool:
        """Compare by key; judgement, score and timestamps play no role."""
        if isinstance(other, Edge):
            # Compare the keys themselves: two different keys may still
            # produce the same hash value.
            return bool(self.key == other.key)
        # Other operands keep the historical hash-based comparison.
        return hash(self) == hash(other)

    def __lt__(self, other: Any) -> bool:
        return bool(self.key < other.key)

    def __repr__(self) -> str:
        return f"<E({self.target.id}, {self.source.id}, {self.judgement.value})>"

    @classmethod
    def from_line(cls, line: str) -> "Edge":
        """Parse an edge from a JSON array line, as written by ``to_line``.

        A seventh array item, if present, is taken as ``deleted_at``.
        """
        data = json.loads(line)
        edge = cls(
            data[0],
            data[1],
            judgement=Judgement(data[2]),
            score=data[3],
            user=data[4],
            created_at=data[5],
        )
        if len(data) > 6:
            edge.deleted_at = data[6]
        return edge

    @classmethod
    def from_dict(cls, data: Union[RowMapping, Dict[str, Any]]) -> "Edge":
        """Build an edge from a mapping, as produced by ``to_dict``.

        ``created_at`` and ``deleted_at`` are optional; the other keys are
        required.
        """
        return cls(
            left_id=data["target"],
            right_id=data["source"],
            judgement=Judgement(data["judgement"]),
            score=data["score"],
            user=data["user"],
            created_at=data.get("created_at"),
            deleted_at=data.get("deleted_at"),
        )
@@ -0,0 +1,60 @@
1
+ import shortuuid
2
+ from typing import Any, Optional, Tuple, Union
3
+ from rigour.ids.wikidata import is_qid
4
+
5
+ from nomenklatura.resolver.common import ResolverLogicError
6
+
7
# An identifier given either as a plain string or as an ``Identifier``.
StrIdent = Union[str, "Identifier"]
# Two identifiers; ``Identifier.pair`` returns the larger one first.
Pair = Tuple["Identifier", "Identifier"]
9
+
10
+
11
class Identifier(object):
    """An entity ID together with a weight that ranks how canonical it is.

    Plain IDs weigh 1, IDs starting with ``PREFIX`` weigh 2 and Wikidata QIDs
    weigh 3. Every ID weighing more than 1 counts as canonical.
    """

    PREFIX = "NK-"

    __slots__ = ("id", "canonical", "weight")

    def __init__(self, id: str):
        self.id = id
        if id.startswith(self.PREFIX):
            weight = 2
        elif is_qid(id):
            weight = 3
        else:
            weight = 1
        self.weight: int = weight
        self.canonical = weight > 1

    def __eq__(self, other: Any) -> bool:
        # Compares against the string form, so plain strings are accepted.
        return self.id == str(other)

    def __lt__(self, other: Any) -> bool:
        # Order by weight first, by ID second.
        mine = (self.weight, self.id)
        theirs = (other.weight, other.id)
        return mine < theirs

    def __str__(self) -> str:
        return self.id

    def __hash__(self) -> int:
        return hash(self.id)

    def __len__(self) -> int:
        return len(self.id)

    def __repr__(self) -> str:
        return f"<I({self.id})>"

    @classmethod
    def get(cls, id: StrIdent) -> "Identifier":
        """Turn a string into an identifier; pass anything else through."""
        if not isinstance(id, str):
            return id
        return cls(id)

    @classmethod
    def pair(cls, left_id: StrIdent, right_id: StrIdent) -> Pair:
        """Order two identifiers, larger one first.

        Raises:
            ResolverLogicError: if both identifiers are equal.
        """
        left = cls.get(left_id)
        right = cls.get(right_id)
        if left == right:
            raise ResolverLogicError("%s/%s" % (left, right))
        larger = max(left, right)
        smaller = min(left, right)
        return (larger, smaller)

    @classmethod
    def make(cls, value: Optional[str] = None) -> "Identifier":
        """Create a ``PREFIX``-ed identifier from ``value`` or a fresh short UUID."""
        suffix = value if value else shortuuid.uuid()
        return cls.get(f"{cls.PREFIX}{suffix}")
@@ -0,0 +1,101 @@
1
+ from functools import lru_cache
2
+ from typing import Dict, Generator, Generic, Set
3
+ from followthemoney import registry, ValueEntity, Statement, SE
4
+
5
+ from nomenklatura.resolver.identifier import Identifier, StrIdent
6
+
7
+
8
class Linker(Generic[SE]):
    """A class to manage the canonicalisation of entities. This stores only the positive
    merges of entities and is used as a lightweight way to apply the harmonisation
    post de-duplication."""

    def __init__(
        self, entities: "Dict[Identifier, Set[Identifier]] | None" = None
    ) -> None:
        """
        Args:
            entities: an entry for each entity with its connected set of entities.
                When left out, the linker starts with a new, empty mapping.
        """
        # A fresh dict per instance: a ``{}`` default in the signature would
        # be one object shared by every linker created without arguments.
        # (The annotation is a string because this module does not import
        # ``Optional``.)
        self._entities: Dict[Identifier, Set[Identifier]] = (
            entities if entities is not None else {}
        )

    def connected(self, node: Identifier) -> Set[Identifier]:
        """Return the set of identifiers connected to ``node``.

        An unknown node is connected only to itself.
        """
        return self._entities.get(node, {node})

    # NOTE: the cache of a decorated method is keyed on ``self`` as well, so
    # it keeps linker instances alive while their entries are cached, and
    # cached answers do not follow later changes to ``_entities``.
    @lru_cache(maxsize=1024)
    def get_canonical(self, entity_id: StrIdent) -> str:
        """Return the canonical identifier for the given entity ID.

        This is the largest connected identifier if that one is canonical,
        and the given ID itself otherwise.
        """
        node = Identifier.get(entity_id)
        best = max(self.connected(node))
        if best.canonical:
            return best.id
        return node.id

    def canonicals(self) -> Generator[Identifier, None, None]:
        """Return all the canonical cluster identifiers."""
        for node in self._entities.keys():
            if not node.canonical:
                continue
            # Only yield nodes that are the canonical ID of their own cluster.
            canonical = self.get_canonical(node)
            if canonical == node.id:
                yield node

    def get_referents(
        self, canonical_id: StrIdent, canonicals: bool = True
    ) -> Set[str]:
        """Get all the entity identifiers which refer to a given canonical
        identifier, without the identifier itself.

        Args:
            canonical_id: the identifier whose connected IDs are wanted.
            canonicals: if false, connected identifiers that are themselves
                canonical are left out.
        """
        node = Identifier.get(canonical_id)
        referents: Set[str] = set()
        for connected in self.connected(node):
            if not canonicals and connected.canonical:
                continue
            if connected == node:
                continue
            referents.add(connected.id)
        return referents

    def apply(self, proxy: SE) -> SE:
        """Replace all entity references in a given proxy with their canonical
        identifiers. This is essentially the harmonisation post de-dupe.

        A proxy without an ID is returned unchanged.
        """
        if proxy.id is None:
            return proxy
        proxy.id = self.get_canonical(proxy.id)
        return self.apply_properties(proxy)

    def apply_stream(self, proxy: ValueEntity) -> ValueEntity:
        """Canonicalise the ID and the entity-typed values of a value entity.

        A proxy without an ID is returned unchanged.
        """
        if proxy.id is None:
            return proxy
        proxy.id = self.get_canonical(proxy.id)
        for prop in proxy.iterprops():
            if prop.type == registry.entity:
                # Take the values out and add their canonical forms back.
                values = proxy.pop(prop)
                for value in values:
                    proxy.unsafe_add(prop, self.get_canonical(value), cleaned=True)
        return proxy

    def apply_properties(self, proxy: SE) -> SE:
        """Canonicalise the statements of a proxy in place.

        Every statement gets the proxy ID as its canonical ID; entity-typed
        statement values are replaced by their canonical identifiers.
        """
        for stmt in proxy._iter_stmt():
            if proxy.id is not None:
                stmt.canonical_id = proxy.id
            if stmt.prop_type == registry.entity.name:
                canon_value = self.get_canonical(stmt.value)
                if canon_value != stmt.value:
                    # Remember the first-seen value before overwriting it.
                    if stmt.original_value is None:
                        stmt.original_value = stmt.value
                    # NOTE: this means the key is out of whack here now
                    stmt.value = canon_value
        return proxy

    def apply_statement(self, stmt: Statement) -> Statement:
        """Canonicalise a single statement in place.

        The canonical ID is derived from the statement's entity ID; an
        entity-typed value is replaced by its canonical identifier.
        """
        if stmt.entity_id is not None:
            stmt.canonical_id = self.get_canonical(stmt.entity_id)
        if stmt.prop_type == registry.entity.name:
            canon_value = self.get_canonical(stmt.value)
            if canon_value != stmt.value:
                # Remember the first-seen value before overwriting it.
                if stmt.original_value is None:
                    stmt.original_value = stmt.value
                # NOTE: this means the key is out of whack here now
                stmt.value = canon_value
        return stmt

    def __repr__(self) -> str:
        return f"<Merger({len(self._entities)})>"