nomenklatura-mpt 4.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nomenklatura/__init__.py +11 -0
- nomenklatura/cache.py +194 -0
- nomenklatura/cli.py +260 -0
- nomenklatura/conflicting_match.py +80 -0
- nomenklatura/data/er-unstable.pkl +0 -0
- nomenklatura/data/regression-v1.pkl +0 -0
- nomenklatura/db.py +139 -0
- nomenklatura/delta.py +4 -0
- nomenklatura/enrich/__init__.py +94 -0
- nomenklatura/enrich/aleph.py +141 -0
- nomenklatura/enrich/common.py +219 -0
- nomenklatura/enrich/nominatim.py +72 -0
- nomenklatura/enrich/opencorporates.py +233 -0
- nomenklatura/enrich/openfigi.py +124 -0
- nomenklatura/enrich/permid.py +201 -0
- nomenklatura/enrich/wikidata.py +268 -0
- nomenklatura/enrich/yente.py +116 -0
- nomenklatura/exceptions.py +9 -0
- nomenklatura/index/__init__.py +5 -0
- nomenklatura/index/common.py +24 -0
- nomenklatura/index/entry.py +89 -0
- nomenklatura/index/index.py +170 -0
- nomenklatura/index/tokenizer.py +92 -0
- nomenklatura/judgement.py +21 -0
- nomenklatura/kv.py +40 -0
- nomenklatura/matching/__init__.py +47 -0
- nomenklatura/matching/bench.py +32 -0
- nomenklatura/matching/compare/__init__.py +0 -0
- nomenklatura/matching/compare/addresses.py +71 -0
- nomenklatura/matching/compare/countries.py +15 -0
- nomenklatura/matching/compare/dates.py +83 -0
- nomenklatura/matching/compare/gender.py +15 -0
- nomenklatura/matching/compare/identifiers.py +30 -0
- nomenklatura/matching/compare/names.py +157 -0
- nomenklatura/matching/compare/util.py +51 -0
- nomenklatura/matching/compat.py +66 -0
- nomenklatura/matching/erun/__init__.py +0 -0
- nomenklatura/matching/erun/countries.py +42 -0
- nomenklatura/matching/erun/identifiers.py +64 -0
- nomenklatura/matching/erun/misc.py +71 -0
- nomenklatura/matching/erun/model.py +110 -0
- nomenklatura/matching/erun/names.py +126 -0
- nomenklatura/matching/erun/train.py +135 -0
- nomenklatura/matching/erun/util.py +28 -0
- nomenklatura/matching/logic_v1/__init__.py +0 -0
- nomenklatura/matching/logic_v1/identifiers.py +104 -0
- nomenklatura/matching/logic_v1/model.py +76 -0
- nomenklatura/matching/logic_v1/multi.py +21 -0
- nomenklatura/matching/logic_v1/phonetic.py +142 -0
- nomenklatura/matching/logic_v2/__init__.py +0 -0
- nomenklatura/matching/logic_v2/identifiers.py +124 -0
- nomenklatura/matching/logic_v2/model.py +98 -0
- nomenklatura/matching/logic_v2/names/__init__.py +3 -0
- nomenklatura/matching/logic_v2/names/analysis.py +51 -0
- nomenklatura/matching/logic_v2/names/distance.py +181 -0
- nomenklatura/matching/logic_v2/names/magic.py +60 -0
- nomenklatura/matching/logic_v2/names/match.py +195 -0
- nomenklatura/matching/logic_v2/names/pairing.py +81 -0
- nomenklatura/matching/logic_v2/names/util.py +89 -0
- nomenklatura/matching/name_based/__init__.py +4 -0
- nomenklatura/matching/name_based/misc.py +86 -0
- nomenklatura/matching/name_based/model.py +59 -0
- nomenklatura/matching/name_based/names.py +59 -0
- nomenklatura/matching/pairs.py +42 -0
- nomenklatura/matching/regression_v1/__init__.py +0 -0
- nomenklatura/matching/regression_v1/misc.py +75 -0
- nomenklatura/matching/regression_v1/model.py +110 -0
- nomenklatura/matching/regression_v1/names.py +63 -0
- nomenklatura/matching/regression_v1/train.py +87 -0
- nomenklatura/matching/regression_v1/util.py +31 -0
- nomenklatura/matching/svm_v1/__init__.py +5 -0
- nomenklatura/matching/svm_v1/misc.py +94 -0
- nomenklatura/matching/svm_v1/model.py +168 -0
- nomenklatura/matching/svm_v1/names.py +81 -0
- nomenklatura/matching/svm_v1/train.py +186 -0
- nomenklatura/matching/svm_v1/util.py +30 -0
- nomenklatura/matching/types.py +227 -0
- nomenklatura/matching/util.py +62 -0
- nomenklatura/publish/__init__.py +0 -0
- nomenklatura/publish/dates.py +49 -0
- nomenklatura/publish/edges.py +32 -0
- nomenklatura/py.typed +0 -0
- nomenklatura/resolver/__init__.py +6 -0
- nomenklatura/resolver/common.py +2 -0
- nomenklatura/resolver/edge.py +107 -0
- nomenklatura/resolver/identifier.py +60 -0
- nomenklatura/resolver/linker.py +101 -0
- nomenklatura/resolver/resolver.py +565 -0
- nomenklatura/settings.py +17 -0
- nomenklatura/store/__init__.py +41 -0
- nomenklatura/store/base.py +130 -0
- nomenklatura/store/level.py +272 -0
- nomenklatura/store/memory.py +102 -0
- nomenklatura/store/redis_.py +131 -0
- nomenklatura/store/sql.py +219 -0
- nomenklatura/store/util.py +48 -0
- nomenklatura/store/versioned.py +371 -0
- nomenklatura/tui/__init__.py +17 -0
- nomenklatura/tui/app.py +294 -0
- nomenklatura/tui/app.tcss +52 -0
- nomenklatura/tui/comparison.py +81 -0
- nomenklatura/tui/util.py +35 -0
- nomenklatura/util.py +26 -0
- nomenklatura/versions.py +119 -0
- nomenklatura/wikidata/__init__.py +14 -0
- nomenklatura/wikidata/client.py +122 -0
- nomenklatura/wikidata/lang.py +94 -0
- nomenklatura/wikidata/model.py +139 -0
- nomenklatura/wikidata/props.py +70 -0
- nomenklatura/wikidata/qualified.py +49 -0
- nomenklatura/wikidata/query.py +66 -0
- nomenklatura/wikidata/value.py +87 -0
- nomenklatura/xref.py +125 -0
- nomenklatura_mpt-4.1.9.dist-info/METADATA +159 -0
- nomenklatura_mpt-4.1.9.dist-info/RECORD +118 -0
- nomenklatura_mpt-4.1.9.dist-info/WHEEL +4 -0
- nomenklatura_mpt-4.1.9.dist-info/entry_points.txt +3 -0
- nomenklatura_mpt-4.1.9.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,565 @@
|
|
1
|
+
#
|
2
|
+
# Don't forget to call self._invalidate from methods that modify edges.
|
3
|
+
#
|
4
|
+
import getpass
|
5
|
+
import logging
|
6
|
+
from collections import defaultdict
|
7
|
+
from functools import lru_cache
|
8
|
+
from typing import Any, Dict, Generator, List, Optional, Set, Tuple
|
9
|
+
from rigour.ids.wikidata import is_qid
|
10
|
+
from rigour.time import utc_now
|
11
|
+
from sqlalchemy import (
|
12
|
+
Column,
|
13
|
+
Float,
|
14
|
+
Index,
|
15
|
+
Integer,
|
16
|
+
MetaData,
|
17
|
+
Table,
|
18
|
+
Unicode,
|
19
|
+
or_,
|
20
|
+
text,
|
21
|
+
)
|
22
|
+
from sqlalchemy.engine import Connection, Engine, Transaction
|
23
|
+
from sqlalchemy.sql.expression import delete, insert, update
|
24
|
+
from followthemoney import registry, Statement, SE
|
25
|
+
from followthemoney.util import PathLike
|
26
|
+
|
27
|
+
from nomenklatura.db import get_engine
|
28
|
+
from nomenklatura.judgement import Judgement
|
29
|
+
from nomenklatura.resolver.edge import Edge
|
30
|
+
from nomenklatura.resolver.identifier import Identifier, Pair, StrIdent
|
31
|
+
from nomenklatura.resolver.linker import Linker
|
32
|
+
|
33
|
+
|
34
|
+
log = logging.getLogger(__name__)
|
35
|
+
|
36
|
+
|
37
|
+
def timestamp() -> str:
    """Produce a sortable ISO-8601 string for the current UTC moment.

    Truncated to 28 characters so it fits the Unicode(28) timestamp columns.
    """
    now = utc_now()
    return now.isoformat()[:28]
|
39
|
+
|
40
|
+
|
41
|
+
class Resolver(Linker[SE]):
|
42
|
+
UNDECIDED = (Judgement.NO_JUDGEMENT, Judgement.UNSURE)
|
43
|
+
|
44
|
+
def __init__(
    self,
    engine: Engine,
    metadata: MetaData,
    create: bool = False,
    table_name: str = "resolver",
) -> None:
    """Set up the resolver's table definition and in-memory edge graph.

    :param engine: SQLAlchemy engine for the backing database.
    :param metadata: metadata collection the resolver table attaches to.
    :param create: when True, create the table if it does not exist yet.
    :param table_name: name of the database table holding the edges.
    """
    self._engine = engine
    # Connection and transaction are opened lazily by begin().
    self._conn: Optional[Connection] = None
    self._transaction: Optional[Transaction] = None
    # Start with None to skip deletes on first BEGIN.
    # We don't have to process deletes to represent the state on first load.
    self._max_ts: Optional[str] = None
    # In-memory adjacency: (source, target) pair -> edge, and
    # node -> set of incident edges.
    self.edges: Dict[Pair, Edge] = {}
    self.nodes: Dict[Identifier, Set[Edge]] = defaultdict(set)

    # Only one live (deleted_at IS NULL) edge may exist per pair; deleted
    # rows are retained for replay, so the unique index must be partial.
    unique_kw: Dict[str, Any] = {"unique": True}
    if engine.dialect.name == "sqlite":
        unique_kw["sqlite_where"] = text("deleted_at IS NULL")
    if engine.dialect.name in ("postgresql", "postgres"):
        unique_kw["postgresql_where"] = text("deleted_at IS NULL")
    unique_pair = Index(
        f"{table_name}_source_target_uniq",
        text("source"),
        text("target"),
        **unique_kw,
    )
    self._table = Table(
        table_name,
        metadata,
        Column("id", Integer(), primary_key=True),
        Column("target", Unicode(512), index=True),
        Column("source", Unicode(512), index=True),
        Column("judgement", Unicode(14), nullable=False),
        Column("score", Float, nullable=True),
        Column("user", Unicode(512), nullable=False),
        # Timestamps stored as ISO strings (cf. timestamp()), max 28 chars.
        Column("created_at", Unicode(28)),
        Column("deleted_at", Unicode(28), nullable=True),
        unique_pair,
        extend_existing=True,
    )
    if create:
        metadata.create_all(bind=engine, checkfirst=True, tables=[self._table])
|
87
|
+
|
88
|
+
def _update_from_db(self) -> None:
    """Apply new deletes and unseen edges from the database.

    On the first call (``self._max_ts is None``) only live edges are
    loaded; afterwards only rows created or deleted since the last seen
    timestamp are replayed, in chronological order.
    """
    stmt = self._table.select()
    if self._max_ts is None:
        stmt = stmt.where(self._table.c.deleted_at.is_(None))
    else:
        stmt = stmt.where(
            or_(
                self._table.c.deleted_at > self._max_ts,
                self._table.c.created_at > self._max_ts,
            )
        )
    # BUG FIX: order_by() is generative and returns a new statement; the
    # previous code discarded the result, so rows were replayed in an
    # arbitrary order instead of chronologically.
    stmt = stmt.order_by(self._table.c.deleted_at.asc().nulls_last())
    stmt = stmt.order_by(self._table.c.created_at.asc())
    cursor = self._get_connection().execute(stmt)
    while batch := cursor.fetchmany(10000):
        for row in batch:
            edge = Edge.from_dict(row._mapping)
            # Track the high-water mark across both timestamp columns.
            if self._max_ts is None:
                self._max_ts = edge.created_at
            if self._max_ts is not None:
                if edge.created_at is not None:
                    self._max_ts = max(self._max_ts, edge.created_at)
                if edge.deleted_at is not None:
                    self._max_ts = max(self._max_ts, edge.deleted_at)
            self._update_edge(edge)
    cursor.close()
|
115
|
+
|
116
|
+
def _update_edge(self, edge: Edge) -> None:
    """Apply a single edge to the in-memory graph.

    Live edges are inserted/replaced in the adjacency maps; deleted edges
    are removed, pruning any node that loses its last incident edge.
    Callers are responsible for invalidating caches (_invalidate).
    """
    if edge.deleted_at is None:
        if edge.judgement != Judgement.NO_JUDGEMENT:
            # Scores only apply to undecided suggestions.
            edge.score = None
        self.edges[edge.key] = edge
        self.nodes[edge.source].add(edge)
        self.nodes[edge.target].add(edge)
    else:
        self.edges.pop(edge.key, None)
        for node in (edge.source, edge.target):
            if node in self.nodes:
                self.nodes[node].discard(edge)
                if len(self.nodes[node]) == 0:
                    # Drop empty adjacency sets so canonicals() stays tidy.
                    del self.nodes[node]
|
130
|
+
|
131
|
+
@classmethod
def make_default(cls, engine: Optional[Engine] = None) -> "Resolver[SE]":
    """Build a resolver on the default engine, creating the table if needed."""
    active_engine = engine if engine is not None else get_engine()
    return cls(active_engine, MetaData(), create=True)
|
137
|
+
|
138
|
+
def _invalidate(self) -> None:
|
139
|
+
self.connected.cache_clear()
|
140
|
+
self.get_canonical.cache_clear()
|
141
|
+
|
142
|
+
def begin(self) -> None:
    """
    Start a new transaction in Begin Once style. Callers are responsible for
    committing or rolling back the transaction.

    https://docs.sqlalchemy.org/en/20/core/connections.html#begin-once
    """
    if self._conn is None:
        self._conn = self._engine.connect()
    if self._transaction is not None:
        # Idempotent: a transaction is already in progress.
        return
    self._transaction = self._conn.begin()
    self._update_from_db()
    self._invalidate()
|
155
|
+
|
156
|
+
def commit(self) -> None:
    """Commit the open transaction, purging soft-deleted suggestion rows.

    A no-op when no transaction is in progress. NO_JUDGEMENT edges that
    were soft-deleted are hard-deleted here: they are mere suggestions
    and need not be retained for history replay.
    """
    if self._transaction is None or self._conn is None:
        # BUG FIX: previously a dangling connection without a transaction
        # was dropped without being closed, leaking it back to the pool.
        if self._conn is not None:
            self._conn.close()
        self._transaction = None
        self._conn = None
        return

    # Sweep up all NO JUDGEMENT edges that have been deleted:
    clean_stmt = delete(self._table)
    clean_stmt = clean_stmt.where(
        self._table.c.judgement == Judgement.NO_JUDGEMENT.value
    )
    clean_stmt = clean_stmt.where(self._table.c.deleted_at.is_not(None))
    self._conn.execute(clean_stmt)

    self._transaction.commit()
    self._transaction = None
    self._conn.close()
    self._conn = None
|
174
|
+
|
175
|
+
def rollback(self) -> None:
    """Abort the current transaction and release the connection."""
    tx = self._transaction
    if tx is not None:
        tx.rollback()
        self._transaction = None
    conn = self._conn
    if conn is not None:
        conn.close()
        self._conn = None
|
182
|
+
|
183
|
+
def close(self) -> None:
    """Close the resolver connection."""
    tx = self._transaction
    if tx is not None:
        # Never commit implicitly: discard uncommitted work.
        tx.rollback()
        self._transaction = None
    conn = self._conn
    if conn is not None:
        conn.close()
        self._conn = None
    # Reset all local graph state and memoised lookups.
    self.edges.clear()
    self.nodes.clear()
    self._max_ts = None
    self._invalidate()
|
195
|
+
|
196
|
+
def _get_connection(self) -> Connection:
|
197
|
+
if self._transaction is None or self._conn is None:
|
198
|
+
raise RuntimeError("No transaction in progress.")
|
199
|
+
return self._conn
|
200
|
+
|
201
|
+
def get_linker(self) -> Linker[SE]:
    """Return a linker object that can be used to resolve entities.

    This is less memory-consuming than the full resolver object: only
    live POSITIVE edges are read, and clusters are built as the union of
    the connected components of both endpoints.
    """
    entities: Dict[Identifier, Set[Identifier]] = {}
    stmt = self._table.select()
    stmt = stmt.where(self._table.c.judgement == Judgement.POSITIVE.value)
    stmt = stmt.where(self._table.c.deleted_at.is_(None))
    # BUG FIX: order_by() is generative; assigning its result makes the
    # chronological ordering actually take effect (it was discarded before).
    stmt = stmt.order_by(self._table.c.created_at.asc())
    with self._engine.connect() as conn:
        cursor = conn.execute(stmt)
        while batch := cursor.fetchmany(20000):
            for row in batch:
                edge = Edge.from_dict(row._mapping)
                # Merge the clusters of both endpoints, then point every
                # member of the merged cluster at the shared set.
                cluster = entities.get(edge.source)
                if cluster is None:
                    cluster = set([edge.source])
                other = entities.get(edge.target)
                if other is None:
                    other = set([edge.target])
                cluster.update(other)
                for node in cluster:
                    entities[node] = cluster
        cursor.close()
    return Linker(entities)
|
226
|
+
|
227
|
+
def get_edge(self, left_id: StrIdent, right_id: StrIdent) -> Optional[Edge]:
    """Look up the stored edge between two identifiers, if any exists."""
    return self.edges.get(Identifier.pair(left_id, right_id))
|
230
|
+
|
231
|
+
def _traverse(self, node: Identifier, seen: Set[Identifier]) -> Set[Identifier]:
    """Returns the set of nodes connected to the given node via positive judgement."""
    if node in seen:
        return {node}
    seen.add(node)
    cluster: Set[Identifier] = {node}
    for edge in self.nodes.get(node, []):
        if edge.judgement != Judgement.POSITIVE:
            continue
        neighbour = edge.other(node)
        cluster.update(self._traverse(neighbour, seen))
    return cluster
|
243
|
+
|
244
|
+
@lru_cache(maxsize=200000)
def connected(self, node: Identifier) -> Set[Identifier]:
    """Memoised positive-judgement cluster containing the given node."""
    seen: Set[Identifier] = set()
    return self._traverse(node, seen)
|
247
|
+
|
248
|
+
@lru_cache(maxsize=200000)
def get_canonical(self, entity_id: StrIdent) -> str:
    """Return the canonical identifier for the given entity ID."""
    node = Identifier.get(entity_id)
    best = max(self.connected(node))
    # Fall back to the queried node when the cluster has no canonical ID.
    return best.id if best.canonical else node.id
|
256
|
+
|
257
|
+
def canonicals(self) -> Generator[Identifier, None, None]:
    """Return all the canonical cluster identifiers."""
    for node in self.nodes.keys():
        # A cluster's representative is the canonical node that maps to
        # itself under get_canonical().
        if node.canonical and self.get_canonical(node) == node.id:
            yield node
|
265
|
+
|
266
|
+
def get_referents(
    self, canonical_id: StrIdent, canonicals: bool = True
) -> Set[str]:
    """Get all the non-canonical entity identifiers which refer to a given
    canonical identifier."""
    node = Identifier.get(canonical_id)
    referents: Set[str] = set()
    for member in self.connected(node):
        if member == node:
            continue
        if member.canonical and not canonicals:
            continue
        referents.add(member.id)
    return referents
|
280
|
+
|
281
|
+
def get_resolved_edge(
    self, left_id: StrIdent, right_id: StrIdent
) -> Optional[Edge]:
    """
    Return _some_ edge that connects the two entities, if it exists.
    """
    (left, right) = Identifier.pair(left_id, right_id)
    left_cluster = self.connected(left)
    right_cluster = self.connected(right)
    # Scan every cross-cluster pair for a stored direct edge.
    for lnode in left_cluster:
        for rnode in right_cluster:
            if lnode == rnode:
                continue
            found = self.edges.get(Identifier.pair(lnode, rnode))
            if found is not None:
                return found
    return None
|
299
|
+
|
300
|
+
def _pair_judgement(self, left: Identifier, right: Identifier) -> Judgement:
    """Judgement of the direct edge between two nodes, if one exists."""
    edge = self.get_edge(left, right)
    return Judgement.NO_JUDGEMENT if edge is None else edge.judgement
|
305
|
+
|
306
|
+
def get_judgement(self, entity_id: StrIdent, other_id: StrIdent) -> Judgement:
    """Get the existing decision between two entities with dedupe factored in."""
    entity = Identifier.get(entity_id)
    other = Identifier.get(other_id)
    # Identical nodes, or membership in the same positive cluster,
    # short-circuit to POSITIVE.
    if entity == other:
        return Judgement.POSITIVE
    entity_connected = self.connected(entity)
    if other in entity_connected:
        return Judgement.POSITIVE
    # Check QIDs after connected because we sometimes insert an edge to say
    # one QID is canonical for another. Not common but important.
    if is_qid(entity.id) and is_qid(other.id):
        return Judgement.NEGATIVE

    # HACK: this would mark pairs only as unsure if the unsure judgement
    # had been made on the current canonical combination:
    # canon_edge = self._pair_judgement(max(entity_connected), max(other_connected))
    # if canon_edge == Judgement.UNSURE:
    #     return Judgement.UNSURE

    other_connected = self.connected(other)
    # Any explicit judgement between any pair of members of the two
    # clusters decides the judgement for the whole cluster pair.
    for e in entity_connected:
        for o in other_connected:
            judgement = self._pair_judgement(e, o)
            if judgement != Judgement.NO_JUDGEMENT:
                return judgement

    return Judgement.NO_JUDGEMENT
|
334
|
+
|
335
|
+
def check_candidate(self, left: StrIdent, right: StrIdent) -> bool:
    """Check if the two IDs could be merged, i.e. if there's no existing
    judgement."""
    return self.get_judgement(left, right) == Judgement.NO_JUDGEMENT
|
340
|
+
|
341
|
+
def get_judgements(
    self, limit: Optional[int] = None
) -> Generator[Edge, None, None]:
    """Get most recently updated edges other than NO_JUDGEMENT."""
    query = (
        self._table.select()
        .where(self._table.c.judgement != Judgement.NO_JUDGEMENT.value)
        .where(self._table.c.deleted_at.is_(None))
        .order_by(self._table.c.created_at.desc())
    )
    if limit is not None:
        query = query.limit(limit)
    cursor = self._get_connection().execute(query)
    while rows := cursor.fetchmany(25):
        for row in rows:
            yield Edge.from_dict(row._mapping)
    cursor.close()
|
356
|
+
|
357
|
+
def _get_suggested(self) -> List[Edge]:
    """Get all NO_JUDGEMENT edges in descending order of score."""
    undecided = [
        e for e in self.edges.values() if e.judgement == Judgement.NO_JUDGEMENT
    ]
    # Unscored (or zero-scored) suggestions sort last via the -1.0 fallback.
    undecided.sort(key=lambda e: e.score or -1.0, reverse=True)
    return undecided
|
363
|
+
|
364
|
+
def get_candidates(
    self, limit: Optional[int] = None
) -> Generator[Tuple[str, str, Optional[float]], None, None]:
    """Yield undecided (target_id, source_id, score) pairs, best first."""
    emitted = 0
    for suggestion in self._get_suggested():
        # Skip pairs for which a judgement already exists elsewhere
        # in the cluster graph.
        if not self.check_candidate(suggestion.source, suggestion.target):
            continue
        yield suggestion.target.id, suggestion.source.id, suggestion.score
        emitted += 1
        if limit is not None and emitted >= limit:
            break
|
375
|
+
|
376
|
+
def suggest(
    self,
    left_id: StrIdent,
    right_id: StrIdent,
    score: float,
    user: Optional[str] = None,
) -> Identifier:
    """Make a NO_JUDGEMENT link between two identifiers to suggest that a user
    should make a decision about whether they are the same or not."""
    existing = self.get_edge(left_id, right_id)
    if existing is not None and existing.judgement == Judgement.NO_JUDGEMENT:
        # The suggestion already exists: only refresh its score, first in
        # the database, then on the local edge object.
        stmt = (
            update(self._table)
            .where(self._table.c.target == existing.target.id)
            .where(self._table.c.source == existing.source.id)
            .where(self._table.c.deleted_at.is_(None))
            .where(self._table.c.judgement == Judgement.NO_JUDGEMENT.value)
            .values({"score": score})
        )
        self._get_connection().execute(stmt)
        existing.score = score
        return existing.target
    # No edge yet (or the existing one carries a real judgement): record
    # a fresh NO_JUDGEMENT decision.
    return self.decide(
        left_id, right_id, Judgement.NO_JUDGEMENT, score=score, user=user
    )
|
407
|
+
|
408
|
+
def decide(
    self,
    left_id: StrIdent,
    right_id: StrIdent,
    judgement: Judgement,
    user: Optional[str] = None,
    score: Optional[float] = None,
) -> Identifier:
    """Record a judgement between two identifiers and return the target.

    POSITIVE decisions are canonicalised: instead of linking the two
    identifiers directly, both are linked to a (possibly newly minted)
    canonical cluster identifier, which is then returned.
    """
    edge = self.get_edge(left_id, right_id)
    if edge is None:
        edge = Edge(left_id, right_id, judgement=judgement)

    # Canonicalise positive matches, i.e. make both identifiers refer to a
    # canonical identifier, instead of making a direct link.
    if judgement == Judgement.POSITIVE:
        connected = set(self.connected(edge.target))
        connected.update(self.connected(edge.source))
        target = max(connected)
        if not target.canonical:
            canonical = Identifier.make()
            # Remove the direct edge before re-linking both endpoints to
            # the fresh canonical node.
            self._remove_edge(edge)
            self.decide(edge.source, canonical, judgement=judgement, user=user)
            self.decide(edge.target, canonical, judgement=judgement, user=user)
            return canonical

    edge.judgement = judgement
    edge.created_at = timestamp()
    edge.user = user or getpass.getuser()
    edge.score = score or edge.score
    self._register(edge)
    # NO_JUDGEMENT suggestions do not affect cluster membership, so the
    # memoised lookups only need clearing for real decisions.
    if judgement != Judgement.NO_JUDGEMENT:
        self._invalidate()
    return edge.target
|
441
|
+
|
442
|
+
def _register(self, edge: Edge) -> None:
    """Ensure the edge exists in the resolver, as provided."""
    if edge.judgement != Judgement.NO_JUDGEMENT:
        # Scores only apply to suggestions, not to actual decisions.
        edge.score = None

    # Soft-delete any previous live row for the same pair first, so the
    # partial unique index on (source, target) WHERE deleted_at IS NULL
    # is not violated by the insert below.
    ustmt = update(self._table)
    ustmt = ustmt.values({"deleted_at": edge.created_at})
    ustmt = ustmt.where(self._table.c.source == edge.source.id)
    ustmt = ustmt.where(self._table.c.target == edge.target.id)
    ustmt = ustmt.where(self._table.c.deleted_at.is_(None))
    self._get_connection().execute(ustmt)

    stmt = insert(self._table).values(edge.to_dict())
    self._get_connection().execute(stmt)
    # Mirror the change into the in-memory graph.
    self._update_edge(edge)
|
457
|
+
|
458
|
+
def _remove_edge(self, edge: Edge) -> None:
    """Remove an edge from the graph."""
    edge.deleted_at = timestamp()
    # Soft-delete the live database row for this pair, then mirror the
    # removal in the in-memory adjacency maps.
    stmt = (
        update(self._table)
        .values({"deleted_at": edge.deleted_at})
        .where(self._table.c.target == edge.target.id)
        .where(self._table.c.source == edge.source.id)
        .where(self._table.c.deleted_at.is_(None))
    )
    self._get_connection().execute(stmt)
    self._update_edge(edge)
|
468
|
+
|
469
|
+
def _remove_node(self, node: Identifier) -> None:
    """Remove a node from the graph."""
    deleted_at = timestamp()
    # Soft-delete every live row touching this node, whether it appears
    # as source or as target.
    stmt = update(self._table)
    stmt = stmt.values({"deleted_at": deleted_at})
    cond = or_(
        self._table.c.source == node.id,
        self._table.c.target == node.id,
    )
    stmt = stmt.where(cond)
    stmt = stmt.where(self._table.c.deleted_at.is_(None))
    self._get_connection().execute(stmt)

    edges = self.nodes.get(node)
    if edges is None:
        return
    for edge in list(edges):
        edge.deleted_at = deleted_at
        # NOTE(review): NO_JUDGEMENT edges are marked deleted here but not
        # removed from the local adjacency maps — presumably they are swept
        # elsewhere (e.g. at commit time); confirm this is intentional.
        if edge.judgement != Judgement.NO_JUDGEMENT:
            self._update_edge(edge)
|
489
|
+
|
490
|
+
def remove(self, node_id: StrIdent) -> None:
    """Remove all edges linking to the given node from the graph."""
    self._remove_node(Identifier.get(node_id))
    self._invalidate()
|
495
|
+
|
496
|
+
def explode(self, node_id: StrIdent) -> Set[str]:
    """Dissolve all edges linked to the cluster to which the node belongs.
    This is the hard way to make sure we re-do context once we realise
    there's been a mistake."""
    node = Identifier.get(node_id)
    affected: Set[str] = set()
    for member in self.connected(node):
        affected.add(str(member))
        self._remove_node(member)
    self._invalidate()
    return affected
|
507
|
+
|
508
|
+
def prune(self) -> None:
    """Remove suggested (i.e. NO_JUDGEMENT) edges."""
    # Database: hard-delete every suggestion row.
    stmt = delete(self._table).where(
        self._table.c.judgement == Judgement.NO_JUDGEMENT.value
    )
    self._get_connection().execute(stmt)

    # Local state: mark suggestions deleted and drop them from the maps.
    now = timestamp()
    suggestions = [
        e for e in self.edges.values() if e.judgement == Judgement.NO_JUDGEMENT
    ]
    for edge in suggestions:
        edge.deleted_at = now
        self._update_edge(edge)
|
521
|
+
|
522
|
+
def apply_statement(self, stmt: Statement) -> Statement:
    """Canonicalise Statement Entity IDs and ID values"""
    if stmt.entity_id is not None:
        stmt.canonical_id = self.get_canonical(stmt.entity_id)
    # Entity-typed property values are identifiers too and must be
    # rewritten to their canonical form.
    if stmt.prop_type == registry.entity.name:
        resolved = self.get_canonical(stmt.value)
        if resolved != stmt.value:
            if stmt.original_value is None:
                stmt.original_value = stmt.value
            # NOTE: this means the key is out of whack here now
            stmt.value = resolved
    return stmt
|
534
|
+
|
535
|
+
def dump(self, path: PathLike) -> None:
    """Store the resolver adjacency list to a plain text JSON list.

    Suggestions (NO_JUDGEMENT edges) are excluded; rows are written in
    creation order, one serialised edge per line.
    """
    stmt = self._table.select()
    stmt = stmt.where(self._table.c.judgement != Judgement.NO_JUDGEMENT.value)
    # BUG FIX: order_by() is generative; previously its result was
    # discarded and the dump order was undefined.
    stmt = stmt.order_by(self._table.c.created_at.asc())
    with open(path, "w") as fh:
        cursor = self._get_connection().execute(stmt)
        for row in cursor.yield_per(20000):
            edge = Edge.from_dict(row._mapping)
            fh.write(edge.to_line())
|
545
|
+
|
546
|
+
def load(self, path: PathLike) -> None:
    """Load edges directly into the database.

    Each input line must contain one serialised edge (cf. Edge.from_line).
    Progress is logged every 10,000 edges; caches are invalidated once at
    the end.
    """
    edge_count = 0
    with open(path, "r") as fh:
        # Walrus read loop, consistent with the other line readers in
        # this package.
        while line := fh.readline():
            edge = Edge.from_line(line)
            self._register(edge)
            edge_count += 1
            if edge_count % 10000 == 0:
                # Lazy %-args defer formatting until the record is emitted.
                log.info("Loaded %s edges.", edge_count)
    log.info("Done. Loaded %s edges.", edge_count)
    self._invalidate()
|
561
|
+
|
562
|
+
def __repr__(self) -> str:
|
563
|
+
parts = self._engine.url
|
564
|
+
url = f"{parts.drivername}://{parts.host or ''}/{parts.database}/{self._table.name}"
|
565
|
+
return f"<Resolver({url})>"
|
nomenklatura/settings.py
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
from pathlib import Path
from rigour.env import env_str, env_int

# Flipped by the test harness; never enabled in production code paths.
TESTING = False

# Default to a SQLite file in the working directory unless an explicit
# database URL is configured via the environment.
DB_PATH = Path("nomenklatura.db").resolve()
DEFAULT_DB_URL = f"sqlite:///{DB_PATH.as_posix()}"
_env_db_url = env_str("NOMENKLATURA_DB_URL", "")
DB_URL = _env_db_url if _env_db_url else DEFAULT_DB_URL
DB_POOL_SIZE = env_int("NOMENKLATURA_DB_POOL_SIZE", 5)
DB_STMT_TIMEOUT = env_int("NOMENKLATURA_DB_STMT_TIMEOUT", 10000)

REDIS_URL = env_str("NOMENKLATURA_REDIS_URL", "")

STATEMENT_TABLE = env_str("NOMENKLATURA_STATEMENT_TABLE", "statement")
STATEMENT_BATCH = env_int("NOMENKLATURA_STATEMENT_BATCH", 3000)
|
@@ -0,0 +1,41 @@
|
|
1
|
+
from pathlib import Path
|
2
|
+
from typing import Optional
|
3
|
+
|
4
|
+
import orjson
|
5
|
+
|
6
|
+
from followthemoney import Dataset, StatementEntity
|
7
|
+
from nomenklatura.resolver import Resolver
|
8
|
+
from nomenklatura.store.base import Store, View, Writer
|
9
|
+
from nomenklatura.store.memory import MemoryStore
|
10
|
+
from nomenklatura.store.sql import SQLStore
|
11
|
+
|
12
|
+
# Convenience alias: a memory store keyed on plain Dataset/StatementEntity.
SimpleMemoryStore = MemoryStore[Dataset, StatementEntity]

# Public API of nomenklatura.store.
__all__ = [
    "Store",
    "Writer",
    "View",
    "MemoryStore",
    "SimpleMemoryStore",
    "SQLStore",
    "load_entity_file_store",
]
|
23
|
+
|
24
|
+
|
25
|
+
def load_entity_file_store(
    path: Path,
    resolver: Resolver[StatementEntity],
    dataset: Optional[Dataset] = None,
    cleaned: bool = True,
) -> SimpleMemoryStore:
    """Create a simple in-memory store by reading FtM entities from a file path."""
    if dataset is None:
        # Derive a dataset from the file name when none is supplied.
        dataset = Dataset.make({"name": path.stem, "title": path.stem})
    store = MemoryStore(dataset, resolver)
    with store.writer() as writer, open(path, "rb") as fh:
        while line := fh.readline():
            payload = orjson.loads(line)
            entity = StatementEntity.from_data(dataset, payload, cleaned=cleaned)
            writer.add_entity(entity)
    return store
|