nomenklatura-mpt 4.1.9 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. nomenklatura/__init__.py +11 -0
  2. nomenklatura/cache.py +194 -0
  3. nomenklatura/cli.py +260 -0
  4. nomenklatura/conflicting_match.py +80 -0
  5. nomenklatura/data/er-unstable.pkl +0 -0
  6. nomenklatura/data/regression-v1.pkl +0 -0
  7. nomenklatura/db.py +139 -0
  8. nomenklatura/delta.py +4 -0
  9. nomenklatura/enrich/__init__.py +94 -0
  10. nomenklatura/enrich/aleph.py +141 -0
  11. nomenklatura/enrich/common.py +219 -0
  12. nomenklatura/enrich/nominatim.py +72 -0
  13. nomenklatura/enrich/opencorporates.py +233 -0
  14. nomenklatura/enrich/openfigi.py +124 -0
  15. nomenklatura/enrich/permid.py +201 -0
  16. nomenklatura/enrich/wikidata.py +268 -0
  17. nomenklatura/enrich/yente.py +116 -0
  18. nomenklatura/exceptions.py +9 -0
  19. nomenklatura/index/__init__.py +5 -0
  20. nomenklatura/index/common.py +24 -0
  21. nomenklatura/index/entry.py +89 -0
  22. nomenklatura/index/index.py +170 -0
  23. nomenklatura/index/tokenizer.py +92 -0
  24. nomenklatura/judgement.py +21 -0
  25. nomenklatura/kv.py +40 -0
  26. nomenklatura/matching/__init__.py +47 -0
  27. nomenklatura/matching/bench.py +32 -0
  28. nomenklatura/matching/compare/__init__.py +0 -0
  29. nomenklatura/matching/compare/addresses.py +71 -0
  30. nomenklatura/matching/compare/countries.py +15 -0
  31. nomenklatura/matching/compare/dates.py +83 -0
  32. nomenklatura/matching/compare/gender.py +15 -0
  33. nomenklatura/matching/compare/identifiers.py +30 -0
  34. nomenklatura/matching/compare/names.py +157 -0
  35. nomenklatura/matching/compare/util.py +51 -0
  36. nomenklatura/matching/compat.py +66 -0
  37. nomenklatura/matching/erun/__init__.py +0 -0
  38. nomenklatura/matching/erun/countries.py +42 -0
  39. nomenklatura/matching/erun/identifiers.py +64 -0
  40. nomenklatura/matching/erun/misc.py +71 -0
  41. nomenklatura/matching/erun/model.py +110 -0
  42. nomenklatura/matching/erun/names.py +126 -0
  43. nomenklatura/matching/erun/train.py +135 -0
  44. nomenklatura/matching/erun/util.py +28 -0
  45. nomenklatura/matching/logic_v1/__init__.py +0 -0
  46. nomenklatura/matching/logic_v1/identifiers.py +104 -0
  47. nomenklatura/matching/logic_v1/model.py +76 -0
  48. nomenklatura/matching/logic_v1/multi.py +21 -0
  49. nomenklatura/matching/logic_v1/phonetic.py +142 -0
  50. nomenklatura/matching/logic_v2/__init__.py +0 -0
  51. nomenklatura/matching/logic_v2/identifiers.py +124 -0
  52. nomenklatura/matching/logic_v2/model.py +98 -0
  53. nomenklatura/matching/logic_v2/names/__init__.py +3 -0
  54. nomenklatura/matching/logic_v2/names/analysis.py +51 -0
  55. nomenklatura/matching/logic_v2/names/distance.py +181 -0
  56. nomenklatura/matching/logic_v2/names/magic.py +60 -0
  57. nomenklatura/matching/logic_v2/names/match.py +195 -0
  58. nomenklatura/matching/logic_v2/names/pairing.py +81 -0
  59. nomenklatura/matching/logic_v2/names/util.py +89 -0
  60. nomenklatura/matching/name_based/__init__.py +4 -0
  61. nomenklatura/matching/name_based/misc.py +86 -0
  62. nomenklatura/matching/name_based/model.py +59 -0
  63. nomenklatura/matching/name_based/names.py +59 -0
  64. nomenklatura/matching/pairs.py +42 -0
  65. nomenklatura/matching/regression_v1/__init__.py +0 -0
  66. nomenklatura/matching/regression_v1/misc.py +75 -0
  67. nomenklatura/matching/regression_v1/model.py +110 -0
  68. nomenklatura/matching/regression_v1/names.py +63 -0
  69. nomenklatura/matching/regression_v1/train.py +87 -0
  70. nomenklatura/matching/regression_v1/util.py +31 -0
  71. nomenklatura/matching/svm_v1/__init__.py +5 -0
  72. nomenklatura/matching/svm_v1/misc.py +94 -0
  73. nomenklatura/matching/svm_v1/model.py +168 -0
  74. nomenklatura/matching/svm_v1/names.py +81 -0
  75. nomenklatura/matching/svm_v1/train.py +186 -0
  76. nomenklatura/matching/svm_v1/util.py +30 -0
  77. nomenklatura/matching/types.py +227 -0
  78. nomenklatura/matching/util.py +62 -0
  79. nomenklatura/publish/__init__.py +0 -0
  80. nomenklatura/publish/dates.py +49 -0
  81. nomenklatura/publish/edges.py +32 -0
  82. nomenklatura/py.typed +0 -0
  83. nomenklatura/resolver/__init__.py +6 -0
  84. nomenklatura/resolver/common.py +2 -0
  85. nomenklatura/resolver/edge.py +107 -0
  86. nomenklatura/resolver/identifier.py +60 -0
  87. nomenklatura/resolver/linker.py +101 -0
  88. nomenklatura/resolver/resolver.py +565 -0
  89. nomenklatura/settings.py +17 -0
  90. nomenklatura/store/__init__.py +41 -0
  91. nomenklatura/store/base.py +130 -0
  92. nomenklatura/store/level.py +272 -0
  93. nomenklatura/store/memory.py +102 -0
  94. nomenklatura/store/redis_.py +131 -0
  95. nomenklatura/store/sql.py +219 -0
  96. nomenklatura/store/util.py +48 -0
  97. nomenklatura/store/versioned.py +371 -0
  98. nomenklatura/tui/__init__.py +17 -0
  99. nomenklatura/tui/app.py +294 -0
  100. nomenklatura/tui/app.tcss +52 -0
  101. nomenklatura/tui/comparison.py +81 -0
  102. nomenklatura/tui/util.py +35 -0
  103. nomenklatura/util.py +26 -0
  104. nomenklatura/versions.py +119 -0
  105. nomenklatura/wikidata/__init__.py +14 -0
  106. nomenklatura/wikidata/client.py +122 -0
  107. nomenklatura/wikidata/lang.py +94 -0
  108. nomenklatura/wikidata/model.py +139 -0
  109. nomenklatura/wikidata/props.py +70 -0
  110. nomenklatura/wikidata/qualified.py +49 -0
  111. nomenklatura/wikidata/query.py +66 -0
  112. nomenklatura/wikidata/value.py +87 -0
  113. nomenklatura/xref.py +125 -0
  114. nomenklatura_mpt-4.1.9.dist-info/METADATA +159 -0
  115. nomenklatura_mpt-4.1.9.dist-info/RECORD +118 -0
  116. nomenklatura_mpt-4.1.9.dist-info/WHEEL +4 -0
  117. nomenklatura_mpt-4.1.9.dist-info/entry_points.txt +3 -0
  118. nomenklatura_mpt-4.1.9.dist-info/licenses/LICENSE +21 -0
nomenklatura/resolver/resolver.py
@@ -0,0 +1,565 @@
+ #
+ # Don't forget to call self._invalidate from methods that modify edges.
+ #
+ import getpass
+ import logging
+ from collections import defaultdict
+ from functools import lru_cache
+ from typing import Any, Dict, Generator, List, Optional, Set, Tuple
+ from rigour.ids.wikidata import is_qid
+ from rigour.time import utc_now
+ from sqlalchemy import (
+     Column,
+     Float,
+     Index,
+     Integer,
+     MetaData,
+     Table,
+     Unicode,
+     or_,
+     text,
+ )
+ from sqlalchemy.engine import Connection, Engine, Transaction
+ from sqlalchemy.sql.expression import delete, insert, update
+ from followthemoney import registry, Statement, SE
+ from followthemoney.util import PathLike
+
+ from nomenklatura.db import get_engine
+ from nomenklatura.judgement import Judgement
+ from nomenklatura.resolver.edge import Edge
+ from nomenklatura.resolver.identifier import Identifier, Pair, StrIdent
+ from nomenklatura.resolver.linker import Linker
+
+
+ log = logging.getLogger(__name__)
+
+
+ def timestamp() -> str:
+     return utc_now().isoformat()[:28]
+
+
+ class Resolver(Linker[SE]):
+     UNDECIDED = (Judgement.NO_JUDGEMENT, Judgement.UNSURE)
+
+     def __init__(
+         self,
+         engine: Engine,
+         metadata: MetaData,
+         create: bool = False,
+         table_name: str = "resolver",
+     ) -> None:
+         self._engine = engine
+         self._conn: Optional[Connection] = None
+         self._transaction: Optional[Transaction] = None
+         # Start with None to skip deletes on first BEGIN.
+         # We don't have to process deletes to represent the state on first load.
+         self._max_ts: Optional[str] = None
+         self.edges: Dict[Pair, Edge] = {}
+         self.nodes: Dict[Identifier, Set[Edge]] = defaultdict(set)
+
+         unique_kw: Dict[str, Any] = {"unique": True}
+         if engine.dialect.name == "sqlite":
+             unique_kw["sqlite_where"] = text("deleted_at IS NULL")
+         if engine.dialect.name in ("postgresql", "postgres"):
+             unique_kw["postgresql_where"] = text("deleted_at IS NULL")
+         unique_pair = Index(
+             f"{table_name}_source_target_uniq",
+             text("source"),
+             text("target"),
+             **unique_kw,
+         )
+         self._table = Table(
+             table_name,
+             metadata,
+             Column("id", Integer(), primary_key=True),
+             Column("target", Unicode(512), index=True),
+             Column("source", Unicode(512), index=True),
+             Column("judgement", Unicode(14), nullable=False),
+             Column("score", Float, nullable=True),
+             Column("user", Unicode(512), nullable=False),
+             Column("created_at", Unicode(28)),
+             Column("deleted_at", Unicode(28), nullable=True),
+             unique_pair,
+             extend_existing=True,
+         )
+         if create:
+             metadata.create_all(bind=engine, checkfirst=True, tables=[self._table])
+
+     def _update_from_db(self) -> None:
+         """Apply new deletes and unseen edges from the database."""
+         stmt = self._table.select()
+         if self._max_ts is None:
+             stmt = stmt.where(self._table.c.deleted_at.is_(None))
+         else:
+             stmt = stmt.where(
+                 or_(
+                     self._table.c.deleted_at > self._max_ts,
+                     self._table.c.created_at > self._max_ts,
+                 )
+             )
+         stmt = stmt.order_by(self._table.c.deleted_at.asc().nulls_last())
+         stmt = stmt.order_by(self._table.c.created_at.asc())
+         cursor = self._get_connection().execute(stmt)
+         while batch := cursor.fetchmany(10000):
+             for row in batch:
+                 edge = Edge.from_dict(row._mapping)
+                 if self._max_ts is None:
+                     self._max_ts = edge.created_at
+                 if self._max_ts is not None:
+                     if edge.created_at is not None:
+                         self._max_ts = max(self._max_ts, edge.created_at)
+                     if edge.deleted_at is not None:
+                         self._max_ts = max(self._max_ts, edge.deleted_at)
+                 self._update_edge(edge)
+         cursor.close()
+
+     def _update_edge(self, edge: Edge) -> None:
+         if edge.deleted_at is None:
+             if edge.judgement != Judgement.NO_JUDGEMENT:
+                 edge.score = None
+             self.edges[edge.key] = edge
+             self.nodes[edge.source].add(edge)
+             self.nodes[edge.target].add(edge)
+         else:
+             self.edges.pop(edge.key, None)
+             for node in (edge.source, edge.target):
+                 if node in self.nodes:
+                     self.nodes[node].discard(edge)
+                     if len(self.nodes[node]) == 0:
+                         del self.nodes[node]
+
+     @classmethod
+     def make_default(cls, engine: Optional[Engine] = None) -> "Resolver[SE]":
+         if engine is None:
+             engine = get_engine()
+         meta = MetaData()
+         return cls(engine, meta, create=True)
+
+     def _invalidate(self) -> None:
+         self.connected.cache_clear()
+         self.get_canonical.cache_clear()
+
+     def begin(self) -> None:
+         """
+         Start a new transaction in Begin Once style. Callers are responsible for
+         committing or rolling back the transaction.
+
+         https://docs.sqlalchemy.org/en/20/core/connections.html#begin-once
+         """
+         if self._conn is None:
+             self._conn = self._engine.connect()
+         if self._transaction is None:
+             self._transaction = self._conn.begin()
+         self._update_from_db()
+         self._invalidate()
+
+     def commit(self) -> None:
+         if self._transaction is None or self._conn is None:
+             self._transaction = None
+             self._conn = None
+             return
+
+         # Sweep up all NO_JUDGEMENT edges that have been deleted:
+         clean_stmt = delete(self._table)
+         clean_stmt = clean_stmt.where(
+             self._table.c.judgement == Judgement.NO_JUDGEMENT.value
+         )
+         clean_stmt = clean_stmt.where(self._table.c.deleted_at.is_not(None))
+         self._conn.execute(clean_stmt)
+
+         self._transaction.commit()
+         self._transaction = None
+         self._conn.close()
+         self._conn = None
+
+     def rollback(self) -> None:
+         if self._transaction is not None:
+             self._transaction.rollback()
+             self._transaction = None
+         if self._conn is not None:
+             self._conn.close()
+             self._conn = None
+
+     def close(self) -> None:
+         """Close the resolver connection."""
+         if self._transaction is not None:
+             self._transaction.rollback()
+             self._transaction = None
+         if self._conn is not None:
+             self._conn.close()
+             self._conn = None
+         self.edges.clear()
+         self.nodes.clear()
+         self._max_ts = None
+         self._invalidate()
+
+     def _get_connection(self) -> Connection:
+         if self._transaction is None or self._conn is None:
+             raise RuntimeError("No transaction in progress.")
+         return self._conn
+
+     def get_linker(self) -> Linker[SE]:
+         """Return a linker object that can be used to resolve entities.
+         This uses less memory than the full resolver object.
+         """
+         entities: Dict[Identifier, Set[Identifier]] = {}
+         stmt = self._table.select()
+         stmt = stmt.where(self._table.c.judgement == Judgement.POSITIVE.value)
+         stmt = stmt.where(self._table.c.deleted_at.is_(None))
+         stmt = stmt.order_by(self._table.c.created_at.asc())
+         with self._engine.connect() as conn:
+             cursor = conn.execute(stmt)
+             while batch := cursor.fetchmany(20000):
+                 for row in batch:
+                     edge = Edge.from_dict(row._mapping)
+                     cluster = entities.get(edge.source)
+                     if cluster is None:
+                         cluster = set([edge.source])
+                     other = entities.get(edge.target)
+                     if other is None:
+                         other = set([edge.target])
+                     cluster.update(other)
+                     for node in cluster:
+                         entities[node] = cluster
+             cursor.close()
+         return Linker(entities)
+
+     def get_edge(self, left_id: StrIdent, right_id: StrIdent) -> Optional[Edge]:
+         key = Identifier.pair(left_id, right_id)
+         return self.edges.get(key)
+
+     def _traverse(self, node: Identifier, seen: Set[Identifier]) -> Set[Identifier]:
+         """Returns the set of nodes connected to the given node via positive judgement."""
+         connected = set([node])
+         if node in seen:
+             return connected
+         seen.add(node)
+         for edge in self.nodes.get(node, []):
+             if edge.judgement == Judgement.POSITIVE:
+                 other = edge.other(node)
+                 rec = self._traverse(other, seen)
+                 connected.update(rec)
+         return connected
+
+     @lru_cache(maxsize=200000)
+     def connected(self, node: Identifier) -> Set[Identifier]:
+         return self._traverse(node, set())
+
+     @lru_cache(maxsize=200000)
+     def get_canonical(self, entity_id: StrIdent) -> str:
+         """Return the canonical identifier for the given entity ID."""
+         node = Identifier.get(entity_id)
+         max_ = max(self.connected(node))
+         if max_.canonical:
+             return max_.id
+         return node.id
+
+     def canonicals(self) -> Generator[Identifier, None, None]:
+         """Return all the canonical cluster identifiers."""
+         for node in self.nodes.keys():
+             if not node.canonical:
+                 continue
+             canonical = self.get_canonical(node)
+             if canonical == node.id:
+                 yield node
+
+     def get_referents(
+         self, canonical_id: StrIdent, canonicals: bool = True
+     ) -> Set[str]:
+         """Get all the non-canonical entity identifiers which refer to a given
+         canonical identifier."""
+         node = Identifier.get(canonical_id)
+         referents: Set[str] = set()
+         for connected in self.connected(node):
+             if not canonicals and connected.canonical:
+                 continue
+             if connected == node:
+                 continue
+             referents.add(connected.id)
+         return referents
+
+     def get_resolved_edge(
+         self, left_id: StrIdent, right_id: StrIdent
+     ) -> Optional[Edge]:
+         """
+         Return _some_ edge that connects the two entities, if it exists.
+         """
+         (left, right) = Identifier.pair(left_id, right_id)
+         left_connected = self.connected(left)
+         right_connected = self.connected(right)
+         for e in left_connected:
+             for o in right_connected:
+                 if e == o:
+                     continue
+                 edge = self.edges.get(Identifier.pair(e, o))
+                 if edge is None:
+                     continue
+                 return edge
+         return None
+
+     def _pair_judgement(self, left: Identifier, right: Identifier) -> Judgement:
+         edge = self.get_edge(left, right)
+         if edge is not None:
+             return edge.judgement
+         return Judgement.NO_JUDGEMENT
+
+     def get_judgement(self, entity_id: StrIdent, other_id: StrIdent) -> Judgement:
+         """Get the existing decision between two entities with dedupe factored in."""
+         entity = Identifier.get(entity_id)
+         other = Identifier.get(other_id)
+         if entity == other:
+             return Judgement.POSITIVE
+         entity_connected = self.connected(entity)
+         if other in entity_connected:
+             return Judgement.POSITIVE
+         # Check QIDs after connected because we sometimes insert an edge to say
+         # one QID is canonical for another. Not common but important.
+         if is_qid(entity.id) and is_qid(other.id):
+             return Judgement.NEGATIVE
+
+         # HACK: this would mark pairs only as unsure if the unsure judgement
+         # had been made on the current canonical combination:
+         # canon_edge = self._pair_judgement(max(entity_connected), max(other_connected))
+         # if canon_edge == Judgement.UNSURE:
+         #     return Judgement.UNSURE
+
+         other_connected = self.connected(other)
+         for e in entity_connected:
+             for o in other_connected:
+                 judgement = self._pair_judgement(e, o)
+                 if judgement != Judgement.NO_JUDGEMENT:
+                     return judgement
+
+         return Judgement.NO_JUDGEMENT
+
+     def check_candidate(self, left: StrIdent, right: StrIdent) -> bool:
+         """Check if the two IDs could be merged, i.e. if there's no existing
+         judgement."""
+         judgement = self.get_judgement(left, right)
+         return judgement == Judgement.NO_JUDGEMENT
+
+     def get_judgements(
+         self, limit: Optional[int] = None
+     ) -> Generator[Edge, None, None]:
+         """Get most recently updated edges other than NO_JUDGEMENT."""
+         stmt = self._table.select()
+         stmt = stmt.where(self._table.c.judgement != Judgement.NO_JUDGEMENT.value)
+         stmt = stmt.where(self._table.c.deleted_at.is_(None))
+         stmt = stmt.order_by(self._table.c.created_at.desc())
+         if limit is not None:
+             stmt = stmt.limit(limit)
+         cursor = self._get_connection().execute(stmt)
+         while batch := cursor.fetchmany(25):
+             for row in batch:
+                 yield Edge.from_dict(row._mapping)
+         cursor.close()
+
+     def _get_suggested(self) -> List[Edge]:
+         """Get all NO_JUDGEMENT edges in descending order of score."""
+         edges_all = self.edges.values()
+         candidates = (e for e in edges_all if e.judgement == Judgement.NO_JUDGEMENT)
+         cmp = lambda x: x.score or -1.0  # noqa
+         return sorted(candidates, key=cmp, reverse=True)
+
+     def get_candidates(
+         self, limit: Optional[int] = None
+     ) -> Generator[Tuple[str, str, Optional[float]], None, None]:
+         returned = 0
+         for edge in self._get_suggested():
+             if not self.check_candidate(edge.source, edge.target):
+                 continue
+             yield edge.target.id, edge.source.id, edge.score
+             returned += 1
+             if limit is not None and returned >= limit:
+                 break
+
+     def suggest(
+         self,
+         left_id: StrIdent,
+         right_id: StrIdent,
+         score: float,
+         user: Optional[str] = None,
+     ) -> Identifier:
+         """Make a NO_JUDGEMENT link between two identifiers to suggest that a user
+         should make a decision about whether they are the same or not."""
+         edge = self.get_edge(left_id, right_id)
+         if edge is not None:
+             if edge.judgement == Judgement.NO_JUDGEMENT:
+                 # Just update score
+
+                 # database
+                 stmt = update(self._table)
+                 stmt = stmt.where(self._table.c.target == edge.target.id)
+                 stmt = stmt.where(self._table.c.source == edge.source.id)
+                 stmt = stmt.where(self._table.c.deleted_at.is_(None))
+                 stmt = stmt.where(
+                     self._table.c.judgement == Judgement.NO_JUDGEMENT.value
+                 )
+                 stmt = stmt.values({"score": score})
+                 self._get_connection().execute(stmt)
+
+                 # local state
+                 edge.score = score
+                 return edge.target
+         return self.decide(
+             left_id, right_id, Judgement.NO_JUDGEMENT, score=score, user=user
+         )
+
+     def decide(
+         self,
+         left_id: StrIdent,
+         right_id: StrIdent,
+         judgement: Judgement,
+         user: Optional[str] = None,
+         score: Optional[float] = None,
+     ) -> Identifier:
+         edge = self.get_edge(left_id, right_id)
+         if edge is None:
+             edge = Edge(left_id, right_id, judgement=judgement)
+
+         # Canonicalise positive matches, i.e. make both identifiers refer to a
+         # canonical identifier, instead of making a direct link.
+         if judgement == Judgement.POSITIVE:
+             connected = set(self.connected(edge.target))
+             connected.update(self.connected(edge.source))
+             target = max(connected)
+             if not target.canonical:
+                 canonical = Identifier.make()
+                 self._remove_edge(edge)
+                 self.decide(edge.source, canonical, judgement=judgement, user=user)
+                 self.decide(edge.target, canonical, judgement=judgement, user=user)
+                 return canonical
+
+         edge.judgement = judgement
+         edge.created_at = timestamp()
+         edge.user = user or getpass.getuser()
+         edge.score = score or edge.score
+         self._register(edge)
+         if judgement != Judgement.NO_JUDGEMENT:
+             self._invalidate()
+         return edge.target
+
+     def _register(self, edge: Edge) -> None:
+         """Ensure the edge exists in the resolver, as provided."""
+         if edge.judgement != Judgement.NO_JUDGEMENT:
+             edge.score = None
+
+         ustmt = update(self._table)
+         ustmt = ustmt.values({"deleted_at": edge.created_at})
+         ustmt = ustmt.where(self._table.c.source == edge.source.id)
+         ustmt = ustmt.where(self._table.c.target == edge.target.id)
+         ustmt = ustmt.where(self._table.c.deleted_at.is_(None))
+         self._get_connection().execute(ustmt)
+
+         stmt = insert(self._table).values(edge.to_dict())
+         self._get_connection().execute(stmt)
+         self._update_edge(edge)
+
+     def _remove_edge(self, edge: Edge) -> None:
+         """Remove an edge from the graph."""
+         edge.deleted_at = timestamp()
+         stmt = update(self._table)
+         stmt = stmt.values({"deleted_at": edge.deleted_at})
+         stmt = stmt.where(self._table.c.target == edge.target.id)
+         stmt = stmt.where(self._table.c.source == edge.source.id)
+         stmt = stmt.where(self._table.c.deleted_at.is_(None))
+         self._get_connection().execute(stmt)
+         self._update_edge(edge)
+
+     def _remove_node(self, node: Identifier) -> None:
+         """Remove a node from the graph."""
+         deleted_at = timestamp()
+         stmt = update(self._table)
+         stmt = stmt.values({"deleted_at": deleted_at})
+         cond = or_(
+             self._table.c.source == node.id,
+             self._table.c.target == node.id,
+         )
+         stmt = stmt.where(cond)
+         stmt = stmt.where(self._table.c.deleted_at.is_(None))
+         self._get_connection().execute(stmt)
+
+         edges = self.nodes.get(node)
+         if edges is None:
+             return
+         for edge in list(edges):
+             edge.deleted_at = deleted_at
+             if edge.judgement != Judgement.NO_JUDGEMENT:
+                 self._update_edge(edge)
+
+     def remove(self, node_id: StrIdent) -> None:
+         """Remove all edges linking to the given node from the graph."""
+         node = Identifier.get(node_id)
+         self._remove_node(node)
+         self._invalidate()
+
+     def explode(self, node_id: StrIdent) -> Set[str]:
+         """Dissolve all edges linked to the cluster to which the node belongs.
+         This is the hard way to make sure we re-do context once we realise
+         there's been a mistake."""
+         node = Identifier.get(node_id)
+         affected: Set[str] = set()
+         for part in self.connected(node):
+             affected.add(str(part))
+             self._remove_node(part)
+         self._invalidate()
+         return affected
+
+     def prune(self) -> None:
+         """Remove suggested (i.e. NO_JUDGEMENT) edges."""
+         # database
+         stmt = delete(self._table)
+         stmt = stmt.where(self._table.c.judgement == Judgement.NO_JUDGEMENT.value)
+         self._get_connection().execute(stmt)
+
+         # local state
+         now = timestamp()
+         for edge in list(self.edges.values()):
+             if edge.judgement == Judgement.NO_JUDGEMENT:
+                 edge.deleted_at = now
+                 self._update_edge(edge)
+
+     def apply_statement(self, stmt: Statement) -> Statement:
+         """Canonicalise Statement Entity IDs and ID values"""
+         if stmt.entity_id is not None:
+             stmt.canonical_id = self.get_canonical(stmt.entity_id)
+         if stmt.prop_type == registry.entity.name:
+             canon_value = self.get_canonical(stmt.value)
+             if canon_value != stmt.value:
+                 if stmt.original_value is None:
+                     stmt.original_value = stmt.value
+                 # NOTE: this means the key is out of whack here now
+                 stmt.value = canon_value
+         return stmt
+
+     def dump(self, path: PathLike) -> None:
+         """Store the resolver adjacency list to a plain text JSON list."""
+         stmt = self._table.select()
+         stmt = stmt.where(self._table.c.judgement != Judgement.NO_JUDGEMENT.value)
+         stmt = stmt.order_by(self._table.c.created_at.asc())
+         with open(path, "w") as fh:
+             cursor = self._get_connection().execute(stmt)
+             for row in cursor.yield_per(20000):
+                 edge = Edge.from_dict(row._mapping)
+                 fh.write(edge.to_line())
+
+     def load(self, path: PathLike) -> None:
+         """Load edges directly into the database"""
+         edge_count = 0
+         with open(path, "r") as fh:
+             while True:
+                 line = fh.readline()
+                 if not line:
+                     break
+                 edge = Edge.from_line(line)
+                 self._register(edge)
+                 edge_count += 1
+                 if edge_count % 10000 == 0:
+                     log.info("Loaded %s edges." % edge_count)
+         log.info("Done. Loaded %s edges." % edge_count)
+         self._invalidate()
+
+     def __repr__(self) -> str:
+         parts = self._engine.url
+         url = f"{parts.drivername}://{parts.host or ''}/{parts.database}/{self._table.name}"
+         return f"<Resolver({url})>"
nomenklatura/settings.py
@@ -0,0 +1,17 @@
+ from pathlib import Path
+ from rigour.env import env_str, env_int
+
+ TESTING = False
+
+ DB_PATH = Path("nomenklatura.db").resolve()
+ DEFAULT_DB_URL = f"sqlite:///{DB_PATH.as_posix()}"
+ DB_URL = env_str("NOMENKLATURA_DB_URL", "")
+ if DB_URL is None or not len(DB_URL):
+     DB_URL = DEFAULT_DB_URL
+ DB_POOL_SIZE = env_int("NOMENKLATURA_DB_POOL_SIZE", 5)
+ DB_STMT_TIMEOUT = env_int("NOMENKLATURA_DB_STMT_TIMEOUT", 10000)
+
+ REDIS_URL = env_str("NOMENKLATURA_REDIS_URL", "")
+
+ STATEMENT_TABLE = env_str("NOMENKLATURA_STATEMENT_TABLE", "statement")
+ STATEMENT_BATCH = env_int("NOMENKLATURA_STATEMENT_BATCH", 3000)
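The module above reads its configuration from the environment once, at import time. A minimal sketch of overriding the database settings; the Postgres DSN below is a made-up example.

    import os

    # Must be set before nomenklatura.settings is first imported.
    os.environ["NOMENKLATURA_DB_URL"] = "postgresql://user:secret@localhost/nomenklatura"
    os.environ["NOMENKLATURA_DB_POOL_SIZE"] = "10"

    from nomenklatura import settings

    print(settings.DB_URL, settings.DB_POOL_SIZE)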
nomenklatura/store/__init__.py
@@ -0,0 +1,41 @@
+ from pathlib import Path
+ from typing import Optional
+
+ import orjson
+
+ from followthemoney import Dataset, StatementEntity
+ from nomenklatura.resolver import Resolver
+ from nomenklatura.store.base import Store, View, Writer
+ from nomenklatura.store.memory import MemoryStore
+ from nomenklatura.store.sql import SQLStore
+
+ SimpleMemoryStore = MemoryStore[Dataset, StatementEntity]
+
+ __all__ = [
+     "Store",
+     "Writer",
+     "View",
+     "MemoryStore",
+     "SimpleMemoryStore",
+     "SQLStore",
+     "load_entity_file_store",
+ ]
+
+
+ def load_entity_file_store(
+     path: Path,
+     resolver: Resolver[StatementEntity],
+     dataset: Optional[Dataset] = None,
+     cleaned: bool = True,
+ ) -> SimpleMemoryStore:
+     """Create a simple in-memory store by reading FtM entities from a file path."""
+     if dataset is None:
+         dataset = Dataset.make({"name": path.stem, "title": path.stem})
+     store = MemoryStore(dataset, resolver)
+     with store.writer() as writer:
+         with open(path, "rb") as fh:
+             while line := fh.readline():
+                 data = orjson.loads(line)
+                 proxy = StatementEntity.from_data(dataset, data, cleaned=cleaned)
+                 writer.add_entity(proxy)
+     return store
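A short sketch of how load_entity_file_store above could be called; "entities.json" is a hypothetical path to a line-delimited file of FtM entities (one JSON object per line, matching the readline loop above).

    from pathlib import Path

    from nomenklatura.resolver import Resolver
    from nomenklatura.store import load_entity_file_store

    resolver = Resolver.make_default()
    store = load_entity_file_store(Path("entities.json"), resolver)
    # The dataset name defaults to the file stem, i.e. "entities" in this case.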