nomenklatura_mpt-4.1.9-py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in the public registry.
- nomenklatura/__init__.py +11 -0
- nomenklatura/cache.py +194 -0
- nomenklatura/cli.py +260 -0
- nomenklatura/conflicting_match.py +80 -0
- nomenklatura/data/er-unstable.pkl +0 -0
- nomenklatura/data/regression-v1.pkl +0 -0
- nomenklatura/db.py +139 -0
- nomenklatura/delta.py +4 -0
- nomenklatura/enrich/__init__.py +94 -0
- nomenklatura/enrich/aleph.py +141 -0
- nomenklatura/enrich/common.py +219 -0
- nomenklatura/enrich/nominatim.py +72 -0
- nomenklatura/enrich/opencorporates.py +233 -0
- nomenklatura/enrich/openfigi.py +124 -0
- nomenklatura/enrich/permid.py +201 -0
- nomenklatura/enrich/wikidata.py +268 -0
- nomenklatura/enrich/yente.py +116 -0
- nomenklatura/exceptions.py +9 -0
- nomenklatura/index/__init__.py +5 -0
- nomenklatura/index/common.py +24 -0
- nomenklatura/index/entry.py +89 -0
- nomenklatura/index/index.py +170 -0
- nomenklatura/index/tokenizer.py +92 -0
- nomenklatura/judgement.py +21 -0
- nomenklatura/kv.py +40 -0
- nomenklatura/matching/__init__.py +47 -0
- nomenklatura/matching/bench.py +32 -0
- nomenklatura/matching/compare/__init__.py +0 -0
- nomenklatura/matching/compare/addresses.py +71 -0
- nomenklatura/matching/compare/countries.py +15 -0
- nomenklatura/matching/compare/dates.py +83 -0
- nomenklatura/matching/compare/gender.py +15 -0
- nomenklatura/matching/compare/identifiers.py +30 -0
- nomenklatura/matching/compare/names.py +157 -0
- nomenklatura/matching/compare/util.py +51 -0
- nomenklatura/matching/compat.py +66 -0
- nomenklatura/matching/erun/__init__.py +0 -0
- nomenklatura/matching/erun/countries.py +42 -0
- nomenklatura/matching/erun/identifiers.py +64 -0
- nomenklatura/matching/erun/misc.py +71 -0
- nomenklatura/matching/erun/model.py +110 -0
- nomenklatura/matching/erun/names.py +126 -0
- nomenklatura/matching/erun/train.py +135 -0
- nomenklatura/matching/erun/util.py +28 -0
- nomenklatura/matching/logic_v1/__init__.py +0 -0
- nomenklatura/matching/logic_v1/identifiers.py +104 -0
- nomenklatura/matching/logic_v1/model.py +76 -0
- nomenklatura/matching/logic_v1/multi.py +21 -0
- nomenklatura/matching/logic_v1/phonetic.py +142 -0
- nomenklatura/matching/logic_v2/__init__.py +0 -0
- nomenklatura/matching/logic_v2/identifiers.py +124 -0
- nomenklatura/matching/logic_v2/model.py +98 -0
- nomenklatura/matching/logic_v2/names/__init__.py +3 -0
- nomenklatura/matching/logic_v2/names/analysis.py +51 -0
- nomenklatura/matching/logic_v2/names/distance.py +181 -0
- nomenklatura/matching/logic_v2/names/magic.py +60 -0
- nomenklatura/matching/logic_v2/names/match.py +195 -0
- nomenklatura/matching/logic_v2/names/pairing.py +81 -0
- nomenklatura/matching/logic_v2/names/util.py +89 -0
- nomenklatura/matching/name_based/__init__.py +4 -0
- nomenklatura/matching/name_based/misc.py +86 -0
- nomenklatura/matching/name_based/model.py +59 -0
- nomenklatura/matching/name_based/names.py +59 -0
- nomenklatura/matching/pairs.py +42 -0
- nomenklatura/matching/regression_v1/__init__.py +0 -0
- nomenklatura/matching/regression_v1/misc.py +75 -0
- nomenklatura/matching/regression_v1/model.py +110 -0
- nomenklatura/matching/regression_v1/names.py +63 -0
- nomenklatura/matching/regression_v1/train.py +87 -0
- nomenklatura/matching/regression_v1/util.py +31 -0
- nomenklatura/matching/svm_v1/__init__.py +5 -0
- nomenklatura/matching/svm_v1/misc.py +94 -0
- nomenklatura/matching/svm_v1/model.py +168 -0
- nomenklatura/matching/svm_v1/names.py +81 -0
- nomenklatura/matching/svm_v1/train.py +186 -0
- nomenklatura/matching/svm_v1/util.py +30 -0
- nomenklatura/matching/types.py +227 -0
- nomenklatura/matching/util.py +62 -0
- nomenklatura/publish/__init__.py +0 -0
- nomenklatura/publish/dates.py +49 -0
- nomenklatura/publish/edges.py +32 -0
- nomenklatura/py.typed +0 -0
- nomenklatura/resolver/__init__.py +6 -0
- nomenklatura/resolver/common.py +2 -0
- nomenklatura/resolver/edge.py +107 -0
- nomenklatura/resolver/identifier.py +60 -0
- nomenklatura/resolver/linker.py +101 -0
- nomenklatura/resolver/resolver.py +565 -0
- nomenklatura/settings.py +17 -0
- nomenklatura/store/__init__.py +41 -0
- nomenklatura/store/base.py +130 -0
- nomenklatura/store/level.py +272 -0
- nomenklatura/store/memory.py +102 -0
- nomenklatura/store/redis_.py +131 -0
- nomenklatura/store/sql.py +219 -0
- nomenklatura/store/util.py +48 -0
- nomenklatura/store/versioned.py +371 -0
- nomenklatura/tui/__init__.py +17 -0
- nomenklatura/tui/app.py +294 -0
- nomenklatura/tui/app.tcss +52 -0
- nomenklatura/tui/comparison.py +81 -0
- nomenklatura/tui/util.py +35 -0
- nomenklatura/util.py +26 -0
- nomenklatura/versions.py +119 -0
- nomenklatura/wikidata/__init__.py +14 -0
- nomenklatura/wikidata/client.py +122 -0
- nomenklatura/wikidata/lang.py +94 -0
- nomenklatura/wikidata/model.py +139 -0
- nomenklatura/wikidata/props.py +70 -0
- nomenklatura/wikidata/qualified.py +49 -0
- nomenklatura/wikidata/query.py +66 -0
- nomenklatura/wikidata/value.py +87 -0
- nomenklatura/xref.py +125 -0
- nomenklatura_mpt-4.1.9.dist-info/METADATA +159 -0
- nomenklatura_mpt-4.1.9.dist-info/RECORD +118 -0
- nomenklatura_mpt-4.1.9.dist-info/WHEEL +4 -0
- nomenklatura_mpt-4.1.9.dist-info/entry_points.txt +3 -0
- nomenklatura_mpt-4.1.9.dist-info/licenses/LICENSE +21 -0
nomenklatura/store/sql.py
@@ -0,0 +1,219 @@
+from typing import Any, Generator, List, Optional, Set, Tuple
+
+from followthemoney import DS, SE, Property, Schema, Statement
+from sqlalchemy import Table, delete, func, select
+from sqlalchemy.engine import Engine, Transaction, create_engine
+from sqlalchemy.dialects.postgresql import insert as psql_insert
+from sqlalchemy.dialects.sqlite import insert as sqlite_insert
+from sqlalchemy.sql.selectable import Select
+
+from nomenklatura import settings
+from nomenklatura.db import get_metadata
+from nomenklatura.resolver import Linker, Identifier
+from nomenklatura.db import make_statement_table
+from nomenklatura.store import Store, View, Writer
+
+
+class SQLStore(Store[DS, SE]):
+    def __init__(
+        self,
+        dataset: DS,
+        linker: Linker[SE],
+        uri: str = settings.DB_URL,
+        **engine_kwargs: Any,
+    ):
+        super().__init__(dataset, linker)
+        if "pool_size" not in engine_kwargs:
+            engine_kwargs["pool_size"] = settings.DB_POOL_SIZE
+        # if uri.lower().startswith("sqlite"):
+        #     engine_kwargs.pop("pool_size", None)
+        metadata = get_metadata()
+        self.engine: Engine = create_engine(uri, **engine_kwargs)
+        self.table = make_statement_table(metadata)
+        metadata.create_all(self.engine, tables=[self.table], checkfirst=True)
+
+    def writer(self) -> Writer[DS, SE]:
+        return SQLWriter(self)
+
+    def view(self, scope: DS, external: bool = False) -> View[DS, SE]:
+        return SQLView(self, scope, external=external)
+
+    def _execute(
+        self, q: Select[Any], stream: bool = True
+    ) -> Generator[Any, None, None]:
+        # execute any read query against sql backend
+        with self.engine.connect() as conn:
+            if stream:
+                conn = conn.execution_options(stream_results=True)
+            cursor = conn.execute(q)
+            while rows := cursor.fetchmany(10_000):
+                yield from rows
+
+    def _iterate_stmts(
+        self, q: Select[Any], stream: bool = True
+    ) -> Generator[Statement, None, None]:
+        for row in self._execute(q, stream=stream):
+            yield Statement.from_db_row(row)
+
+    def _iterate(
+        self, q: Select[Any], stream: bool = True
+    ) -> Generator[SE, None, None]:
+        current_id = None
+        current_stmts: list[Statement] = []
+        for stmt in self._iterate_stmts(q, stream=stream):
+            entity_id = stmt.entity_id
+            if current_id is None:
+                current_id = entity_id
+            if current_id != entity_id:
+                proxy = self.assemble(current_stmts)
+                if proxy is not None:
+                    yield proxy
+                current_id = entity_id
+                current_stmts = []
+            current_stmts.append(stmt)
+        if len(current_stmts):
+            proxy = self.assemble(current_stmts)
+            if proxy is not None:
+                yield proxy
+
+
+class SQLWriter(Writer[DS, SE]):
+    BATCH_STATEMENTS = 10_000
+
+    def __init__(self, store: SQLStore[DS, SE]):
+        self.store: SQLStore[DS, SE] = store
+        self.batch: Set[Statement] = set()
+        self.conn = self.store.engine.connect()
+        self.tx: Optional[Transaction] = None
+
+    def _upsert_batch(self) -> None:
+        if not len(self.batch):
+            return
+        values = [s.to_db_row() for s in self.batch]
+        if self.tx is None:
+            self.tx = self.conn.begin()
+        if self.store.engine.dialect.name == "sqlite":
+            ilstmt = sqlite_insert(self.store.table).values(values)
+            lstmt = ilstmt.on_conflict_do_update(
+                index_elements=["id"],
+                set_=dict(
+                    canonical_id=ilstmt.excluded.canonical_id,
+                    schema=ilstmt.excluded.schema,
+                    prop_type=ilstmt.excluded.prop_type,
+                    lang=ilstmt.excluded.lang,
+                    original_value=ilstmt.excluded.original_value,
+                    last_seen=ilstmt.excluded.last_seen,
+                ),
+            )
+            self.conn.execute(lstmt)
+        elif self.store.engine.dialect.name in ("postgresql", "postgres"):
+            ipstmt = psql_insert(self.store.table).values(values)
+            pstmt = ipstmt.on_conflict_do_update(
+                index_elements=["id"],
+                set_=dict(
+                    canonical_id=ipstmt.excluded.canonical_id,
+                    schema=ipstmt.excluded.schema,
+                    prop_type=ipstmt.excluded.prop_type,
+                    lang=ipstmt.excluded.lang,
+                    original_value=ipstmt.excluded.original_value,
+                    last_seen=ipstmt.excluded.last_seen,
+                ),
+            )
+            self.conn.execute(pstmt)
+        else:
+            msg = f"Upsert not implemented for dialect {self.store.engine.dialect.name}"
+            raise NotImplementedError(msg)
+        self.batch = set()
+
+    def flush(self) -> None:
+        if len(self.batch):
+            self._upsert_batch()
+        if self.tx is not None:
+            self.tx.commit()
+            self.tx = None
+
+    def add_statement(self, stmt: Statement) -> None:
+        if stmt.entity_id is None:
+            return
+        canonical_id = self.store.linker.get_canonical(stmt.entity_id)
+        stmt.canonical_id = canonical_id
+        self.batch.add(stmt)
+        if len(self.batch) >= self.BATCH_STATEMENTS:
+            self._upsert_batch()
+
+    def pop(self, entity_id: str) -> List[Statement]:
+        if self.tx is None:
+            self.tx = self.conn.begin()
+
+        table = self.store.table
+        q = select(table)
+        q = q.where(table.c.canonical_id == entity_id)
+        statements: List[Statement] = []
+        cursor = self.conn.execute(q)
+        for row in cursor.fetchall():
+            statements.append(Statement.from_db_row(row))
+
+        q_delete = delete(table)
+        q_delete = q_delete.where(table.c.canonical_id == entity_id)
+        self.conn.execute(q_delete)
+        return statements
+
+
+class SQLView(View[DS, SE]):
+    def __init__(
+        self, store: SQLStore[DS, SE], scope: DS, external: bool = False
+    ) -> None:
+        super().__init__(store, scope, external=external)
+        self.store: SQLStore[DS, SE] = store
+
+    def get_entity(self, id: str) -> Optional[SE]:
+        table = self.store.table
+        q = select(table)
+        q = q.where(table.c.canonical_id == id)
+        q = q.where(table.c.dataset.in_(self.dataset_names))
+        for proxy in self.store._iterate(q, stream=False):
+            return proxy
+        return None
+
+    def has_entity(self, id: str) -> bool:
+        table = self.store.table
+        q = select(func.count(table.c.id))
+        q = q.where(table.c.canonical_id == id)
+        q = q.where(table.c.dataset.in_(self.dataset_names))
+        with self.store.engine.connect() as conn:
+            cursor = conn.execute(q)
+            count = cursor.scalar()
+            if count is not None and count > 0:
+                return True
+            else:
+                return False
+
+    def get_inverted(self, id: str) -> Generator[Tuple[Property, SE], None, None]:
+        table = self.store.table
+        id_ = Identifier.get(id)
+        ids = [i.id for i in self.store.linker.connected(id_)]
+        q = select(table.c.canonical_id)
+        q = q.where(table.c.prop_type == "entity")
+        q = q.where(table.c.value.in_(ids))
+        q = q.where(table.c.dataset.in_(self.dataset_names))
+        q = q.group_by(table.c.canonical_id)
+        with self.store.engine.connect() as conn:
+            cursor = conn.execute(q)
+            for (canonical_id,) in cursor.fetchall():
+                if canonical_id is None:
+                    continue
+                entity = self.get_entity(canonical_id)
+                if entity is not None:
+                    for prop, value in entity.itervalues():
+                        if value == id and prop.reverse is not None:
+                            yield prop.reverse, entity
+
+    def entities(self, include_schemata: Optional[List[Schema]] = None) -> Generator[SE, None, None]:
+        table: Table = self.store.table
+        q = select(table)
+        q = q.where(table.c.dataset.in_(self.dataset_names))
+        q = q.order_by(table.c.canonical_id)
+        for entity in self.store._iterate(q, stream=True):
+            if include_schemata is not None and entity.schema not in include_schemata:
+                continue
+            yield entity
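For orientation, a minimal usage sketch of the SQL store above. The `dataset`, `linker` and `statements` names are placeholders for application-specific setup (a followthemoney dataset, a nomenklatura Linker, and an iterable of Statement objects); only the SQLStore calls themselves are taken from the code:

    from nomenklatura.store.sql import SQLStore

    # Assumed to exist in the surrounding application:
    #   dataset: a followthemoney dataset object
    #   linker: a nomenklatura.resolver.Linker
    #   statements: an iterable of followthemoney Statement objects
    store = SQLStore(dataset, linker, uri="sqlite:///statements.db")

    writer = store.writer()
    for stmt in statements:
        writer.add_statement(stmt)  # upserts are batched 10,000 rows at a time
    writer.flush()                  # commits the open transaction

    view = store.view(dataset)
    entity = view.get_entity("some-canonical-id")

Note that `flush()` is required to commit: `add_statement` only opens a transaction lazily and executes upserts once the batch threshold is reached.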
nomenklatura/store/util.py
@@ -0,0 +1,48 @@
+import orjson
+
+from followthemoney import Statement
+from followthemoney.statement.util import pack_prop, unpack_prop
+
+
+def pack_statement(stmt: Statement) -> bytes:
+    values = (
+        stmt.id,
+        stmt.entity_id,
+        stmt.dataset,
+        pack_prop(stmt.schema, stmt.prop),
+        stmt.value,
+        stmt.lang,
+        stmt.original_value,
+        stmt.first_seen,
+        stmt.last_seen,
+    )
+    return orjson.dumps(values)
+
+
+def unpack_statement(data: bytes, canonical_id: str, external: bool) -> Statement:
+    (
+        id,
+        entity_id,
+        dataset,
+        prop_id,
+        value,
+        lang,
+        original_value,
+        first_seen,
+        last_seen,
+    ) = orjson.loads(data)
+    schema, _, prop = unpack_prop(prop_id)
+    return Statement(
+        id=id,
+        entity_id=entity_id,
+        prop=prop,
+        schema=schema,
+        value=value,
+        lang=lang,
+        dataset=dataset,
+        original_value=original_value,
+        first_seen=first_seen,
+        last_seen=last_seen,
+        canonical_id=canonical_id,
+        external=external,
+    )
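These helpers serialize a statement to a compact JSON array and back. The `canonical_id` and `external` flag are deliberately not serialized, so the caller supplies them again when unpacking. A round-trip sketch (`stmt` is assumed to be an existing followthemoney Statement):

    data = pack_statement(stmt)
    copy = unpack_statement(data, canonical_id="some-canonical-id", external=False)
    assert copy.entity_id == stmt.entity_id
    assert copy.prop == stmt.prop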
nomenklatura/store/versioned.py
@@ -0,0 +1,371 @@
+import orjson
+import logging
+from redis.client import Redis
+from typing import Generator, List, Optional, Set, Tuple, Dict
+from followthemoney import DS, SE, Schema, registry, Property, Statement
+from followthemoney.statement.util import pack_prop, unpack_prop
+
+from nomenklatura.kv import b, bv, get_redis, close_redis
+from nomenklatura.versions import Version
+from nomenklatura.resolver import Linker, Identifier, StrIdent
+from nomenklatura.store.base import Store, View, Writer
+
+log = logging.getLogger(__name__)
+
+
+def _pack_statement(stmt: Statement) -> bytes:
+    values = (
+        stmt.id,
+        stmt.entity_id,
+        stmt.dataset,
+        pack_prop(stmt.schema, stmt.prop),
+        stmt.value,
+        stmt.lang or 0,
+        stmt.original_value or 0,
+        stmt.first_seen,
+        stmt.last_seen,
+        1 if stmt.external else 0,
+    )
+    return orjson.dumps(values)
+
+
+def _unpack_statement(data: bytes, canonical_id: Optional[str] = None) -> Statement:
+    (
+        id,
+        entity_id,
+        dataset,
+        prop_id,
+        value,
+        lang,
+        original_value,
+        first_seen,
+        last_seen,
+        external,
+    ) = orjson.loads(data)
+    schema, _, prop = unpack_prop(prop_id)
+    return Statement(
+        id=id,
+        entity_id=entity_id,
+        prop=prop,
+        schema=schema,
+        value=value,
+        lang=None if lang == 0 else lang,
+        dataset=dataset,
+        original_value=None if original_value == 0 else original_value,
+        first_seen=first_seen,
+        last_seen=last_seen,
+        canonical_id=canonical_id or entity_id,
+        external=external == 1,
+    )
+
+
+class VersionedRedisStore(Store[DS, SE]):
+    def __init__(
+        self,
+        dataset: DS,
+        linker: Linker[SE],
+        db: Optional["Redis[bytes]"] = None,
+    ):
+        super().__init__(dataset, linker)
+        if db is None:
+            db = get_redis()
+        self.db = db
+
+    def writer(
+        self,
+        dataset: Optional[DS] = None,
+        version: Optional[str] = None,
+        timestamps: bool = False,
+    ) -> "VersionedRedisWriter[DS, SE]":
+        if version is None:
+            version = Version.new().id
+        dataset = dataset or self.dataset
+        return VersionedRedisWriter(
+            self,
+            dataset=dataset,
+            version=version,
+            timestamps=timestamps,
+        )
+
+    def view(
+        self, scope: DS, external: bool = False, versions: Dict[str, str] = {}
+    ) -> "VersionedRedisView[DS, SE]":
+        return VersionedRedisView(self, scope, external=external, versions=versions)
+
+    def update(self, id: StrIdent) -> None:
+        # Noop because the VersionedStore is not resolved.
+        return
+
+    def get_latest(self, dataset: str) -> Optional[str]:
+        """Get the latest version of a dataset in the store."""
+        val = self.db.get(b(f"ds:{dataset}:latest"))
+        return val.decode("utf-8") if val is not None else None
+
+    def get_history(self, dataset: str) -> List[str]:
+        """List all versions of a dataset present in the store."""
+        values = self.db.lrange(f"ds:{dataset}:history", 0, -1)
+        return [v.decode("utf-8") for v in values]
+
+    def has_version(self, dataset: str, version: str) -> bool:
+        """Check if a specific version of a dataset exists in the store."""
+        return self.db.exists(f"ents:{dataset}:{version}") > 0
+
+    def release_version(self, dataset: str, version: str) -> None:
+        """Release the given version of the dataset (i.e. tag it as the latest
+        version in the relevant lookup key)."""
+        history_key = b(f"ds:{dataset}:history")
+        idx = self.db.lpos(history_key, b(version))
+        if idx is None:
+            self.db.lpush(history_key, b(version))
+        latest = self.db.lindex(history_key, 0)
+        if latest is not None:
+            self.db.set(b(f"ds:{dataset}:latest"), latest)
+        log.info("Released store version: %s (%s)", dataset, version)
+
+    def drop_version(self, dataset: str, version: str) -> None:
+        """Delete all data associated with a specific version of a dataset."""
+        pipeline = self.db.pipeline()
+        cmds = 0
+        for prefix in ["stmt", "ents", "inv"]:
+            query = f"{prefix}:{dataset}:{version}*"
+            for key in self.db.scan_iter(query):
+                pipeline.delete(key)
+                cmds += 1
+                if cmds > 1_000:
+                    pipeline.execute()
+                    pipeline = self.db.pipeline()
+                    cmds = 0
+        if cmds > 0:
+            pipeline.execute()
+
+        # TODO: do we even want to remove the version from the history list?
+        self.db.lrem(f"ds:{dataset}:history", 0, b(version))
+        latest_key = f"ds:{dataset}:latest"
+        if b(version) == self.db.get(latest_key):
+            previous = self.db.lindex(b(f"ds:{dataset}:history"), 0)
+            if previous is not None:
+                self.db.set(latest_key, previous)
+            else:
+                self.db.delete(latest_key)
+        log.info("Dropped store version: %s (%s)", dataset, version)
+
+    def close(self) -> None:
+        close_redis()
+
+
+class VersionedRedisWriter(Writer[DS, SE]):
+    BATCH_STATEMENTS = 2_000
+
+    def __init__(
+        self,
+        store: VersionedRedisStore[DS, SE],
+        dataset: DS,
+        version: str,
+        timestamps: bool = False,
+    ):
+        self.version = version
+        self.dataset = dataset
+        self.timestamps = timestamps
+        self.ver = f"{dataset.name}:{version}"
+        self.store: VersionedRedisStore[DS, SE] = store
+        self.prev = store.get_latest(dataset.name)
+        self.buffer: List[Statement] = []
+
+    def __enter__(self) -> "VersionedRedisWriter[DS, SE]":
+        return self
+
+    def flush(self) -> None:
+        db = self.store.db
+        pipeline = db.pipeline()
+
+        statements: Dict[str, Set[Statement]] = {}
+        for stmt in self.buffer:
+            if stmt.entity_id not in statements:
+                statements[stmt.entity_id] = set()
+            statements[stmt.entity_id].add(stmt)
+
+        if len(statements) == 0:
+            return
+
+        # Merge with previous version to get accurate first_seen timestamps.
+        # Keys follow the stmt:{dataset}:{version}:{entity_id} layout.
+        if self.timestamps and self.prev:
+            keys = [b(f"stmt:{self.dataset.name}:{self.prev}:{e}") for e in statements.keys()]
+            for v in db.sunion(keys):
+                pstmt = _unpack_statement(bv(v))
+                for stmt in self.buffer:
+                    if pstmt.id == stmt.id:
+                        stmt.first_seen = pstmt.first_seen
+                        break
+
+        for entity_id, stmts in statements.items():
+            b_entity_id = b(entity_id)
+            pipeline.sadd(b(f"ents:{self.ver}"), b_entity_id)
+            values = [_pack_statement(s) for s in stmts]
+            pipeline.sadd(f"stmt:{self.ver}:{entity_id}", *values)
+
+            for stmt in stmts:
+                if stmt.prop_type == registry.entity.name:
+                    pipeline.sadd(b(f"inv:{self.ver}:{stmt.value}"), b_entity_id)
+
+        pipeline.execute()
+        self.buffer = []
+
+    def release(self) -> None:
+        """Release the current version of the dataset (i.e. tag it as the latest
+        version in the relevant lookup key)."""
+        self.store.release_version(self.dataset.name, self.version)
+
+    def close(self) -> None:
+        self.release()
+        self.store.close()
+
+    def add_statement(self, stmt: Statement) -> None:
+        if stmt.entity_id is None:
+            return
+        self.buffer.append(stmt)
+        if len(self.buffer) >= self.BATCH_STATEMENTS:
+            self.flush()
+
+    def pop(self, entity_id: str) -> List[Statement]:
+        raise NotImplementedError()
+
+
+class VersionedRedisView(View[DS, SE]):
+    def __init__(
+        self,
+        store: VersionedRedisStore[DS, SE],
+        scope: DS,
+        external: bool = False,
+        versions: Dict[str, str] = {},
+    ) -> None:
+        super().__init__(store, scope, external=external)
+        self.store: VersionedRedisStore[DS, SE] = store
+
+        # Get the latest version for each dataset in the scope
+        self.vers: List[Tuple[str, str]] = []
+        for ds in scope.leaf_names:
+            version = versions.get(ds, self.store.get_latest(ds))
+            if version is not None:
+                self.vers.append((ds, version))
+
+    def _get_stmt_keys(self, entity_id: str) -> List[str]:
+        keys: List[str] = []
+        ident = Identifier.get(entity_id)
+        for id in self.store.linker.connected(ident):
+            keys.extend([f"stmt:{d}:{v}:{id}" for d, v in self.vers])
+        return keys
+
+    def has_entity(self, id: str) -> bool:
+        # FIXME: this implementation does not account for the `external` flag
+        # correctly because it does not check the `stmt.external` field for
+        # each statement.
+        return self.store.db.exists(*self._get_stmt_keys(id)) > 0
+
+    def _get_statements(self, id: str) -> Generator[Statement, None, None]:
+        keys = self._get_stmt_keys(id)
+        if len(keys) == 0:
+            return None
+        elif len(keys) == 1:
+            stmts = self.store.db.smembers(keys[0])
+        else:
+            stmts = {bv(s) for s in self.store.db.sunion(keys)}
+        for v in stmts:
+            stmt = _unpack_statement(bv(v), id)
+            yield stmt
+
+    def get_timestamps(self, id: str) -> Dict[str, str]:
+        """Get the first seen timestamps associated with all statements of an entity.
+
+        Returns a dictionary mapping statement IDs to their first seen timestamps.
+        This can be used by an ETL to generate continuous entity histories.
+        """
+        timestamps: Dict[str, str] = {}
+        for stmt in self._get_statements(id):
+            if stmt.id is not None and stmt.first_seen is not None:
+                timestamps[stmt.id] = stmt.first_seen
+        return timestamps
+
+    def get_entity(self, id: str) -> Optional[SE]:
+        statements: List[Statement] = []
+        for stmt in self._get_statements(id):
+            if not stmt.external or self.external:
+                stmt.canonical_id = self.store.linker.get_canonical(stmt.entity_id)
+                if stmt.prop_type == registry.entity.name:
+                    stmt.value = self.store.linker.get_canonical(stmt.value)
+                statements.append(stmt)
+        return self.store.assemble(statements)
+
+    def get_inverted(self, id: str) -> Generator[Tuple[Property, SE], None, None]:
+        keys: List[str] = []
+        ident = Identifier.get(id)
+        for ent_id in self.store.linker.connected(ident):
+            keys.extend([f"inv:{d}:{v}:{ent_id}" for d, v in self.vers])
+        if len(keys) == 0:
+            return
+        refs = (
+            {bv(v) for v in self.store.db.sunion(keys)}
+            if len(keys) > 1
+            else self.store.db.smembers(keys[0])
+        )
+        entities: Set[str] = set()
+        for v in refs:
+            entity_id = v.decode("utf-8")
+            entities.add(self.store.linker.get_canonical(entity_id))
+        for entity_id in entities:
+            entity = self.get_entity(entity_id)
+            if entity is None:
+                continue
+            for prop, value in entity.itervalues():
+                if value == id and prop.reverse is not None:
+                    yield prop.reverse, entity
+
+    def statements(self, resolve: bool = False) -> Generator[Statement, None, None]:
+        """Iterate over all statements in the view. If `resolve` is set to `True`,
+        canonical IDs are applied to the statement and its value.
+
+        NOTE: The `external` flag of the view will be used to filter statements, too.
+        """
+        for ds, ver in self.vers:
+            for id in self.store.db.sscan_iter(b(f"ents:{ds}:{ver}")):
+                entity_id = id.decode("utf-8")
+                stmt_key = f"stmt:{ds}:{ver}:{entity_id}"
+                for stmt_text in self.store.db.smembers(b(stmt_key)):
+                    stmt = _unpack_statement(stmt_text, entity_id)
+                    if stmt.external and not self.external:
+                        continue
+                    if resolve:
+                        stmt = self.store.linker.apply_statement(stmt)
+                    yield stmt
+
+    def entities(self, include_schemata: Optional[List[Schema]] = None) -> Generator[SE, None, None]:
+        if len(self.vers) == 0:
+            return
+        if len(self.vers) == 1:
+            scope_name = b(f"ents:{self.vers[0][0]}:{self.vers[0][1]}")
+        else:
+            version = Version.new().id + ":iter"
+            scope_name = b(f"ents:{self.scope.name}:{version}")
+            parts = [b(f"ents:{d}:{v}") for d, v in self.vers]
+            self.store.db.sunionstore(scope_name, parts)
+
+        # Keep track of canonical entities to avoid yielding the same
+        # de-duplicated entity multiple times. This intrinsically leaks
+        # memory, so we're being careful to only record entity IDs
+        # that are part of a cluster with more than one ID.
+        try:
+            seen: Set[str] = set()
+            for id in self.store.db.sscan_iter(scope_name):
+                entity_id = id.decode("utf-8")
+                ident = Identifier.get(entity_id)
+                connected = self.store.linker.connected(ident)
+                if len(connected) > 1:
+                    canonical_id = max(connected).id
+                    if canonical_id in seen:
+                        continue
+                    seen.add(canonical_id)
+                entity = self.get_entity(entity_id)
+                if entity is not None:
+                    if include_schemata is not None and entity.schema not in include_schemata:
+                        continue
+                    yield entity
+        finally:
+            if len(self.vers) > 1:
+                self.store.db.delete(scope_name)
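For reference, the Redis key layout implied by the code above:

- `ds:{dataset}:latest` — the latest released version id of a dataset
- `ds:{dataset}:history` — a list of released version ids
- `ents:{dataset}:{version}` — the set of entity IDs in a dataset version
- `stmt:{dataset}:{version}:{entity_id}` — the set of packed statements for one entity
- `inv:{dataset}:{version}:{referenced_id}` — the entity IDs that reference another entity

A minimal writer/view sketch; as in the SQL example above, `dataset`, `linker` and `statements` are assumed application-side values:

    from nomenklatura.store.versioned import VersionedRedisStore

    store = VersionedRedisStore(dataset, linker)
    writer = store.writer(timestamps=True)  # a fresh Version id is generated
    for stmt in statements:
        writer.add_statement(stmt)          # flushed in pipelines of 2,000
    writer.flush()
    writer.release()                        # tag this version as ds:<name>:latest

    view = store.view(dataset)              # reads the released version per dataset
    entity = view.get_entity("some-canonical-id")

Until `release()` is called, readers keep seeing the previously released version, which is what makes writes effectively atomic per dataset version.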
nomenklatura/tui/__init__.py
@@ -0,0 +1,17 @@
+from typing import Optional
+from followthemoney import DS, SE
+
+from nomenklatura.store import Store
+
+from nomenklatura.tui.app import DedupeApp, DedupeState
+from nomenklatura.resolver import Resolver
+
+__all__ = ["dedupe_ui"]
+
+
+def dedupe_ui(
+    resolver: Resolver[SE], store: Store[DS, SE], url_base: Optional[str] = None
+) -> None:
+    app = DedupeApp()
+    app.dedupe = DedupeState(resolver, store, url_base=url_base)
+    app.run()
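Usage is a single call once a resolver and store exist (sketch; `resolver` and `store` as constructed in the examples above):

    from nomenklatura.tui import dedupe_ui

    dedupe_ui(resolver, store)  # blocks while the Textual app runs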