nomenklatura_mpt-4.1.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nomenklatura/__init__.py +11 -0
- nomenklatura/cache.py +194 -0
- nomenklatura/cli.py +260 -0
- nomenklatura/conflicting_match.py +80 -0
- nomenklatura/data/er-unstable.pkl +0 -0
- nomenklatura/data/regression-v1.pkl +0 -0
- nomenklatura/db.py +139 -0
- nomenklatura/delta.py +4 -0
- nomenklatura/enrich/__init__.py +94 -0
- nomenklatura/enrich/aleph.py +141 -0
- nomenklatura/enrich/common.py +219 -0
- nomenklatura/enrich/nominatim.py +72 -0
- nomenklatura/enrich/opencorporates.py +233 -0
- nomenklatura/enrich/openfigi.py +124 -0
- nomenklatura/enrich/permid.py +201 -0
- nomenklatura/enrich/wikidata.py +268 -0
- nomenklatura/enrich/yente.py +116 -0
- nomenklatura/exceptions.py +9 -0
- nomenklatura/index/__init__.py +5 -0
- nomenklatura/index/common.py +24 -0
- nomenklatura/index/entry.py +89 -0
- nomenklatura/index/index.py +170 -0
- nomenklatura/index/tokenizer.py +92 -0
- nomenklatura/judgement.py +21 -0
- nomenklatura/kv.py +40 -0
- nomenklatura/matching/__init__.py +47 -0
- nomenklatura/matching/bench.py +32 -0
- nomenklatura/matching/compare/__init__.py +0 -0
- nomenklatura/matching/compare/addresses.py +71 -0
- nomenklatura/matching/compare/countries.py +15 -0
- nomenklatura/matching/compare/dates.py +83 -0
- nomenklatura/matching/compare/gender.py +15 -0
- nomenklatura/matching/compare/identifiers.py +30 -0
- nomenklatura/matching/compare/names.py +157 -0
- nomenklatura/matching/compare/util.py +51 -0
- nomenklatura/matching/compat.py +66 -0
- nomenklatura/matching/erun/__init__.py +0 -0
- nomenklatura/matching/erun/countries.py +42 -0
- nomenklatura/matching/erun/identifiers.py +64 -0
- nomenklatura/matching/erun/misc.py +71 -0
- nomenklatura/matching/erun/model.py +110 -0
- nomenklatura/matching/erun/names.py +126 -0
- nomenklatura/matching/erun/train.py +135 -0
- nomenklatura/matching/erun/util.py +28 -0
- nomenklatura/matching/logic_v1/__init__.py +0 -0
- nomenklatura/matching/logic_v1/identifiers.py +104 -0
- nomenklatura/matching/logic_v1/model.py +76 -0
- nomenklatura/matching/logic_v1/multi.py +21 -0
- nomenklatura/matching/logic_v1/phonetic.py +142 -0
- nomenklatura/matching/logic_v2/__init__.py +0 -0
- nomenklatura/matching/logic_v2/identifiers.py +124 -0
- nomenklatura/matching/logic_v2/model.py +98 -0
- nomenklatura/matching/logic_v2/names/__init__.py +3 -0
- nomenklatura/matching/logic_v2/names/analysis.py +51 -0
- nomenklatura/matching/logic_v2/names/distance.py +181 -0
- nomenklatura/matching/logic_v2/names/magic.py +60 -0
- nomenklatura/matching/logic_v2/names/match.py +195 -0
- nomenklatura/matching/logic_v2/names/pairing.py +81 -0
- nomenklatura/matching/logic_v2/names/util.py +89 -0
- nomenklatura/matching/name_based/__init__.py +4 -0
- nomenklatura/matching/name_based/misc.py +86 -0
- nomenklatura/matching/name_based/model.py +59 -0
- nomenklatura/matching/name_based/names.py +59 -0
- nomenklatura/matching/pairs.py +42 -0
- nomenklatura/matching/regression_v1/__init__.py +0 -0
- nomenklatura/matching/regression_v1/misc.py +75 -0
- nomenklatura/matching/regression_v1/model.py +110 -0
- nomenklatura/matching/regression_v1/names.py +63 -0
- nomenklatura/matching/regression_v1/train.py +87 -0
- nomenklatura/matching/regression_v1/util.py +31 -0
- nomenklatura/matching/svm_v1/__init__.py +5 -0
- nomenklatura/matching/svm_v1/misc.py +94 -0
- nomenklatura/matching/svm_v1/model.py +168 -0
- nomenklatura/matching/svm_v1/names.py +81 -0
- nomenklatura/matching/svm_v1/train.py +186 -0
- nomenklatura/matching/svm_v1/util.py +30 -0
- nomenklatura/matching/types.py +227 -0
- nomenklatura/matching/util.py +62 -0
- nomenklatura/publish/__init__.py +0 -0
- nomenklatura/publish/dates.py +49 -0
- nomenklatura/publish/edges.py +32 -0
- nomenklatura/py.typed +0 -0
- nomenklatura/resolver/__init__.py +6 -0
- nomenklatura/resolver/common.py +2 -0
- nomenklatura/resolver/edge.py +107 -0
- nomenklatura/resolver/identifier.py +60 -0
- nomenklatura/resolver/linker.py +101 -0
- nomenklatura/resolver/resolver.py +565 -0
- nomenklatura/settings.py +17 -0
- nomenklatura/store/__init__.py +41 -0
- nomenklatura/store/base.py +130 -0
- nomenklatura/store/level.py +272 -0
- nomenklatura/store/memory.py +102 -0
- nomenklatura/store/redis_.py +131 -0
- nomenklatura/store/sql.py +219 -0
- nomenklatura/store/util.py +48 -0
- nomenklatura/store/versioned.py +371 -0
- nomenklatura/tui/__init__.py +17 -0
- nomenklatura/tui/app.py +294 -0
- nomenklatura/tui/app.tcss +52 -0
- nomenklatura/tui/comparison.py +81 -0
- nomenklatura/tui/util.py +35 -0
- nomenklatura/util.py +26 -0
- nomenklatura/versions.py +119 -0
- nomenklatura/wikidata/__init__.py +14 -0
- nomenklatura/wikidata/client.py +122 -0
- nomenklatura/wikidata/lang.py +94 -0
- nomenklatura/wikidata/model.py +139 -0
- nomenklatura/wikidata/props.py +70 -0
- nomenklatura/wikidata/qualified.py +49 -0
- nomenklatura/wikidata/query.py +66 -0
- nomenklatura/wikidata/value.py +87 -0
- nomenklatura/xref.py +125 -0
- nomenklatura_mpt-4.1.9.dist-info/METADATA +159 -0
- nomenklatura_mpt-4.1.9.dist-info/RECORD +118 -0
- nomenklatura_mpt-4.1.9.dist-info/WHEEL +4 -0
- nomenklatura_mpt-4.1.9.dist-info/entry_points.txt +3 -0
- nomenklatura_mpt-4.1.9.dist-info/licenses/LICENSE +21 -0
nomenklatura/store/base.py
@@ -0,0 +1,130 @@
+from types import TracebackType
+from typing import Optional, Generator, List, Tuple, Generic, Type, cast
+from followthemoney import Schema, registry, Property, DS, Statement
+from followthemoney import StatementEntity, SE
+from followthemoney.statement.util import get_prop_type
+
+from nomenklatura.resolver import Linker, StrIdent
+
+
+class Store(Generic[DS, SE]):
+    """A data storage and retrieval mechanism for statement-based entity data.
+    Essentially, this is a triple store which can be implemented using various
+    backends."""
+
+    def __init__(self, dataset: DS, linker: Linker[SE]):
+        self.dataset = dataset
+        self.linker = linker
+        self.entity_class = cast(Type[SE], StatementEntity)
+
+    def writer(self) -> "Writer[DS, SE]":
+        raise NotImplementedError()
+
+    def view(self, scope: DS, external: bool = False) -> "View[DS, SE]":
+        raise NotImplementedError()
+
+    def default_view(self, external: bool = False) -> "View[DS, SE]":
+        return self.view(self.dataset, external=external)
+
+    def assemble(self, statements: List[Statement]) -> Optional[SE]:
+        if not len(statements):
+            return None
+        for stmt in statements:
+            if get_prop_type(stmt.schema, stmt.prop) == registry.entity.name:
+                stmt.value = self.linker.get_canonical(stmt.value)
+        entity = self.entity_class.from_statements(self.dataset, statements)
+        if entity.id is not None:
+            entity.extra_referents.update(self.linker.get_referents(entity.id))
+        return entity
+
+    def update(self, id: StrIdent) -> None:
+        canonical_id = self.linker.get_canonical(id)
+        with self.writer() as writer:
+            for referent in self.linker.get_referents(canonical_id):
+                for stmt in writer.pop(referent):
+                    stmt.canonical_id = canonical_id
+                    writer.add_statement(stmt)
+
+    def close(self) -> None:
+        pass
+
+    def __repr__(self) -> str:
+        return f"<{type(self).__name__}({self.dataset.name!r})>"
+
+
+class Writer(Generic[DS, SE]):
+    """Bulk writing operations."""
+
+    def __init__(self, store: Store[DS, SE]):
+        self.store = store
+
+    def add_statement(self, stmt: Statement) -> None:
+        raise NotImplementedError()
+
+    def add_entity(self, entity: SE) -> None:
+        for stmt in entity.statements:
+            self.add_statement(stmt)
+
+    def pop(self, entity_id: str) -> List[Statement]:
+        raise NotImplementedError()
+
+    def flush(self) -> None:
+        pass
+
+    def close(self) -> None:
+        self.store.close()
+
+    def __enter__(self) -> "Writer[DS, SE]":
+        return self
+
+    def __exit__(
+        self,
+        type: Optional[Type[BaseException]],
+        value: Optional[BaseException],
+        traceback: Optional[TracebackType],
+    ) -> None:
+        self.flush()
+
+    def __repr__(self) -> str:
+        return f"<{type(self).__name__}({self.store!r})>"
+
+
+class View(Generic[DS, SE]):
+    def __init__(self, store: Store[DS, SE], scope: DS, external: bool = False):
+        self.store = store
+        self.scope = scope
+        self.dataset_names = scope.leaf_names
+        self.external = external
+
+    def has_entity(self, id: str) -> bool:
+        raise NotImplementedError()
+
+    def get_entity(self, id: str) -> Optional[SE]:
+        raise NotImplementedError()
+
+    def get_inverted(self, id: str) -> Generator[Tuple[Property, SE], None, None]:
+        raise NotImplementedError()
+
+    def get_adjacent(
+        self, entity: SE, inverted: bool = True
+    ) -> Generator[Tuple[Property, SE], None, None]:
+        for prop, value in entity.itervalues():
+            if prop.type == registry.entity:
+                child = self.get_entity(value)
+                if child is not None:
+                    yield prop, child
+
+        if inverted and entity.id is not None:
+            for prop, adjacent in self.get_inverted(entity.id):
+                yield prop, adjacent
+
+    def entities(
+        self, include_schemata: Optional[List[Schema]] = None
+    ) -> Generator[SE, None, None]:
+        """Iterate over all entities in the view.
+
+        If `include_schemata` is provided, only entities of the provided schemata
+        will be returned. Note that `include_schemata` will not be expanded via
+        "is_a" relationships."""
+        raise NotImplementedError()
+
+    def __repr__(self) -> str:
+        return f"<{type(self).__name__}({self.scope.name!r})>"
nomenklatura/store/level.py
@@ -0,0 +1,272 @@
+#
+# LevelDB-based store for Nomenklatura.
+# A lot of the code in this module is extremely performance-sensitive, so it is unrolled and
+# doesn't use helper functions in some places where it would otherwise be more readable.
+#
+# Specific examples:
+# * Not calling a helper to byte-encode values.
+# * Not having a helper method for building entities.
+import gc
+import orjson
+import logging
+from pathlib import Path
+from typing import Any, Generator, List, Optional, Set, Tuple
+from rigour.env import ENCODING as E
+
+import plyvel  # type: ignore
+from followthemoney import model, DS, SE, Schema, registry, Property, Statement
+from followthemoney.exc import InvalidData
+from followthemoney.statement.util import get_prop_type
+
+from nomenklatura.resolver import Linker
+from nomenklatura.store.base import Store, View, Writer
+
+log = logging.getLogger(__name__)
+MAX_OPEN_FILES = 1000
+
+
+def unpack_statement(
+    keys: List[str],
+    data: bytes,
+) -> Statement:
+    _, canonical_id, ext, dataset, schema, stmt_id = keys
+    (
+        entity_id,
+        prop,
+        value,
+        lang,
+        original_value,
+        origin,
+        first_seen,
+        last_seen,
+    ) = orjson.loads(data)
+    return Statement(
+        id=stmt_id,
+        entity_id=entity_id,
+        prop=prop,
+        schema=schema,
+        value=value,
+        lang=None if lang == 0 else lang,
+        dataset=dataset,
+        original_value=None if original_value == 0 else original_value,
+        origin=None if origin == 0 else origin,
+        first_seen=first_seen,
+        last_seen=last_seen,
+        canonical_id=canonical_id,
+        external=ext == "x",
+    )
+
+
+class LevelDBStore(Store[DS, SE]):
+    def __init__(self, dataset: DS, linker: Linker[SE], path: Path):
+        super().__init__(dataset, linker)
+        self.path = path
+        self.db = plyvel.DB(
+            path.as_posix(),
+            create_if_missing=True,
+            max_open_files=MAX_OPEN_FILES,
+        )
+
+    def optimize(self) -> None:
+        """Optimize the database by compacting it."""
+        self.db.compact_range()
+        self.db.close()
+        gc.collect()
+        self.db = plyvel.DB(
+            self.path.as_posix(),
+            create_if_missing=False,
+            max_open_files=MAX_OPEN_FILES,
+        )
+        log.info("Optimized LevelDB at %s", self.path)
+
+    def writer(self) -> Writer[DS, SE]:
+        return LevelDBWriter(self)
+
+    def view(self, scope: DS, external: bool = False) -> View[DS, SE]:
+        return LevelDBView(self, scope, external=external)
+
+    def close(self) -> None:
+        self.db.close()
+
+
+class LevelDBWriter(Writer[DS, SE]):
+    BATCH_STATEMENTS = 100_000
+
+    def __init__(self, store: LevelDBStore[DS, SE]):
+        self.store: LevelDBStore[DS, SE] = store
+        self.batch: Optional[Any] = None
+        self.batch_size = 0
+
+    def flush(self) -> None:
+        if self.batch is not None:
+            self.batch.write()
+        self.batch = None
+        self.batch_size = 0
+
+    def add_statement(self, stmt: Statement) -> None:
+        if stmt.entity_id is None:
+            return
+        if self.batch_size >= self.BATCH_STATEMENTS:
+            self.flush()
+        if self.batch is None:
+            self.batch = self.store.db.write_batch()
+        canonical_id = self.store.linker.get_canonical(stmt.entity_id)
+        stmt.canonical_id = canonical_id
+
+        ext = "x" if stmt.external else ""
+        key = f"s:{canonical_id}:{ext}:{stmt.dataset}:{stmt.schema}:{stmt.id}".encode(E)
+        values = (
+            stmt.entity_id,
+            stmt.prop,
+            stmt.value,
+            stmt.lang or 0,
+            stmt.original_value or 0,
+            stmt.origin or 0,
+            stmt.first_seen,
+            stmt.last_seen,
+        )
+        data = orjson.dumps(values)
+        self.batch.put(key, data)
+        if get_prop_type(stmt.schema, stmt.prop) == registry.entity.name:
+            vc = self.store.linker.get_canonical(stmt.value)
+            key = f"i:{vc}:{stmt.canonical_id}".encode(E)
+            self.batch.put(key, b"")
+
+        self.batch_size += 1
+
+    def pop(self, entity_id: str) -> List[Statement]:
+        if self.batch_size >= self.BATCH_STATEMENTS:
+            self.flush()
+        if self.batch is None:
+            self.batch = self.store.db.write_batch()
+        statements: List[Statement] = []
+        datasets: Set[str] = set()
+        prefix = f"s:{entity_id}:".encode(E)
+        with self.store.db.iterator(prefix=prefix) as it:
+            for k, v in it:
+                self.batch.delete(k)
+                stmt = unpack_statement(k.decode(E).split(":"), v)
+                statements.append(stmt)
+                datasets.add(stmt.dataset)
+
+                if stmt.prop_type == registry.entity.name:
+                    vc = self.store.linker.get_canonical(stmt.value)
+                    self.batch.delete(f"i:{vc}:{entity_id}".encode(E))
+        return list(statements)
+
+
+class LevelDBView(View[DS, SE]):
+    def __init__(
+        self, store: LevelDBStore[DS, SE], scope: DS, external: bool = False
+    ) -> None:
+        super().__init__(store, scope, external=external)
+        self.store: LevelDBStore[DS, SE] = store
+        self.dataset_names: Set[str] = set(scope.dataset_names)
+
+    def has_entity(self, id: str) -> bool:
+        prefix = f"s:{id}:".encode(E)
+        with self.store.db.iterator(prefix=prefix, include_value=False) as it:
+            for k in it:
+                _, _, ext, dataset, _, _ = k.decode(E).split(":")
+                if dataset not in self.dataset_names:
+                    continue
+                if ext == "x" and not self.external:
+                    continue
+                return True
+        return False
+
+    def get_entity(self, id: str) -> Optional[SE]:
+        statements: List[Statement] = []
+        prefix = f"s:{id}:".encode(E)
+        with self.store.db.iterator(prefix=prefix) as it:
+            for k, v in it:
+                keys = k.decode(E).split(":")
+                _, _, ext, dataset, _, _ = keys
+                if dataset not in self.dataset_names:
+                    continue
+                if ext == "x" and not self.external:
+                    continue
+                statements.append(unpack_statement(keys, v))
+        return self.store.assemble(statements)
+
+    def get_inverted(self, id: str) -> Generator[Tuple[Property, SE], None, None]:
+        prefix = f"i:{id}:".encode(E)
+        with self.store.db.iterator(prefix=prefix, include_value=False) as it:
+            for k in it:
+                _, _, ref = k.decode(E).split(":")
+                entity = self.get_entity(ref)
+                if entity is None:
+                    continue
+                for prop, value in entity.itervalues():
+                    if value == id and prop.reverse is not None:
+                        yield prop.reverse, entity
+
+    def entities(
+        self, include_schemata: Optional[List[Schema]] = None
+    ) -> Generator[SE, None, None]:
+        with self.store.db.iterator(prefix=b"s:", fill_cache=False) as it:
+            current_id: Optional[str] = None
+            current_schema: Optional[Schema] = None
+            current_fail: bool = False
+            statements: List[Statement] = []
+            for k, v in it:
+                keys = k.decode(E).split(":")
+                _, canonical_id, ext, dataset, schema, _ = keys
+
+                if ext == "x" and not self.external:
+                    continue
+                if dataset not in self.dataset_names:
+                    continue
+
+                # If we're seeing a new canonical ID, yield the previous entity
+                if canonical_id != current_id:
+                    if (
+                        include_schemata is not None
+                        and current_schema not in include_schemata
+                    ):
+                        statements = []
+                    if len(statements) > 0 and not current_fail:
+                        entity = self.store.assemble(statements)
+                        if entity is not None:
+                            yield entity
+                    current_id = canonical_id
+                    current_schema = None
+                    current_fail = False
+                    statements = []
+
+                # If we're not filtering on schemata, we can skip the expensive-ish schema building here.
+                # The checking is done by store.assemble() anyway.
+                if include_schemata is not None:
+                    if current_schema is None:
+                        current_schema = model.get(schema)
+                        # If the statement is of an unknown schema
+                        if current_schema is None:
+                            log.error("Unknown schema %r: %s", schema, current_id)
+                            # Mark the entity as failed, but we need to iterate through the rest of the statements
+                            current_fail = True
+                            continue
+                    # If the schema of the statement does not exactly match the schema of the current entity,
+                    # find a common parent schema.
+                    elif current_schema.name != schema:
+                        try:
+                            current_schema = model.common_schema(current_schema, schema)
+                        except InvalidData as inv:
+                            msg = "Invalid schema %s for %r: %s" % (
+                                schema,
+                                current_id,
+                                inv,
+                            )
+                            log.error(msg)
+                            # Mark the entity as failed, but we need to iterate through the rest of the statements
+                            current_fail = True
+                            continue
+
+                statements.append(unpack_statement(keys, v))
+
+            # Handle the last entity at the end of the iterator
+            if include_schemata is not None and current_schema not in include_schemata:
+                statements = []
+            if len(statements) > 0 and not current_fail:
+                entity = self.store.assemble(statements)
+                if entity is not None:
+                    yield entity
nomenklatura/store/memory.py
@@ -0,0 +1,102 @@
+from typing import Dict, Set, List, Optional, Generator, Tuple
+from followthemoney import DS, SE, Schema, registry, Property, Statement
+
+from nomenklatura.store.base import Store, View, Writer
+from nomenklatura.resolver import Linker
+
+
+class MemoryStore(Store[DS, SE]):
+    def __init__(self, dataset: DS, linker: Linker[SE]):
+        super().__init__(dataset, linker)
+        self.stmts: Dict[str, Set[Statement]] = {}
+        self.inverted: Dict[str, Set[str]] = {}
+        self.entities: Dict[str, Set[str]] = {}
+
+    def writer(self) -> Writer[DS, SE]:
+        return MemoryWriter(self)
+
+    def view(self, scope: DS, external: bool = False) -> View[DS, SE]:
+        return MemoryView(self, scope, external=external)
+
+
+class MemoryWriter(Writer[DS, SE]):
+    def __init__(self, store: MemoryStore[DS, SE]):
+        self.store: MemoryStore[DS, SE] = store
+
+    def add_statement(self, stmt: Statement) -> None:
+        if stmt.entity_id is None:
+            return
+        canonical_id = stmt.canonical_id or self.store.linker.get_canonical(
+            stmt.entity_id
+        )
+        if canonical_id not in self.store.stmts:
+            self.store.stmts[canonical_id] = set()
+        self.store.stmts[canonical_id].add(stmt)
+
+        if stmt.dataset not in self.store.entities:
+            self.store.entities[stmt.dataset] = set()
+        self.store.entities[stmt.dataset].add(canonical_id)
+
+        if stmt.prop_type == registry.entity.name:
+            inverted_id = self.store.linker.get_canonical(stmt.value)
+            if inverted_id not in self.store.inverted:
+                self.store.inverted[inverted_id] = set()
+            self.store.inverted[inverted_id].add(canonical_id)
+
+    def pop(self, entity_id: str) -> List[Statement]:
+        statements = self.store.stmts.pop(entity_id, set())
+        for stmt in statements:
+            if stmt.dataset in self.store.entities:
+                self.store.entities[stmt.dataset].discard(entity_id)
+
+            if stmt.prop_type == registry.entity.name:
+                inverted_id = self.store.linker.get_canonical(stmt.value)
+                if inverted_id in self.store.inverted:
+                    self.store.inverted[inverted_id].discard(entity_id)
+
+        return list(statements)
+
+
+class MemoryView(View[DS, SE]):
+    def __init__(
+        self, store: MemoryStore[DS, SE], scope: DS, external: bool = False
+    ) -> None:
+        super().__init__(store, scope, external=external)
+        self.store: MemoryStore[DS, SE] = store
+
+    def has_entity(self, id: str) -> bool:
+        for stmt in self.store.stmts.get(id, []):
+            if self.external is False and stmt.external:
+                continue
+            return True
+        return False
+
+    def get_entity(self, id: str) -> Optional[SE]:
+        if id not in self.store.stmts:
+            return None
+        stmts: List[Statement] = []
+        for stmt in self.store.stmts[id]:
+            if self.external is False and stmt.external:
+                continue
+            stmts.append(stmt)
+        return self.store.assemble(stmts)
+
+    def get_inverted(self, id: str) -> Generator[Tuple[Property, SE], None, None]:
+        for inverted_id in self.store.inverted.get(id, []):
+            entity = self.get_entity(inverted_id)
+            if entity is None:
+                continue
+            for prop, value in entity.itervalues():
+                if value == id and prop.reverse is not None:
+                    yield prop.reverse, entity
+
+    def entities(self, include_schemata: Optional[List[Schema]] = None) -> Generator[SE, None, None]:
+        entity_ids: Set[str] = set()
+        for scope in self.dataset_names:
+            entity_ids.update(self.store.entities.get(scope, []))
+        for entity_id in entity_ids:
+            entity = self.get_entity(entity_id)
+            if entity is not None:
+                if include_schemata is not None and entity.schema not in include_schemata:
+                    continue
+                yield entity
nomenklatura/store/redis_.py
@@ -0,0 +1,131 @@
+from redis.client import Redis, Pipeline
+from typing import Generator, List, Optional, Set, Tuple
+from followthemoney import DS, SE, Schema, registry, Property, Statement
+
+from nomenklatura.kv import get_redis, close_redis, b
+from nomenklatura.resolver import Linker
+from nomenklatura.store.base import Store, View, Writer
+from nomenklatura.store.util import pack_statement, unpack_statement
+
+
+class RedisStore(Store[DS, SE]):
+    def __init__(
+        self,
+        dataset: DS,
+        linker: Linker[SE],
+        db: Optional["Redis[bytes]"] = None,
+    ):
+        super().__init__(dataset, linker)
+        if db is None:
+            db = get_redis()
+        self.db = db
+
+    def writer(self) -> Writer[DS, SE]:
+        return RedisWriter(self)
+
+    def view(self, scope: DS, external: bool = False) -> View[DS, SE]:
+        return RedisView(self, scope, external=external)
+
+    def close(self) -> None:
+        close_redis()
+
+
+class RedisWriter(Writer[DS, SE]):
+    BATCH_STATEMENTS = 100_000
+
+    def __init__(self, store: RedisStore[DS, SE]):
+        self.store: RedisStore[DS, SE] = store
+        self.pipeline: Optional["Pipeline[bytes]"] = None
+        self.batch_size = 0
+
+    def flush(self) -> None:
+        if self.pipeline is not None:
+            self.pipeline.execute()
+        self.pipeline = None
+        self.batch_size = 0
+
+    def add_statement(self, stmt: Statement) -> None:
+        if stmt.entity_id is None:
+            return
+        if self.batch_size >= self.BATCH_STATEMENTS:
+            self.flush()
+        if self.pipeline is None:
+            self.pipeline = self.store.db.pipeline()
+        canonical_id = self.store.linker.get_canonical(stmt.entity_id)
+        stmt.canonical_id = canonical_id
+
+        self.pipeline.sadd(b(f"ds:{stmt.dataset}"), b(canonical_id))
+        key = f"x:{canonical_id}" if stmt.external else f"s:{canonical_id}"
+        self.pipeline.sadd(b(key), pack_statement(stmt))
+        if stmt.prop_type == registry.entity.name:
+            vc = self.store.linker.get_canonical(stmt.value)
+            self.pipeline.sadd(b(f"i:{vc}"), b(canonical_id))
+
+        self.batch_size += 1
+
+    def pop(self, entity_id: str) -> List[Statement]:
+        if self.batch_size >= self.BATCH_STATEMENTS:
+            self.flush()
+        if self.pipeline is None:
+            self.pipeline = self.store.db.pipeline()
+        statements: List[Statement] = []
+        datasets: Set[str] = set()
+        keys = (f"s:{entity_id}", f"x:{entity_id}")
+        for v in self.store.db.sunion(keys):
+            stmt = unpack_statement(v, entity_id, False)  # type: ignore
+            statements.append(stmt)
+            datasets.add(stmt.dataset)
+
+            if stmt.prop_type == registry.entity.name:
+                vc = self.store.linker.get_canonical(stmt.value)
+                self.pipeline.srem(b(f"i:{vc}"), b(entity_id))
+
+        for dataset in datasets:
+            self.pipeline.srem(b(f"ds:{dataset}"), b(entity_id))
+
+        return list(statements)
+
+
+class RedisView(View[DS, SE]):
+    def __init__(
+        self, store: RedisStore[DS, SE], scope: DS, external: bool = False
+    ) -> None:
+        super().__init__(store, scope, external=external)
+        self.store: RedisStore[DS, SE] = store
+
+    def has_entity(self, id: str) -> bool:
+        keys = [b(f"s:{id}")]
+        if self.external:
+            keys.append(b(f"x:{id}"))
+        return self.store.db.exists(*keys) > 0
+
+    def get_entity(self, id: str) -> Optional[SE]:
+        statements: List[Statement] = []
+        keys = [b(f"s:{id}")]
+        if self.external:
+            keys.append(b(f"x:{id}"))
+        for v in self.store.db.sunion(keys):
+            statements.append(unpack_statement(v, id, False))  # type: ignore
+        return self.store.assemble(statements)
+
+    def get_inverted(self, id: str) -> Generator[Tuple[Property, SE], None, None]:
+        for v in self.store.db.smembers(b(f"i:{id}")):
+            entity = self.get_entity(v.decode("utf-8"))
+            if entity is None:
+                continue
+            for prop, value in entity.itervalues():
+                if value == id and prop.reverse is not None:
+                    yield prop.reverse, entity
+
+    def entities(self, include_schemata: Optional[List[Schema]] = None) -> Generator[SE, None, None]:
+        scope_name = b(f"ds:{self.scope.name}")
+        if self.scope.is_collection:
+            parts = [b(f"ds:{d}") for d in self.scope.leaf_names]
+            self.store.db.sunionstore(scope_name, parts)
+
+        for id in self.store.db.sscan_iter(scope_name):
+            entity = self.get_entity(id.decode("utf-8"))
+            if entity is not None:
+                if include_schemata is not None and entity.schema not in include_schemata:
+                    continue
+                yield entity
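The Redis backend maps the same scheme onto sets, keeping regular and external statements under separate keys so views can cheaply honor the external flag. Below is a sketch of that key layout, assuming a local Redis server reachable via the redis-py client; the member payloads are invented stand-ins for what pack_statement() would produce.

from redis import Redis

r = Redis()  # assumption: a default redis://localhost:6379 instance
r.sadd(b"ds:demo", b"NK-a")          # dataset membership set
r.sadd(b"s:NK-a", b"<packed-stmt>")  # regular statements of NK-a
r.sadd(b"x:NK-a", b"<packed-ext>")   # external statements, stored separately
r.sadd(b"i:NK-b", b"NK-a")           # NK-a references NK-b

# has_entity() with external=True checks both statement sets:
assert r.exists(b"s:NK-a", b"x:NK-a") > 0
# get_entity() unions them; a view with external=False reads only "s:":
members = r.sunion([b"s:NK-a", b"x:NK-a"])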
|