nomenklatura-mpt 4.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118) hide show
  1. nomenklatura/__init__.py +11 -0
  2. nomenklatura/cache.py +194 -0
  3. nomenklatura/cli.py +260 -0
  4. nomenklatura/conflicting_match.py +80 -0
  5. nomenklatura/data/er-unstable.pkl +0 -0
  6. nomenklatura/data/regression-v1.pkl +0 -0
  7. nomenklatura/db.py +139 -0
  8. nomenklatura/delta.py +4 -0
  9. nomenklatura/enrich/__init__.py +94 -0
  10. nomenklatura/enrich/aleph.py +141 -0
  11. nomenklatura/enrich/common.py +219 -0
  12. nomenklatura/enrich/nominatim.py +72 -0
  13. nomenklatura/enrich/opencorporates.py +233 -0
  14. nomenklatura/enrich/openfigi.py +124 -0
  15. nomenklatura/enrich/permid.py +201 -0
  16. nomenklatura/enrich/wikidata.py +268 -0
  17. nomenklatura/enrich/yente.py +116 -0
  18. nomenklatura/exceptions.py +9 -0
  19. nomenklatura/index/__init__.py +5 -0
  20. nomenklatura/index/common.py +24 -0
  21. nomenklatura/index/entry.py +89 -0
  22. nomenklatura/index/index.py +170 -0
  23. nomenklatura/index/tokenizer.py +92 -0
  24. nomenklatura/judgement.py +21 -0
  25. nomenklatura/kv.py +40 -0
  26. nomenklatura/matching/__init__.py +47 -0
  27. nomenklatura/matching/bench.py +32 -0
  28. nomenklatura/matching/compare/__init__.py +0 -0
  29. nomenklatura/matching/compare/addresses.py +71 -0
  30. nomenklatura/matching/compare/countries.py +15 -0
  31. nomenklatura/matching/compare/dates.py +83 -0
  32. nomenklatura/matching/compare/gender.py +15 -0
  33. nomenklatura/matching/compare/identifiers.py +30 -0
  34. nomenklatura/matching/compare/names.py +157 -0
  35. nomenklatura/matching/compare/util.py +51 -0
  36. nomenklatura/matching/compat.py +66 -0
  37. nomenklatura/matching/erun/__init__.py +0 -0
  38. nomenklatura/matching/erun/countries.py +42 -0
  39. nomenklatura/matching/erun/identifiers.py +64 -0
  40. nomenklatura/matching/erun/misc.py +71 -0
  41. nomenklatura/matching/erun/model.py +110 -0
  42. nomenklatura/matching/erun/names.py +126 -0
  43. nomenklatura/matching/erun/train.py +135 -0
  44. nomenklatura/matching/erun/util.py +28 -0
  45. nomenklatura/matching/logic_v1/__init__.py +0 -0
  46. nomenklatura/matching/logic_v1/identifiers.py +104 -0
  47. nomenklatura/matching/logic_v1/model.py +76 -0
  48. nomenklatura/matching/logic_v1/multi.py +21 -0
  49. nomenklatura/matching/logic_v1/phonetic.py +142 -0
  50. nomenklatura/matching/logic_v2/__init__.py +0 -0
  51. nomenklatura/matching/logic_v2/identifiers.py +124 -0
  52. nomenklatura/matching/logic_v2/model.py +98 -0
  53. nomenklatura/matching/logic_v2/names/__init__.py +3 -0
  54. nomenklatura/matching/logic_v2/names/analysis.py +51 -0
  55. nomenklatura/matching/logic_v2/names/distance.py +181 -0
  56. nomenklatura/matching/logic_v2/names/magic.py +60 -0
  57. nomenklatura/matching/logic_v2/names/match.py +195 -0
  58. nomenklatura/matching/logic_v2/names/pairing.py +81 -0
  59. nomenklatura/matching/logic_v2/names/util.py +89 -0
  60. nomenklatura/matching/name_based/__init__.py +4 -0
  61. nomenklatura/matching/name_based/misc.py +86 -0
  62. nomenklatura/matching/name_based/model.py +59 -0
  63. nomenklatura/matching/name_based/names.py +59 -0
  64. nomenklatura/matching/pairs.py +42 -0
  65. nomenklatura/matching/regression_v1/__init__.py +0 -0
  66. nomenklatura/matching/regression_v1/misc.py +75 -0
  67. nomenklatura/matching/regression_v1/model.py +110 -0
  68. nomenklatura/matching/regression_v1/names.py +63 -0
  69. nomenklatura/matching/regression_v1/train.py +87 -0
  70. nomenklatura/matching/regression_v1/util.py +31 -0
  71. nomenklatura/matching/svm_v1/__init__.py +5 -0
  72. nomenklatura/matching/svm_v1/misc.py +94 -0
  73. nomenklatura/matching/svm_v1/model.py +168 -0
  74. nomenklatura/matching/svm_v1/names.py +81 -0
  75. nomenklatura/matching/svm_v1/train.py +186 -0
  76. nomenklatura/matching/svm_v1/util.py +30 -0
  77. nomenklatura/matching/types.py +227 -0
  78. nomenklatura/matching/util.py +62 -0
  79. nomenklatura/publish/__init__.py +0 -0
  80. nomenklatura/publish/dates.py +49 -0
  81. nomenklatura/publish/edges.py +32 -0
  82. nomenklatura/py.typed +0 -0
  83. nomenklatura/resolver/__init__.py +6 -0
  84. nomenklatura/resolver/common.py +2 -0
  85. nomenklatura/resolver/edge.py +107 -0
  86. nomenklatura/resolver/identifier.py +60 -0
  87. nomenklatura/resolver/linker.py +101 -0
  88. nomenklatura/resolver/resolver.py +565 -0
  89. nomenklatura/settings.py +17 -0
  90. nomenklatura/store/__init__.py +41 -0
  91. nomenklatura/store/base.py +130 -0
  92. nomenklatura/store/level.py +272 -0
  93. nomenklatura/store/memory.py +102 -0
  94. nomenklatura/store/redis_.py +131 -0
  95. nomenklatura/store/sql.py +219 -0
  96. nomenklatura/store/util.py +48 -0
  97. nomenklatura/store/versioned.py +371 -0
  98. nomenklatura/tui/__init__.py +17 -0
  99. nomenklatura/tui/app.py +294 -0
  100. nomenklatura/tui/app.tcss +52 -0
  101. nomenklatura/tui/comparison.py +81 -0
  102. nomenklatura/tui/util.py +35 -0
  103. nomenklatura/util.py +26 -0
  104. nomenklatura/versions.py +119 -0
  105. nomenklatura/wikidata/__init__.py +14 -0
  106. nomenklatura/wikidata/client.py +122 -0
  107. nomenklatura/wikidata/lang.py +94 -0
  108. nomenklatura/wikidata/model.py +139 -0
  109. nomenklatura/wikidata/props.py +70 -0
  110. nomenklatura/wikidata/qualified.py +49 -0
  111. nomenklatura/wikidata/query.py +66 -0
  112. nomenklatura/wikidata/value.py +87 -0
  113. nomenklatura/xref.py +125 -0
  114. nomenklatura_mpt-4.1.9.dist-info/METADATA +159 -0
  115. nomenklatura_mpt-4.1.9.dist-info/RECORD +118 -0
  116. nomenklatura_mpt-4.1.9.dist-info/WHEEL +4 -0
  117. nomenklatura_mpt-4.1.9.dist-info/entry_points.txt +3 -0
  118. nomenklatura_mpt-4.1.9.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,130 @@
1
+ from types import TracebackType
2
+ from typing import Optional, Generator, List, Tuple, Generic, Type, cast
3
+ from followthemoney import Schema, registry, Property, DS, Statement
4
+ from followthemoney import StatementEntity, SE
5
+ from followthemoney.statement.util import get_prop_type
6
+
7
+ from nomenklatura.resolver import Linker, StrIdent
8
+
9
+
10
class Store(Generic[DS, SE]):
    """A data storage and retrieval mechanism for statement-based entity data.
    Essentially, this is a triple store which can be implemented using various
    backends."""

    def __init__(self, dataset: DS, linker: Linker[SE]) -> None:
        self.dataset = dataset
        self.linker = linker
        # Concrete entity type instantiated by assemble(); cast to the SE
        # type variable so subclasses can treat it as their entity type.
        self.entity_class = cast(Type[SE], StatementEntity)

    def writer(self) -> "Writer[DS, SE]":
        """Return a bulk writer for this store (implemented by subclasses)."""
        raise NotImplementedError()

    def view(self, scope: DS, external: bool = False) -> "View[DS, SE]":
        """Return a read view over the given scope (implemented by subclasses)."""
        raise NotImplementedError()

    def default_view(self, external: bool = False) -> "View[DS, SE]":
        """Return a view scoped to the store's own dataset."""
        return self.view(self.dataset, external=external)

    def assemble(self, statements: List[Statement]) -> Optional[SE]:
        """Build a single entity from the given statements, or None if empty.

        Entity-typed property values are rewritten in place to their canonical
        IDs via the linker, and any known referent IDs for the assembled
        entity are attached to it."""
        if not len(statements):
            return None
        for stmt in statements:
            if get_prop_type(stmt.schema, stmt.prop) == registry.entity.name:
                stmt.value = self.linker.get_canonical(stmt.value)
        entity = self.entity_class.from_statements(self.dataset, statements)
        if entity.id is not None:
            entity.extra_referents.update(self.linker.get_referents(entity.id))
        return entity

    def update(self, id: StrIdent) -> None:
        """Re-key all statements of the given entity (and of every referent
        the linker knows for it) under the current canonical ID, e.g. after
        resolver decisions have changed."""
        canonical_id = self.linker.get_canonical(id)
        with self.writer() as writer:
            for referent in self.linker.get_referents(canonical_id):
                for stmt in writer.pop(referent):
                    stmt.canonical_id = canonical_id
                    writer.add_statement(stmt)

    def close(self) -> None:
        """Release any resources held by the store (no-op by default)."""
        pass

    def __repr__(self) -> str:
        return f"<{type(self).__name__}({self.dataset.name!r})>"
53
+
54
+
55
class Writer(Generic[DS, SE]):
    """Bulk writing operations."""

    def __init__(self, store: Store[DS, SE]) -> None:
        self.store = store

    def add_statement(self, stmt: Statement) -> None:
        """Write one statement to the store (implemented by subclasses)."""
        raise NotImplementedError()

    def add_entity(self, entity: SE) -> None:
        """Write all statements of the given entity."""
        for stmt in entity.statements:
            self.add_statement(stmt)

    def pop(self, entity_id: str) -> List[Statement]:
        """Remove and return all statements stored for the given entity ID
        (implemented by subclasses)."""
        raise NotImplementedError()

    def flush(self) -> None:
        """Push any buffered writes to the backend (no-op by default)."""
        pass

    def close(self) -> None:
        """Close the underlying store."""
        self.store.close()

    def __enter__(self) -> "Writer[DS, SE]":
        return self

    # NOTE: parameters renamed from `type`/`value`/`traceback` to the
    # conventional exc_* names; the old `type` shadowed the builtin. The
    # context-manager protocol invokes __exit__ positionally, so callers
    # are unaffected.
    def __exit__(
        self,
        exc_type: Optional[Type[BaseException]],
        exc_val: Optional[BaseException],
        exc_tb: Optional[TracebackType],
    ) -> None:
        # Flush buffered writes even when the `with` body raised; returning
        # None means any in-flight exception is propagated, not suppressed.
        self.flush()

    def __repr__(self) -> str:
        return f"<{type(self).__name__}({self.store!r})>"
90
+
91
+
92
class View(Generic[DS, SE]):
    """A read-only query interface over the statements in a store, scoped to
    a dataset and optionally including "external" (enrichment) statements."""

    def __init__(self, store: Store[DS, SE], scope: DS, external: bool = False):
        self.store = store
        self.scope = scope
        self.dataset_names = scope.leaf_names
        self.external = external

    def has_entity(self, id: str) -> bool:
        """Check whether an entity with the given ID exists in the view."""
        raise NotImplementedError()

    def get_entity(self, id: str) -> Optional[SE]:
        """Fetch and assemble the entity with the given ID, if present."""
        raise NotImplementedError()

    def get_inverted(self, id: str) -> Generator[Tuple[Property, SE], None, None]:
        """Yield (reverse property, entity) pairs for entities that reference
        the given entity ID."""
        raise NotImplementedError()

    def get_adjacent(
        self, entity: SE, inverted: bool = True
    ) -> Generator[Tuple[Property, SE], None, None]:
        """Yield entities adjacent to the given one: all entities it
        references, plus (if `inverted`) all entities referencing it."""
        for prop, value in entity.itervalues():
            if prop.type == registry.entity:
                child = self.get_entity(value)
                if child is not None:
                    yield prop, child

        if inverted and entity.id is not None:
            for prop, adjacent in self.get_inverted(entity.id):
                yield prop, adjacent

    # BUG FIX: default was the mutable `include_schemata: List[Schema] = []`,
    # a shared-instance hazard; `None` also matches every subclass override.
    def entities(
        self, include_schemata: Optional[List[Schema]] = None
    ) -> Generator[SE, None, None]:
        """Iterate over all entities in the view.

        If `include_schemata` is provided, only entities of the provided schemata will be returned.
        Note that `schemata` will not be expanded via "is_a" relationships."""

        raise NotImplementedError()

    def __repr__(self) -> str:
        return f"<{type(self).__name__}({self.scope.name!r})>"
@@ -0,0 +1,272 @@
1
+ #
2
+ # LevelDB-based store for Nomenklatura.
3
+ # A lot of the code in this module is extremely performance-sensitive, so it is unrolled and
4
+ # doesn't use helper functions in some places where it would otherwise be more readable.
5
+ #
6
+ # Specific examples:
7
+ # * Not calling a helper to byte-encode values.
8
+ # * Not having a helper method for building entities.
9
+ import gc
10
+ import orjson
11
+ import logging
12
+ from pathlib import Path
13
+ from typing import Any, Generator, List, Optional, Set, Tuple
14
+ from rigour.env import ENCODING as E
15
+
16
+ import plyvel # type: ignore
17
+ from followthemoney import model, DS, SE, Schema, registry, Property, Statement
18
+ from followthemoney.exc import InvalidData
19
+ from followthemoney.statement.util import get_prop_type
20
+
21
+ from nomenklatura.resolver import Linker
22
+ from nomenklatura.store.base import Store, View, Writer
23
+
24
+ log = logging.getLogger(__name__)
25
+ MAX_OPEN_FILES = 1000
26
+
27
+
28
def unpack_statement(
    keys: List[str],
    data: bytes,
) -> Statement:
    """Rebuild a Statement from a split LevelDB key and its JSON-packed value.

    Optional fields are stored as the sentinel `0` when absent (see the
    writer), so `0` is mapped back to None here."""
    _, canonical_id, ext, dataset, schema, stmt_id = keys
    fields = orjson.loads(data)
    entity_id, prop, value, lang, original_value, origin, first_seen, last_seen = fields
    return Statement(
        id=stmt_id,
        entity_id=entity_id,
        prop=prop,
        schema=schema,
        value=value,
        lang=lang if lang != 0 else None,
        dataset=dataset,
        original_value=original_value if original_value != 0 else None,
        origin=origin if origin != 0 else None,
        first_seen=first_seen,
        last_seen=last_seen,
        canonical_id=canonical_id,
        external=(ext == "x"),
    )
+ )
58
+
59
+
60
class LevelDBStore(Store[DS, SE]):
    """Statement store backed by an on-disk LevelDB database."""

    def __init__(self, dataset: DS, linker: Linker[SE], path: Path):
        super().__init__(dataset, linker)
        self.path = path
        # Create the database on first use; cap open file handles.
        self.db = plyvel.DB(
            path.as_posix(),
            max_open_files=MAX_OPEN_FILES,
            create_if_missing=True,
        )

    def optimize(self) -> None:
        """Optimize the database by compacting it."""
        self.db.compact_range()
        self.db.close()
        # Encourage release of the closed handle before reopening.
        gc.collect()
        self.db = plyvel.DB(
            self.path.as_posix(),
            max_open_files=MAX_OPEN_FILES,
            create_if_missing=False,
        )
        log.info("Optimized LevelDB at %s", self.path)

    def writer(self) -> Writer[DS, SE]:
        """Return a batching writer over this database."""
        return LevelDBWriter(self)

    def view(self, scope: DS, external: bool = False) -> View[DS, SE]:
        """Return a read view scoped to the given dataset."""
        return LevelDBView(self, scope, external=external)

    def close(self) -> None:
        """Close the underlying LevelDB handle."""
        self.db.close()
90
+
91
+
92
class LevelDBWriter(Writer[DS, SE]):
    """Batching writer for LevelDBStore; buffers up to BATCH_STATEMENTS
    operations in a LevelDB write batch before committing."""

    BATCH_STATEMENTS = 100_000

    def __init__(self, store: LevelDBStore[DS, SE]):
        self.store: LevelDBStore[DS, SE] = store
        # Lazily-created plyvel write batch (opaque type from plyvel).
        self.batch: Optional[Any] = None
        self.batch_size = 0

    def flush(self) -> None:
        """Commit the current write batch, if any, and reset the buffer."""
        if self.batch is not None:
            self.batch.write()
        self.batch = None
        self.batch_size = 0

    def add_statement(self, stmt: Statement) -> None:
        """Buffer one statement for writing, keyed by its canonical ID.

        Entity-typed values also get an inverted-index entry (`i:` prefix)
        so reverse lookups can find this entity."""
        if stmt.entity_id is None:
            return
        if self.batch_size >= self.BATCH_STATEMENTS:
            self.flush()
        if self.batch is None:
            self.batch = self.store.db.write_batch()
        canonical_id = self.store.linker.get_canonical(stmt.entity_id)
        stmt.canonical_id = canonical_id

        ext = "x" if stmt.external else ""
        key = f"s:{canonical_id}:{ext}:{stmt.dataset}:{stmt.schema}:{stmt.id}".encode(E)
        # Optional fields are packed as the sentinel 0 (see unpack_statement).
        values = (
            stmt.entity_id,
            stmt.prop,
            stmt.value,
            stmt.lang or 0,
            stmt.original_value or 0,
            stmt.origin or 0,
            stmt.first_seen,
            stmt.last_seen,
        )
        data = orjson.dumps(values)
        self.batch.put(key, data)
        if get_prop_type(stmt.schema, stmt.prop) == registry.entity.name:
            vc = self.store.linker.get_canonical(stmt.value)
            key = f"i:{vc}:{stmt.canonical_id}".encode(E)
            self.batch.put(key, b"")

        self.batch_size += 1

    def pop(self, entity_id: str) -> List[Statement]:
        """Delete and return all statements stored under the given entity ID,
        removing their inverted-index entries as well.

        (Fix: dropped an unused `datasets` set that was populated per
        statement but never read, and the redundant final list() copy.)"""
        if self.batch_size >= self.BATCH_STATEMENTS:
            self.flush()
        if self.batch is None:
            self.batch = self.store.db.write_batch()
        statements: List[Statement] = []
        prefix = f"s:{entity_id}:".encode(E)
        with self.store.db.iterator(prefix=prefix) as it:
            for k, v in it:
                self.batch.delete(k)
                stmt = unpack_statement(k.decode(E).split(":"), v)
                statements.append(stmt)

                if stmt.prop_type == registry.entity.name:
                    vc = self.store.linker.get_canonical(stmt.value)
                    self.batch.delete(f"i:{vc}:{entity_id}".encode(E))
        return statements
156
+
157
+
158
class LevelDBView(View[DS, SE]):
    """Read view over a LevelDBStore, scoped to a set of dataset names."""

    def __init__(
        self, store: LevelDBStore[DS, SE], scope: DS, external: bool = False
    ) -> None:
        super().__init__(store, scope, external=external)
        self.store: LevelDBStore[DS, SE] = store
        self.dataset_names: Set[str] = set(scope.dataset_names)

    def has_entity(self, id: str) -> bool:
        """Check if any in-scope statement exists under the entity's key prefix."""
        prefix = f"s:{id}:".encode(E)
        with self.store.db.iterator(prefix=prefix, include_value=False) as it:
            for k in it:
                _, _, ext, dataset, _, _ = k.decode(E).split(":")
                if dataset not in self.dataset_names:
                    continue
                if ext == "x" and not self.external:
                    continue
                return True
        return False

    def get_entity(self, id: str) -> Optional[SE]:
        """Assemble an entity from all in-scope statements under its prefix."""
        statements: List[Statement] = []
        prefix = f"s:{id}:".encode(E)
        with self.store.db.iterator(prefix=prefix) as it:
            for k, v in it:
                keys = k.decode(E).split(":")
                _, _, ext, dataset, _, _ = keys
                if dataset not in self.dataset_names:
                    continue
                if ext == "x" and not self.external:
                    continue
                statements.append(unpack_statement(keys, v))
        return self.store.assemble(statements)

    def get_inverted(self, id: str) -> Generator[Tuple[Property, SE], None, None]:
        """Yield (reverse property, entity) pairs for entities referencing `id`,
        using the inverted-index (`i:`) keys written alongside statements."""
        prefix = f"i:{id}:".encode(E)
        with self.store.db.iterator(prefix=prefix, include_value=False) as it:
            for k in it:
                _, _, ref = k.decode(E).split(":")
                entity = self.get_entity(ref)
                if entity is None:
                    continue
                for prop, value in entity.itervalues():
                    if value == id and prop.reverse is not None:
                        yield prop.reverse, entity

    def _assemble_filtered(
        self,
        statements: List[Statement],
        current_schema: Optional[Schema],
        current_fail: bool,
        include_schemata: Optional[List[Schema]],
    ) -> Optional[SE]:
        """Finish one canonical-ID group: apply the schema filter, drop groups
        flagged as failed, and assemble the rest into an entity (or None)."""
        if include_schemata is not None and current_schema not in include_schemata:
            return None
        if not statements or current_fail:
            return None
        return self.store.assemble(statements)

    def entities(
        self, include_schemata: Optional[List[Schema]] = None
    ) -> Generator[SE, None, None]:
        """Iterate over all in-scope entities. LevelDB keys are sorted, so all
        statements of one canonical ID arrive adjacently and can be grouped in
        a single pass."""
        with self.store.db.iterator(prefix=b"s:", fill_cache=False) as it:
            current_id: Optional[str] = None
            current_schema: Optional[Schema] = None
            current_fail: bool = False
            statements: List[Statement] = []
            for k, v in it:
                keys = k.decode(E).split(":")
                _, canonical_id, ext, dataset, schema, _ = keys

                if ext == "x" and not self.external:
                    continue
                if dataset not in self.dataset_names:
                    continue

                # If we're seeing a new canonical ID, yield the previous entity
                if canonical_id != current_id:
                    entity = self._assemble_filtered(
                        statements, current_schema, current_fail, include_schemata
                    )
                    if entity is not None:
                        yield entity
                    current_id = canonical_id
                    current_schema = None
                    current_fail = False
                    statements = []

                # If we're not filtering on schemata, we can skip the
                # expensive-ish schema building here; the checking is done by
                # store.assemble() anyway.
                if include_schemata is not None:
                    if current_schema is None:
                        current_schema = model.get(schema)
                        # If the statement is of an unknown schema
                        if current_schema is None:
                            # BUG FIX: was log.error(fmt, (schema, current_id)),
                            # passing a tuple as a single format argument.
                            log.error("Unknown schema %r: %s", schema, current_id)
                            # Mark the entity as failed, but keep consuming the
                            # rest of its statements.
                            current_fail = True
                            continue
                    # If the statement's schema differs from the entity's so
                    # far, find a common parent schema.
                    elif current_schema.name != schema:
                        try:
                            current_schema = model.common_schema(current_schema, schema)
                        except InvalidData as inv:
                            log.error(
                                "Invalid schema %s for %r: %s", schema, current_id, inv
                            )
                            # Mark the entity as failed, but keep consuming the
                            # rest of its statements.
                            current_fail = True
                            continue

                statements.append(unpack_statement(keys, v))

            # Handle the last entity at the end of the iterator
            entity = self._assemble_filtered(
                statements, current_schema, current_fail, include_schemata
            )
            if entity is not None:
                yield entity
@@ -0,0 +1,102 @@
1
+ from typing import Dict, Set, List, Optional, Generator, Tuple
2
+ from followthemoney import DS, SE, Schema, registry, Property, Statement
3
+
4
+ from nomenklatura.store.base import Store, View, Writer
5
+ from nomenklatura.resolver import Linker
6
+
7
+
8
class MemoryStore(Store[DS, SE]):
    """Statement store held entirely in process memory."""

    def __init__(self, dataset: DS, linker: Linker[SE]):
        super().__init__(dataset, linker)
        # canonical entity ID -> statements stored under it
        self.stmts: Dict[str, Set[Statement]] = {}
        # referenced canonical ID -> canonical IDs of entities pointing at it
        self.inverted: Dict[str, Set[str]] = {}
        # dataset name -> canonical IDs of entities in that dataset
        self.entities: Dict[str, Set[str]] = {}

    def writer(self) -> Writer[DS, SE]:
        """Return an in-memory writer."""
        return MemoryWriter(self)

    def view(self, scope: DS, external: bool = False) -> View[DS, SE]:
        """Return a read view scoped to the given dataset."""
        return MemoryView(self, scope, external=external)
20
+
21
+
22
class MemoryWriter(Writer[DS, SE]):
    """Writer that updates the MemoryStore's in-process indexes directly."""

    def __init__(self, store: MemoryStore[DS, SE]):
        self.store: MemoryStore[DS, SE] = store

    def add_statement(self, stmt: Statement) -> None:
        """Index one statement under its canonical ID, maintaining the
        per-dataset and inverted-reference indexes."""
        if stmt.entity_id is None:
            return
        canonical_id = stmt.canonical_id or self.store.linker.get_canonical(
            stmt.entity_id
        )
        self.store.stmts.setdefault(canonical_id, set()).add(stmt)
        self.store.entities.setdefault(stmt.dataset, set()).add(canonical_id)

        if stmt.prop_type == registry.entity.name:
            inverted_id = self.store.linker.get_canonical(stmt.value)
            self.store.inverted.setdefault(inverted_id, set()).add(canonical_id)

    def pop(self, entity_id: str) -> List[Statement]:
        """Remove and return all statements for the given entity ID, cleaning
        up the dataset and inverted indexes along the way."""
        removed = self.store.stmts.pop(entity_id, set())
        for stmt in removed:
            dataset_ids = self.store.entities.get(stmt.dataset)
            if dataset_ids is not None:
                dataset_ids.discard(entity_id)

            if stmt.prop_type == registry.entity.name:
                inverted_id = self.store.linker.get_canonical(stmt.value)
                referents = self.store.inverted.get(inverted_id)
                if referents is not None:
                    referents.discard(entity_id)

        return list(removed)
58
+
59
+
60
class MemoryView(View[DS, SE]):
    """Read view over a MemoryStore."""

    def __init__(
        self, store: MemoryStore[DS, SE], scope: DS, external: bool = False
    ) -> None:
        super().__init__(store, scope, external=external)
        self.store: MemoryStore[DS, SE] = store

    def has_entity(self, id: str) -> bool:
        """True if at least one visible statement is held for the given ID."""
        held = self.store.stmts.get(id, set())
        return any(
            not (self.external is False and stmt.external) for stmt in held
        )

    def get_entity(self, id: str) -> Optional[SE]:
        """Assemble an entity from the visible statements stored under `id`."""
        held = self.store.stmts.get(id)
        if held is None:
            return None
        visible = [
            stmt for stmt in held if not (self.external is False and stmt.external)
        ]
        return self.store.assemble(visible)

    def get_inverted(self, id: str) -> Generator[Tuple[Property, SE], None, None]:
        """Yield (reverse property, entity) pairs for entities referencing `id`."""
        for referrer_id in self.store.inverted.get(id, []):
            referrer = self.get_entity(referrer_id)
            if referrer is None:
                continue
            for prop, value in referrer.itervalues():
                if value == id and prop.reverse is not None:
                    yield prop.reverse, referrer

    def entities(self, include_schemata: Optional[List[Schema]] = None) -> Generator[SE, None, None]:
        """Iterate over all entities in scope, optionally filtered by schema."""
        candidate_ids: Set[str] = set()
        for name in self.dataset_names:
            candidate_ids.update(self.store.entities.get(name, []))
        for candidate_id in candidate_ids:
            entity = self.get_entity(candidate_id)
            if entity is None:
                continue
            if include_schemata is not None and entity.schema not in include_schemata:
                continue
            yield entity
@@ -0,0 +1,131 @@
1
+ from redis.client import Redis, Pipeline
2
+ from typing import Generator, List, Optional, Set, Tuple
3
+ from followthemoney import DS, SE, Schema, registry, Property, Statement
4
+
5
+ from nomenklatura.kv import get_redis, close_redis, b
6
+ from nomenklatura.resolver import Linker
7
+ from nomenklatura.store.base import Store, View, Writer
8
+ from nomenklatura.store.util import pack_statement, unpack_statement
9
+
10
+
11
class RedisStore(Store[DS, SE]):
    """Statement store backed by Redis sets."""

    def __init__(
        self,
        dataset: DS,
        linker: Linker[SE],
        db: Optional["Redis[bytes]"] = None,
    ):
        super().__init__(dataset, linker)
        # Fall back to the shared module-level client if none is supplied.
        self.db = db if db is not None else get_redis()

    def writer(self) -> Writer[DS, SE]:
        """Return a pipelining writer over this Redis connection."""
        return RedisWriter(self)

    def view(self, scope: DS, external: bool = False) -> View[DS, SE]:
        """Return a read view scoped to the given dataset."""
        return RedisView(self, scope, external=external)

    def close(self) -> None:
        """Close the shared Redis connection."""
        close_redis()
+
32
+
33
class RedisWriter(Writer[DS, SE]):
    """Writer that buffers Redis commands in a pipeline, executing every
    BATCH_STATEMENTS operations."""

    BATCH_STATEMENTS = 100_000

    def __init__(self, store: RedisStore[DS, SE]):
        self.store: RedisStore[DS, SE] = store
        self.pipeline: Optional["Pipeline[bytes]"] = None
        self.batch_size = 0

    def flush(self) -> None:
        """Execute the buffered pipeline, if any, and reset the buffer."""
        if self.pipeline is not None:
            self.pipeline.execute()
        self.pipeline = None
        self.batch_size = 0

    def _require_pipeline(self) -> "Pipeline[bytes]":
        """Flush a full batch and lazily (re-)create the write pipeline."""
        if self.batch_size >= self.BATCH_STATEMENTS:
            self.flush()
        if self.pipeline is None:
            self.pipeline = self.store.db.pipeline()
        return self.pipeline

    def add_statement(self, stmt: Statement) -> None:
        """Buffer one statement: index it in its dataset set, store it in the
        `s:`/`x:` statement set, and maintain the inverted index."""
        if stmt.entity_id is None:
            return
        pipe = self._require_pipeline()
        canonical_id = self.store.linker.get_canonical(stmt.entity_id)
        stmt.canonical_id = canonical_id

        pipe.sadd(b(f"ds:{stmt.dataset}"), b(canonical_id))
        key = f"x:{canonical_id}" if stmt.external else f"s:{canonical_id}"
        pipe.sadd(b(key), pack_statement(stmt))
        if stmt.prop_type == registry.entity.name:
            vc = self.store.linker.get_canonical(stmt.value)
            pipe.sadd(b(f"i:{vc}"), b(canonical_id))

        self.batch_size += 1

    def pop(self, entity_id: str) -> List[Statement]:
        """Remove and return all statements for the given entity ID, cleaning
        the inverted and dataset indexes."""
        pipe = self._require_pipeline()
        statements: List[Statement] = []
        datasets: Set[str] = set()
        keys = (f"s:{entity_id}", f"x:{entity_id}")
        for packed in self.store.db.sunion(keys):
            stmt = unpack_statement(packed, entity_id, False)  # type: ignore
            statements.append(stmt)
            datasets.add(stmt.dataset)

            if stmt.prop_type == registry.entity.name:
                vc = self.store.linker.get_canonical(stmt.value)
                pipe.srem(b(f"i:{vc}"), b(entity_id))

        for dataset in datasets:
            pipe.srem(b(f"ds:{dataset}"), b(entity_id))

        return list(statements)
87
+
88
+
89
class RedisView(View[DS, SE]):
    """Read view over a RedisStore."""

    def __init__(
        self, store: RedisStore[DS, SE], scope: DS, external: bool = False
    ) -> None:
        super().__init__(store, scope, external=external)
        self.store: RedisStore[DS, SE] = store

    def _statement_keys(self, id: str) -> List[bytes]:
        """Redis set keys holding statements for the given canonical ID,
        including the external (`x:`) set only when the view allows it."""
        keys = [b(f"s:{id}")]
        if self.external:
            keys.append(b(f"x:{id}"))
        return keys

    def has_entity(self, id: str) -> bool:
        """True if any visible statement set exists for the given ID."""
        return self.store.db.exists(*self._statement_keys(id)) > 0

    def get_entity(self, id: str) -> Optional[SE]:
        """Assemble an entity from its visible statement sets."""
        statements: List[Statement] = []
        for packed in self.store.db.sunion(self._statement_keys(id)):
            statements.append(unpack_statement(packed, id, False))  # type: ignore
        return self.store.assemble(statements)

    def get_inverted(self, id: str) -> Generator[Tuple[Property, SE], None, None]:
        """Yield (reverse property, entity) pairs for entities referencing `id`."""
        for member in self.store.db.smembers(b(f"i:{id}")):
            entity = self.get_entity(member.decode("utf-8"))
            if entity is None:
                continue
            for prop, value in entity.itervalues():
                if value == id and prop.reverse is not None:
                    yield prop.reverse, entity

    def entities(self, include_schemata: Optional[List[Schema]] = None) -> Generator[SE, None, None]:
        """Iterate over all entities in scope, optionally filtered by schema.
        For collections, the per-dataset ID sets are first unioned into a
        single scope set on the server."""
        scope_name = b(f"ds:{self.scope.name}")
        if self.scope.is_collection:
            parts = [b(f"ds:{d}") for d in self.scope.leaf_names]
            self.store.db.sunionstore(scope_name, parts)

        for member in self.store.db.sscan_iter(scope_name):
            entity = self.get_entity(member.decode("utf-8"))
            if entity is None:
                continue
            if include_schemata is not None and entity.schema not in include_schemata:
                continue
            yield entity