nomenklatura_mpt-4.1.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. nomenklatura/__init__.py +11 -0
  2. nomenklatura/cache.py +194 -0
  3. nomenklatura/cli.py +260 -0
  4. nomenklatura/conflicting_match.py +80 -0
  5. nomenklatura/data/er-unstable.pkl +0 -0
  6. nomenklatura/data/regression-v1.pkl +0 -0
  7. nomenklatura/db.py +139 -0
  8. nomenklatura/delta.py +4 -0
  9. nomenklatura/enrich/__init__.py +94 -0
  10. nomenklatura/enrich/aleph.py +141 -0
  11. nomenklatura/enrich/common.py +219 -0
  12. nomenklatura/enrich/nominatim.py +72 -0
  13. nomenklatura/enrich/opencorporates.py +233 -0
  14. nomenklatura/enrich/openfigi.py +124 -0
  15. nomenklatura/enrich/permid.py +201 -0
  16. nomenklatura/enrich/wikidata.py +268 -0
  17. nomenklatura/enrich/yente.py +116 -0
  18. nomenklatura/exceptions.py +9 -0
  19. nomenklatura/index/__init__.py +5 -0
  20. nomenklatura/index/common.py +24 -0
  21. nomenklatura/index/entry.py +89 -0
  22. nomenklatura/index/index.py +170 -0
  23. nomenklatura/index/tokenizer.py +92 -0
  24. nomenklatura/judgement.py +21 -0
  25. nomenklatura/kv.py +40 -0
  26. nomenklatura/matching/__init__.py +47 -0
  27. nomenklatura/matching/bench.py +32 -0
  28. nomenklatura/matching/compare/__init__.py +0 -0
  29. nomenklatura/matching/compare/addresses.py +71 -0
  30. nomenklatura/matching/compare/countries.py +15 -0
  31. nomenklatura/matching/compare/dates.py +83 -0
  32. nomenklatura/matching/compare/gender.py +15 -0
  33. nomenklatura/matching/compare/identifiers.py +30 -0
  34. nomenklatura/matching/compare/names.py +157 -0
  35. nomenklatura/matching/compare/util.py +51 -0
  36. nomenklatura/matching/compat.py +66 -0
  37. nomenklatura/matching/erun/__init__.py +0 -0
  38. nomenklatura/matching/erun/countries.py +42 -0
  39. nomenklatura/matching/erun/identifiers.py +64 -0
  40. nomenklatura/matching/erun/misc.py +71 -0
  41. nomenklatura/matching/erun/model.py +110 -0
  42. nomenklatura/matching/erun/names.py +126 -0
  43. nomenklatura/matching/erun/train.py +135 -0
  44. nomenklatura/matching/erun/util.py +28 -0
  45. nomenklatura/matching/logic_v1/__init__.py +0 -0
  46. nomenklatura/matching/logic_v1/identifiers.py +104 -0
  47. nomenklatura/matching/logic_v1/model.py +76 -0
  48. nomenklatura/matching/logic_v1/multi.py +21 -0
  49. nomenklatura/matching/logic_v1/phonetic.py +142 -0
  50. nomenklatura/matching/logic_v2/__init__.py +0 -0
  51. nomenklatura/matching/logic_v2/identifiers.py +124 -0
  52. nomenklatura/matching/logic_v2/model.py +98 -0
  53. nomenklatura/matching/logic_v2/names/__init__.py +3 -0
  54. nomenklatura/matching/logic_v2/names/analysis.py +51 -0
  55. nomenklatura/matching/logic_v2/names/distance.py +181 -0
  56. nomenklatura/matching/logic_v2/names/magic.py +60 -0
  57. nomenklatura/matching/logic_v2/names/match.py +195 -0
  58. nomenklatura/matching/logic_v2/names/pairing.py +81 -0
  59. nomenklatura/matching/logic_v2/names/util.py +89 -0
  60. nomenklatura/matching/name_based/__init__.py +4 -0
  61. nomenklatura/matching/name_based/misc.py +86 -0
  62. nomenklatura/matching/name_based/model.py +59 -0
  63. nomenklatura/matching/name_based/names.py +59 -0
  64. nomenklatura/matching/pairs.py +42 -0
  65. nomenklatura/matching/regression_v1/__init__.py +0 -0
  66. nomenklatura/matching/regression_v1/misc.py +75 -0
  67. nomenklatura/matching/regression_v1/model.py +110 -0
  68. nomenklatura/matching/regression_v1/names.py +63 -0
  69. nomenklatura/matching/regression_v1/train.py +87 -0
  70. nomenklatura/matching/regression_v1/util.py +31 -0
  71. nomenklatura/matching/svm_v1/__init__.py +5 -0
  72. nomenklatura/matching/svm_v1/misc.py +94 -0
  73. nomenklatura/matching/svm_v1/model.py +168 -0
  74. nomenklatura/matching/svm_v1/names.py +81 -0
  75. nomenklatura/matching/svm_v1/train.py +186 -0
  76. nomenklatura/matching/svm_v1/util.py +30 -0
  77. nomenklatura/matching/types.py +227 -0
  78. nomenklatura/matching/util.py +62 -0
  79. nomenklatura/publish/__init__.py +0 -0
  80. nomenklatura/publish/dates.py +49 -0
  81. nomenklatura/publish/edges.py +32 -0
  82. nomenklatura/py.typed +0 -0
  83. nomenklatura/resolver/__init__.py +6 -0
  84. nomenklatura/resolver/common.py +2 -0
  85. nomenklatura/resolver/edge.py +107 -0
  86. nomenklatura/resolver/identifier.py +60 -0
  87. nomenklatura/resolver/linker.py +101 -0
  88. nomenklatura/resolver/resolver.py +565 -0
  89. nomenklatura/settings.py +17 -0
  90. nomenklatura/store/__init__.py +41 -0
  91. nomenklatura/store/base.py +130 -0
  92. nomenklatura/store/level.py +272 -0
  93. nomenklatura/store/memory.py +102 -0
  94. nomenklatura/store/redis_.py +131 -0
  95. nomenklatura/store/sql.py +219 -0
  96. nomenklatura/store/util.py +48 -0
  97. nomenklatura/store/versioned.py +371 -0
  98. nomenklatura/tui/__init__.py +17 -0
  99. nomenklatura/tui/app.py +294 -0
  100. nomenklatura/tui/app.tcss +52 -0
  101. nomenklatura/tui/comparison.py +81 -0
  102. nomenklatura/tui/util.py +35 -0
  103. nomenklatura/util.py +26 -0
  104. nomenklatura/versions.py +119 -0
  105. nomenklatura/wikidata/__init__.py +14 -0
  106. nomenklatura/wikidata/client.py +122 -0
  107. nomenklatura/wikidata/lang.py +94 -0
  108. nomenklatura/wikidata/model.py +139 -0
  109. nomenklatura/wikidata/props.py +70 -0
  110. nomenklatura/wikidata/qualified.py +49 -0
  111. nomenklatura/wikidata/query.py +66 -0
  112. nomenklatura/wikidata/value.py +87 -0
  113. nomenklatura/xref.py +125 -0
  114. nomenklatura_mpt-4.1.9.dist-info/METADATA +159 -0
  115. nomenklatura_mpt-4.1.9.dist-info/RECORD +118 -0
  116. nomenklatura_mpt-4.1.9.dist-info/WHEEL +4 -0
  117. nomenklatura_mpt-4.1.9.dist-info/entry_points.txt +3 -0
  118. nomenklatura_mpt-4.1.9.dist-info/licenses/LICENSE +21 -0
nomenklatura/store/sql.py
@@ -0,0 +1,219 @@
+from typing import Any, Generator, List, Optional, Set, Tuple
+
+from followthemoney import DS, SE, Property, Schema, Statement
+from sqlalchemy import Table, delete, func, select
+from sqlalchemy.engine import Engine, Transaction, create_engine
+from sqlalchemy.dialects.postgresql import insert as psql_insert
+from sqlalchemy.dialects.sqlite import insert as sqlite_insert
+from sqlalchemy.sql.selectable import Select
+
+from nomenklatura import settings
+from nomenklatura.db import get_metadata
+from nomenklatura.resolver import Linker, Identifier
+from nomenklatura.db import make_statement_table
+from nomenklatura.store import Store, View, Writer
+
+
+class SQLStore(Store[DS, SE]):
+    def __init__(
+        self,
+        dataset: DS,
+        linker: Linker[SE],
+        uri: str = settings.DB_URL,
+        **engine_kwargs: Any,
+    ):
+        super().__init__(dataset, linker)
+        if "pool_size" not in engine_kwargs:
+            engine_kwargs["pool_size"] = settings.DB_POOL_SIZE
+        # if uri.lower().startswith("sqlite"):
+        #     engine_kwargs.pop("pool_size", None)
+        metadata = get_metadata()
+        self.engine: Engine = create_engine(uri, **engine_kwargs)
+        self.table = make_statement_table(metadata)
+        metadata.create_all(self.engine, tables=[self.table], checkfirst=True)
+
+    def writer(self) -> Writer[DS, SE]:
+        return SQLWriter(self)
+
+    def view(self, scope: DS, external: bool = False) -> View[DS, SE]:
+        return SQLView(self, scope, external=external)
+
+    def _execute(
+        self, q: Select[Any], stream: bool = True
+    ) -> Generator[Any, None, None]:
+        # execute any read query against sql backend
+        with self.engine.connect() as conn:
+            if stream:
+                conn = conn.execution_options(stream_results=True)
+            cursor = conn.execute(q)
+            while rows := cursor.fetchmany(10_000):
+                yield from rows
+
+    def _iterate_stmts(
+        self, q: Select[Any], stream: bool = True
+    ) -> Generator[Statement, None, None]:
+        for row in self._execute(q, stream=stream):
+            yield Statement.from_db_row(row)
+
+    def _iterate(
+        self, q: Select[Any], stream: bool = True
+    ) -> Generator[SE, None, None]:
+        current_id = None
+        current_stmts: list[Statement] = []
+        for stmt in self._iterate_stmts(q, stream=stream):
+            entity_id = stmt.entity_id
+            if current_id is None:
+                current_id = entity_id
+            if current_id != entity_id:
+                proxy = self.assemble(current_stmts)
+                if proxy is not None:
+                    yield proxy
+                current_id = entity_id
+                current_stmts = []
+            current_stmts.append(stmt)
+        if len(current_stmts):
+            proxy = self.assemble(current_stmts)
+            if proxy is not None:
+                yield proxy
+
+
+class SQLWriter(Writer[DS, SE]):
+    BATCH_STATEMENTS = 10_000
+
+    def __init__(self, store: SQLStore[DS, SE]):
+        self.store: SQLStore[DS, SE] = store
+        self.batch: Set[Statement] = set()
+        self.conn = self.store.engine.connect()
+        self.tx: Optional[Transaction] = None
+
+    def _upsert_batch(self) -> None:
+        if not len(self.batch):
+            return
+        values = [s.to_db_row() for s in self.batch]
+        if self.tx is None:
+            self.tx = self.conn.begin()
+        if self.store.engine.dialect.name == "sqlite":
+            ilstmt = sqlite_insert(self.store.table).values(values)
+            lstmt = ilstmt.on_conflict_do_update(
+                index_elements=["id"],
+                set_=dict(
+                    canonical_id=ilstmt.excluded.canonical_id,
+                    schema=ilstmt.excluded.schema,
+                    prop_type=ilstmt.excluded.prop_type,
+                    lang=ilstmt.excluded.lang,
+                    original_value=ilstmt.excluded.original_value,
+                    last_seen=ilstmt.excluded.last_seen,
+                ),
+            )
+            self.conn.execute(lstmt)
+        elif self.store.engine.dialect.name in ("postgresql", "postgres"):
+            ipstmt = psql_insert(self.store.table).values(values)
+            pstmt = ipstmt.on_conflict_do_update(
+                index_elements=["id"],
+                set_=dict(
+                    canonical_id=ipstmt.excluded.canonical_id,
+                    schema=ipstmt.excluded.schema,
+                    prop_type=ipstmt.excluded.prop_type,
+                    lang=ipstmt.excluded.lang,
+                    original_value=ipstmt.excluded.original_value,
+                    last_seen=ipstmt.excluded.last_seen,
+                ),
+            )
+            self.conn.execute(pstmt)
+        else:
+            msg = f"Upsert not implemented for dialect {self.store.engine.dialect.name}"
+            raise NotImplementedError(msg)
+        self.batch = set()
+
+    def flush(self) -> None:
+        if len(self.batch):
+            self._upsert_batch()
+        if self.tx is not None:
+            self.tx.commit()
+            self.tx = None
+
+    def add_statement(self, stmt: Statement) -> None:
+        if stmt.entity_id is None:
+            return
+        canonical_id = self.store.linker.get_canonical(stmt.entity_id)
+        stmt.canonical_id = canonical_id
+        self.batch.add(stmt)
+        if len(self.batch) >= self.BATCH_STATEMENTS:
+            self._upsert_batch()
+
+    def pop(self, entity_id: str) -> List[Statement]:
+        if self.tx is None:
+            self.tx = self.conn.begin()
+
+        table = self.store.table
+        q = select(table)
+        q = q.where(table.c.canonical_id == entity_id)
+        statements: List[Statement] = []
+        cursor = self.conn.execute(q)
+        for row in cursor.fetchall():
+            statements.append(Statement.from_db_row(row))
+
+        q_delete = delete(table)
+        q_delete = q_delete.where(table.c.canonical_id == entity_id)
+        self.conn.execute(q_delete)
+        return statements
+
+
+class SQLView(View[DS, SE]):
+    def __init__(
+        self, store: SQLStore[DS, SE], scope: DS, external: bool = False
+    ) -> None:
+        super().__init__(store, scope, external=external)
+        self.store: SQLStore[DS, SE] = store
+
+    def get_entity(self, id: str) -> Optional[SE]:
+        table = self.store.table
+        q = select(table)
+        q = q.where(table.c.canonical_id == id)
+        q = q.where(table.c.dataset.in_(self.dataset_names))
+        for proxy in self.store._iterate(q, stream=False):
+            return proxy
+        return None
+
+    def has_entity(self, id: str) -> bool:
+        table = self.store.table
+        q = select(func.count(table.c.id))
+        q = q.where(table.c.canonical_id == id)
+        q = q.where(table.c.dataset.in_(self.dataset_names))
+        with self.store.engine.connect() as conn:
+            cursor = conn.execute(q)
+            count = cursor.scalar()
+            if count is not None and count > 0:
+                return True
+            else:
+                return False
+
+    def get_inverted(self, id: str) -> Generator[Tuple[Property, SE], None, None]:
+        table = self.store.table
+        id_ = Identifier.get(id)
+        ids = [i.id for i in self.store.linker.connected(id_)]
+        q = select(table.c.canonical_id)
+        q = q.where(table.c.prop_type == "entity")
+        q = q.where(table.c.value.in_(ids))
+        q = q.where(table.c.dataset.in_(self.dataset_names))
+        q = q.group_by(table.c.canonical_id)
+        with self.store.engine.connect() as conn:
+            cursor = conn.execute(q)
+            for (canonical_id,) in cursor.fetchall():
+                if canonical_id is None:
+                    continue
+                entity = self.get_entity(canonical_id)
+                if entity is not None:
+                    for prop, value in entity.itervalues():
+                        if value == id and prop.reverse is not None:
+                            yield prop.reverse, entity
+
+    def entities(self, include_schemata: Optional[List[Schema]] = None) -> Generator[SE, None, None]:
+        table: Table = self.store.table
+        q = select(table)
+        q = q.where(table.c.dataset.in_(self.dataset_names))
+        q = q.order_by(table.c.canonical_id)
+        for entity in self.store._iterate(q, stream=True):
+            if include_schemata is not None and entity.schema not in include_schemata:
+                continue
+            yield entity
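
Taken together, SQLStore, SQLWriter and SQLView implement the statement store on top of SQLAlchemy: the writer batches dialect-specific upserts into the statement table, and the view answers entity lookups with filtered SELECTs. The following is a minimal usage sketch only; the Dataset and Linker constructors (and the exact Statement keyword arguments) are assumptions borrowed from other nomenklatura releases and are not part of this diff.

    from followthemoney import Dataset, Statement  # Dataset import path is an assumption
    from nomenklatura.resolver import Linker
    from nomenklatura.store.sql import SQLStore

    # Hypothetical setup: a throwaway SQLite file and an empty (identity) linker.
    dataset = Dataset.make({"name": "demo", "title": "Demo dataset"})
    store = SQLStore(dataset, Linker({}), uri="sqlite:///demo_store.db")

    writer = store.writer()
    writer.add_statement(
        Statement(
            id="stmt-1",
            entity_id="acme-inc",
            prop="name",
            schema="Company",
            value="ACME, Inc.",
            dataset="demo",
        )
    )
    writer.flush()  # upserts the pending batch and commits the open transaction

    view = store.view(dataset)
    print(view.has_entity("acme-inc"), view.get_entity("acme-inc"))

Note that flush() both writes any remaining batch and commits the writer's transaction, so readers on separate connections only see the data after it has been called.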
nomenklatura/store/util.py
@@ -0,0 +1,48 @@
+import orjson
+
+from followthemoney import Statement
+from followthemoney.statement.util import pack_prop, unpack_prop
+
+
+def pack_statement(stmt: Statement) -> bytes:
+    values = (
+        stmt.id,
+        stmt.entity_id,
+        stmt.dataset,
+        pack_prop(stmt.schema, stmt.prop),
+        stmt.value,
+        stmt.lang,
+        stmt.original_value,
+        stmt.first_seen,
+        stmt.last_seen,
+    )
+    return orjson.dumps(values)
+
+
+def unpack_statement(data: bytes, canonical_id: str, external: bool) -> Statement:
+    (
+        id,
+        entity_id,
+        dataset,
+        prop_id,
+        value,
+        lang,
+        original_value,
+        first_seen,
+        last_seen,
+    ) = orjson.loads(data)
+    schema, _, prop = unpack_prop(prop_id)
+    return Statement(
+        id=id,
+        entity_id=entity_id,
+        prop=prop,
+        schema=schema,
+        value=value,
+        lang=lang,
+        dataset=dataset,
+        original_value=original_value,
+        first_seen=first_seen,
+        last_seen=last_seen,
+        canonical_id=canonical_id,
+        external=external,
+    )
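
pack_statement and unpack_statement serialize a statement to a compact orjson array and back, with the canonical ID and external flag carried out-of-band by the caller. A round-trip sketch, assuming the same Statement keyword arguments that unpack_statement itself uses:

    from followthemoney import Statement
    from nomenklatura.store.util import pack_statement, unpack_statement

    # Hypothetical statement; the field names mirror those used by unpack_statement above.
    stmt = Statement(
        id="stmt-1",
        entity_id="acme-inc",
        prop="name",
        schema="Company",
        value="ACME, Inc.",
        dataset="demo",
    )
    blob = pack_statement(stmt)  # bytes holding a nine-element JSON array
    copy = unpack_statement(blob, canonical_id="acme-inc", external=False)
    assert copy.value == stmt.value and copy.dataset == stmt.dataset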
nomenklatura/store/versioned.py
@@ -0,0 +1,371 @@
+import orjson
+import logging
+from redis.client import Redis
+from typing import Generator, List, Optional, Set, Tuple, Dict
+from followthemoney import DS, SE, Schema, registry, Property, Statement
+from followthemoney.statement.util import pack_prop, unpack_prop
+
+from nomenklatura.kv import b, bv, get_redis, close_redis
+from nomenklatura.versions import Version
+from nomenklatura.resolver import Linker, Identifier, StrIdent
+from nomenklatura.store.base import Store, View, Writer
+
+log = logging.getLogger(__name__)
+
+
+def _pack_statement(stmt: Statement) -> bytes:
+    values = (
+        stmt.id,
+        stmt.entity_id,
+        stmt.dataset,
+        pack_prop(stmt.schema, stmt.prop),
+        stmt.value,
+        stmt.lang or 0,
+        stmt.original_value or 0,
+        stmt.first_seen,
+        stmt.last_seen,
+        1 if stmt.external else 0,
+    )
+    return orjson.dumps(values)
+
+
+def _unpack_statement(data: bytes, canonical_id: Optional[str] = None) -> Statement:
+    (
+        id,
+        entity_id,
+        dataset,
+        prop_id,
+        value,
+        lang,
+        original_value,
+        first_seen,
+        last_seen,
+        external,
+    ) = orjson.loads(data)
+    schema, _, prop = unpack_prop(prop_id)
+    return Statement(
+        id=id,
+        entity_id=entity_id,
+        prop=prop,
+        schema=schema,
+        value=value,
+        lang=None if lang == 0 else lang,
+        dataset=dataset,
+        original_value=None if original_value == 0 else original_value,
+        first_seen=first_seen,
+        last_seen=last_seen,
+        canonical_id=canonical_id or entity_id,
+        external=external == 1,
+    )
+
+
+class VersionedRedisStore(Store[DS, SE]):
+    def __init__(
+        self,
+        dataset: DS,
+        linker: Linker[SE],
+        db: Optional["Redis[bytes]"] = None,
+    ):
+        super().__init__(dataset, linker)
+        if db is None:
+            db = get_redis()
+        self.db = db
+
+    def writer(
+        self,
+        dataset: Optional[DS] = None,
+        version: Optional[str] = None,
+        timestamps: bool = False,
+    ) -> "VersionedRedisWriter[DS, SE]":
+        if version is None:
+            version = Version.new().id
+        dataset = dataset or self.dataset
+        return VersionedRedisWriter(
+            self,
+            dataset=dataset,
+            version=version,
+            timestamps=timestamps,
+        )
+
+    def view(
+        self, scope: DS, external: bool = False, versions: Dict[str, str] = {}
+    ) -> "VersionedRedisView[DS, SE]":
+        return VersionedRedisView(self, scope, external=external, versions=versions)
+
+    def update(self, id: StrIdent) -> None:
+        # Noop because the VersionedStore is not resolved.
+        return
+
+    def get_latest(self, dataset: str) -> Optional[str]:
+        """Get the latest version of a dataset in the store."""
+        val = self.db.get(b(f"ds:{dataset}:latest"))
+        return val.decode("utf-8") if val is not None else None
+
+    def get_history(self, dataset: str) -> List[str]:
+        """List all versions of a dataset present in the store."""
+        values = self.db.lrange(f"ds:{dataset}:history", 0, -1)
+        return [v.decode("utf-8") for v in values]
+
+    def has_version(self, dataset: str, version: str) -> bool:
+        """Check if a specific version of a dataset exists in the store."""
+        return self.db.exists(f"ents:{dataset}:{version}") > 0
+
+    def release_version(self, dataset: str, version: str) -> None:
+        """Release the given version of the dataset (i.e. tag it as the latest
+        version in the relevant lookup key)."""
+        history_key = b(f"ds:{dataset}:history")
+        idx = self.db.lpos(history_key, b(version))
+        if idx is None:
+            self.db.lpush(history_key, b(version))
+        latest = self.db.lindex(history_key, 0)
+        if latest is not None:
+            self.db.set(b(f"ds:{dataset}:latest"), latest)
+        log.info("Released store version: %s (%s)", dataset, version)
+
+    def drop_version(self, dataset: str, version: str) -> None:
+        """Delete all data associated with a specific version of a dataset."""
+        pipeline = self.db.pipeline()
+        cmds = 0
+        for prefix in ["stmt", "ents", "inv"]:
+            query = f"{prefix}:{dataset}:{version}*"
+            for key in self.db.scan_iter(query):
+                pipeline.delete(key)
+                cmds += 1
+                if cmds > 1_000:
+                    pipeline.execute()
+                    pipeline = self.db.pipeline()
+                    cmds = 0
+        if cmds > 0:
+            pipeline.execute()
+
+        # TODO: do we even want to remove the version from the history list?
+        self.db.lrem(f"ds:{dataset}:history", 0, b(version))
+        latest_key = f"ds:{dataset}:latest"
+        if b(version) == self.db.get(latest_key):
+            previous = self.db.lindex(b(f"ds:{dataset}:history"), 0)
+            if previous is not None:
+                self.db.set(latest_key, previous)
+            else:
+                self.db.delete(latest_key)
+        log.info("Dropped store version: %s (%s)", dataset, version)
+
+    def close(self) -> None:
+        close_redis()
+
+
+class VersionedRedisWriter(Writer[DS, SE]):
+    BATCH_STATEMENTS = 2_000
+
+    def __init__(
+        self,
+        store: VersionedRedisStore[DS, SE],
+        dataset: DS,
+        version: str,
+        timestamps: bool = False,
+    ):
+        self.version = version
+        self.dataset = dataset
+        self.timestamps = timestamps
+        self.ver = f"{dataset.name}:{version}"
+        self.store: VersionedRedisStore[DS, SE] = store
+        self.prev = store.get_latest(dataset.name)
+        self.buffer: List[Statement] = []
+
+    def __enter__(self) -> "VersionedRedisWriter[DS, SE]":
+        return self
+
+    def flush(self) -> None:
+        db = self.store.db
+        pipeline = db.pipeline()
+
+        statements: Dict[str, Set[Statement]] = {}
+        for stmt in self.buffer:
+            if stmt.entity_id not in statements:
+                statements[stmt.entity_id] = set()
+            statements[stmt.entity_id].add(stmt)
+
+        if len(statements) == 0:
+            return
+
+        # Merge with previous version to get accurate first_seen timestamps
+        if self.timestamps and self.prev:
+            keys = [b(f"stmt:{self.prev}:{e}") for e in statements.keys()]
+            for v in db.sunion(keys):
+                pstmt = _unpack_statement(bv(v))
+                for stmt in self.buffer:
+                    if pstmt.id == stmt.id:
+                        stmt.first_seen = pstmt.first_seen
+                        break
+
+        for entity_id, stmts in statements.items():
+            b_entity_id = b(entity_id)
+            pipeline.sadd(b(f"ents:{self.ver}"), b_entity_id)
+            values = [_pack_statement(s) for s in stmts]
+            pipeline.sadd(f"stmt:{self.ver}:{entity_id}", *values)
+
+            for stmt in stmts:
+                if stmt.prop_type == registry.entity.name:
+                    pipeline.sadd(b(f"inv:{self.ver}:{stmt.value}"), b_entity_id)
+
+        pipeline.execute()
+        self.buffer = []
+
+    def release(self) -> None:
+        """Release the current version of the dataset (i.e. tag it as the latest
+        version in the relevant lookup key)."""
+        self.store.release_version(self.dataset.name, self.version)
+
+    def close(self) -> None:
+        self.release()
+        self.store.close()
+
+    def add_statement(self, stmt: Statement) -> None:
+        if stmt.entity_id is None:
+            return
+        self.buffer.append(stmt)
+        if len(self.buffer) >= self.BATCH_STATEMENTS:
+            self.flush()
+
+    def pop(self, entity_id: str) -> List[Statement]:
+        raise NotImplementedError()
+
+
+class VersionedRedisView(View[DS, SE]):
+    def __init__(
+        self,
+        store: VersionedRedisStore[DS, SE],
+        scope: DS,
+        external: bool = False,
+        versions: Dict[str, str] = {},
+    ) -> None:
+        super().__init__(store, scope, external=external)
+        self.store: VersionedRedisStore[DS, SE] = store
+
+        # Get the latest version for each dataset in the scope
+        self.vers: List[Tuple[str, str]] = []
+        for ds in scope.leaf_names:
+            version = versions.get(ds, self.store.get_latest(ds))
+            if version is not None:
+                self.vers.append((ds, version))
+
+    def _get_stmt_keys(self, entity_id: str) -> List[str]:
+        keys: List[str] = []
+        ident = Identifier.get(entity_id)
+        for id in self.store.linker.connected(ident):
+            keys.extend([f"stmt:{d}:{v}:{id}" for d, v in self.vers])
+        return keys
+
+    def has_entity(self, id: str) -> bool:
+        # FIXME: this implementation does not account for the `external` flag
+        # correctly because it does not check the `stmt.external` field for
+        # each statement.
+        return self.store.db.exists(*self._get_stmt_keys(id)) > 0
+
+    def _get_statements(self, id: str) -> Generator[Statement, None, None]:
+        keys = self._get_stmt_keys(id)
+        if len(keys) == 0:
+            return None
+        elif len(keys) == 1:
+            stmts = self.store.db.smembers(keys[0])
+        else:
+            stmts = {bv(s) for s in self.store.db.sunion(keys)}
+        for v in stmts:
+            stmt = _unpack_statement(bv(v), id)
+            yield stmt
+
+    def get_timestamps(self, id: str) -> Dict[str, str]:
+        """Get the first seen timestamps associated with all statements of an entity.
+
+        Returns a dictionary mapping statement IDs to their first seen timestamps.
+        This can be used by an ETL to generate continuous entity histories.
+        """
+        timestamps: Dict[str, str] = {}
+        for stmt in self._get_statements(id):
+            if stmt.id is not None and stmt.first_seen is not None:
+                timestamps[stmt.id] = stmt.first_seen
+        return timestamps
+
+    def get_entity(self, id: str) -> Optional[SE]:
+        statements: List[Statement] = []
+        for stmt in self._get_statements(id):
+            if not stmt.external or self.external:
+                stmt.canonical_id = self.store.linker.get_canonical(stmt.entity_id)
+                if stmt.prop_type == registry.entity.name:
+                    stmt.value = self.store.linker.get_canonical(stmt.value)
+                statements.append(stmt)
+        return self.store.assemble(statements)
+
+    def get_inverted(self, id: str) -> Generator[Tuple[Property, SE], None, None]:
+        keys: List[str] = []
+        ident = Identifier.get(id)
+        for ent_id in self.store.linker.connected(ident):
+            keys.extend([f"inv:{d}:{v}:{ent_id}" for d, v in self.vers])
+        refs = (
+            {bv(v) for v in self.store.db.sunion(keys)}
+            if len(keys) > 0
+            else self.store.db.smembers(keys[0])
+        )
+        entities: Set[str] = set()
+        for v in refs:
+            entity_id = v.decode("utf-8")
+            entities.add(self.store.linker.get_canonical(entity_id))
+        for entity_id in entities:
+            entity = self.get_entity(entity_id)
+            if entity is None:
+                continue
+            for prop, value in entity.itervalues():
+                if value == id and prop.reverse is not None:
+                    yield prop.reverse, entity
+
+    def statements(self, resolve: bool = False) -> Generator[Statement, None, None]:
+        """Iterate over all statements in the view. If `resolve` is set to `True`,
+        canonical IDs are applied to the statement and its value.
+
+        NOTE: The `external` flag of the view will be used to filter statements, too.
+        """
+        for ds, ver in self.vers:
+            for id in self.store.db.sscan_iter(b(f"ents:{ds}:{ver}")):
+                entity_id = id.decode("utf-8")
+                stmt_key = f"stmt:{ds}:{ver}:{entity_id}"
+                for stmt_text in self.store.db.smembers(b(stmt_key)):
+                    stmt = _unpack_statement(stmt_text, entity_id)
+                    if stmt.external and not self.external:
+                        continue
+                    if resolve:
+                        stmt = self.store.linker.apply_statement(stmt)
+                    yield stmt
+
+    def entities(self, include_schemata: Optional[List[Schema]] = None) -> Generator[SE, None, None]:
+        if len(self.vers) == 0:
+            return
+        if len(self.vers) == 1:
+            scope_name = b(f"ents:{self.vers[0][0]}:{self.vers[0][1]}")
+        else:
+            version = Version.new().id + ":iter"
+            scope_name = b(f"ents:{self.scope.name}:{version}")
+            parts = [b(f"ents:{d}:{v}") for d, v in self.vers]
+            self.store.db.sunionstore(scope_name, parts)
+
+        # Keep track of canonical entities to avoid yielding the same
+        # de-duplicated entity multiple times. This intrinsically leaks
+        # memory, so we're being careful to only record entity IDs
+        # that are part of a cluster with more than one ID.
+        try:
+            seen: Set[str] = set()
+            for id in self.store.db.sscan_iter(scope_name):
+                entity_id = id.decode("utf-8")
+                ident = Identifier.get(entity_id)
+                connected = self.store.linker.connected(ident)
+                if len(connected) > 1:
+                    canonical_id = max(connected).id
+                    if canonical_id in seen:
+                        continue
+                    seen.add(canonical_id)
+                entity = self.get_entity(entity_id)
+                if entity is not None:
+                    if include_schemata is not None and entity.schema not in include_schemata:
+                        continue
+                    yield entity
+        finally:
+            if len(self.vers) > 1:
+                self.store.db.delete(scope_name)
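
The versioned Redis store keeps each dataset version under separate key prefixes (ents:, stmt:, inv:) until release_version tags it as the latest or drop_version deletes it. Below is a sketch of the write, release and read cycle, assuming a Redis instance reachable via nomenklatura.kv.get_redis() and reusing the hypothetical dataset, linker and statement objects from the SQL sketch above:

    from nomenklatura.store.versioned import VersionedRedisStore

    store = VersionedRedisStore(dataset, Linker({}))  # falls back to get_redis() when no client is passed

    writer = store.writer(version="v1")  # statements land under keys like stmt:demo:v1:<entity_id>
    writer.add_statement(stmt)
    writer.flush()
    writer.release()                     # tags version v1 as ds:demo:latest

    view = store.view(dataset)           # resolves the latest released version per leaf dataset
    print(store.get_latest("demo"), view.has_entity("acme-inc"))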
nomenklatura/tui/__init__.py
@@ -0,0 +1,17 @@
+from typing import Optional
+from followthemoney import DS, SE
+
+from nomenklatura.store import Store
+
+from nomenklatura.tui.app import DedupeApp, DedupeState
+from nomenklatura.resolver import Resolver
+
+__all__ = ["dedupe_ui"]
+
+
+def dedupe_ui(
+    resolver: Resolver[SE], store: Store[DS, SE], url_base: Optional[str] = None
+) -> None:
+    app = DedupeApp()
+    app.dedupe = DedupeState(resolver, store, url_base=url_base)
+    app.run()
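
dedupe_ui is the public entry point that wires a resolver and a store into the terminal dedupe application. A minimal invocation, assuming resolver and store objects have already been constructed as in the sketches above:

    from nomenklatura.tui import dedupe_ui

    # resolver: Resolver[SE], store: Store[DS, SE]; both assumed to exist already.
    dedupe_ui(resolver, store)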