followthemoney 3.8.5__py3-none-any.whl → 4.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- followthemoney/__init__.py +30 -10
- followthemoney/cli/cli.py +1 -1
- followthemoney/cli/exports.py +6 -2
- followthemoney/cli/statement.py +62 -0
- followthemoney/cli/util.py +2 -3
- followthemoney/compare.py +26 -16
- followthemoney/dataset/__init__.py +17 -0
- followthemoney/dataset/catalog.py +77 -0
- followthemoney/dataset/coverage.py +29 -0
- followthemoney/dataset/dataset.py +146 -0
- followthemoney/dataset/publisher.py +25 -0
- followthemoney/dataset/resource.py +30 -0
- followthemoney/dataset/util.py +55 -0
- followthemoney/entity.py +73 -0
- followthemoney/exc.py +6 -0
- followthemoney/export/rdf.py +57 -5
- followthemoney/graph.py +1 -2
- followthemoney/model.py +38 -11
- followthemoney/names.py +33 -0
- followthemoney/ontology.py +18 -16
- followthemoney/property.py +12 -15
- followthemoney/proxy.py +43 -64
- followthemoney/schema/Analyzable.yaml +2 -3
- followthemoney/schema/BankAccount.yaml +2 -3
- followthemoney/schema/Company.yaml +0 -6
- followthemoney/schema/Contract.yaml +0 -1
- followthemoney/schema/CryptoWallet.yaml +1 -1
- followthemoney/schema/Document.yaml +0 -6
- followthemoney/schema/Interval.yaml +7 -0
- followthemoney/schema/LegalEntity.yaml +6 -0
- followthemoney/schema/License.yaml +2 -0
- followthemoney/schema/Page.yaml +0 -1
- followthemoney/schema/Person.yaml +0 -5
- followthemoney/schema/Sanction.yaml +1 -0
- followthemoney/schema/Thing.yaml +0 -2
- followthemoney/schema/UserAccount.yaml +6 -3
- followthemoney/schema.py +30 -42
- followthemoney/statement/__init__.py +19 -0
- followthemoney/statement/entity.py +438 -0
- followthemoney/statement/serialize.py +251 -0
- followthemoney/statement/statement.py +256 -0
- followthemoney/statement/util.py +31 -0
- followthemoney/types/__init__.py +66 -23
- followthemoney/types/address.py +3 -3
- followthemoney/types/checksum.py +3 -7
- followthemoney/types/common.py +9 -14
- followthemoney/types/country.py +3 -7
- followthemoney/types/date.py +21 -11
- followthemoney/types/email.py +0 -4
- followthemoney/types/entity.py +5 -11
- followthemoney/types/gender.py +6 -10
- followthemoney/types/identifier.py +9 -3
- followthemoney/types/ip.py +5 -9
- followthemoney/types/json.py +2 -2
- followthemoney/types/language.py +3 -7
- followthemoney/types/mimetype.py +4 -8
- followthemoney/types/name.py +7 -8
- followthemoney/types/number.py +88 -6
- followthemoney/types/phone.py +4 -11
- followthemoney/types/string.py +4 -4
- followthemoney/types/topic.py +3 -7
- followthemoney/types/url.py +5 -10
- followthemoney/util.py +12 -13
- followthemoney/value.py +67 -0
- {followthemoney-3.8.5.dist-info → followthemoney-4.0.1.dist-info}/METADATA +23 -8
- {followthemoney-3.8.5.dist-info → followthemoney-4.0.1.dist-info}/RECORD +69 -59
- {followthemoney-3.8.5.dist-info → followthemoney-4.0.1.dist-info}/entry_points.txt +1 -0
- followthemoney/offshore.py +0 -48
- followthemoney/rdf.py +0 -9
- followthemoney/schema/Assessment.yaml +0 -32
- followthemoney/schema/Post.yaml +0 -42
- followthemoney/types/iban.py +0 -58
- followthemoney/types/registry.py +0 -52
- {followthemoney-3.8.5.dist-info → followthemoney-4.0.1.dist-info}/WHEEL +0 -0
- {followthemoney-3.8.5.dist-info → followthemoney-4.0.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,438 @@
|
|
|
1
|
+
from hashlib import sha1
|
|
2
|
+
from collections.abc import Mapping
|
|
3
|
+
from typing import Any, Dict, List, Optional, Set, Type
|
|
4
|
+
from typing import Generator, Iterable, Tuple, TypeVar
|
|
5
|
+
|
|
6
|
+
from followthemoney.model import Model
|
|
7
|
+
from followthemoney.exc import InvalidData
|
|
8
|
+
from followthemoney.types.common import PropertyType
|
|
9
|
+
from followthemoney.property import Property
|
|
10
|
+
from followthemoney.util import gettext
|
|
11
|
+
from followthemoney.proxy import P
|
|
12
|
+
from followthemoney.types import registry
|
|
13
|
+
from followthemoney.value import string_list, Values
|
|
14
|
+
from followthemoney.proxy import EntityProxy
|
|
15
|
+
from followthemoney.dataset import Dataset, DefaultDataset
|
|
16
|
+
from followthemoney.statement.statement import Statement
|
|
17
|
+
from followthemoney.statement.util import BASE_ID
|
|
18
|
+
|
|
19
|
+
# Self-type for fluent methods and alternate constructors on StatementEntity.
SE = TypeVar("SE", bound="StatementEntity")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class StatementEntity(EntityProxy):
    """An entity object that can link to a set of datasets that it is sourced from.

    Unlike the base ``EntityProxy``, property values are stored as ``Statement``
    objects, so every value keeps its source dataset, language, provenance
    timestamps and a stable statement ID.
    """

    __slots__ = (
        "schema",
        "id",
        "_caption",
        "extra_referents",
        "dataset",
        "last_change",
        "_statements",
    )

    def __init__(self, dataset: Dataset, data: Dict[str, Any], cleaned: bool = True):
        """Build an entity from serialised ``data``.

        :param dataset: the default dataset assigned to newly created statements.
        :param data: a dict with ``schema``, ``id`` and either ``properties``
            (plain values) or ``statements`` (full statement dicts), as produced
            by :meth:`to_dict` / :meth:`to_statement_dict`.
        :param cleaned: if True, property values are assumed to be validated
            already and are not re-cleaned.
        :raises InvalidData: if no valid schema is given.
        """
        data = dict(data or {})
        schema = Model.instance().get(data.pop("schema", None))
        if schema is None:
            raise InvalidData(gettext("No schema for entity."))
        self.schema = schema

        self._caption: Optional[str] = None
        """A pre-computed label for this entity."""

        self.extra_referents: Set[str] = set(data.pop("referents", []))
        """The IDs of all entities which are included in this canonical entity."""

        self.last_change: Optional[str] = data.get("last_change", None)
        """The last time this entity was changed."""

        self.dataset = dataset
        """The default dataset for new statements."""

        self.id: Optional[str] = data.pop("id", None)
        # Statements are keyed by property name; a set de-duplicates identical
        # statements for the same property.
        self._statements: Dict[str, Set[Statement]] = {}

        # Plain-dict form: each value becomes a fresh statement on the default
        # dataset.
        properties = data.pop("properties", None)
        if isinstance(properties, Mapping):
            for key, value in properties.items():
                self.add(key, value, cleaned=cleaned, quiet=True)

        # Full statement form: re-attach each statement to this canonical ID.
        for stmt_data in data.pop("statements", []):
            stmt = Statement.from_dict(stmt_data)
            if self.id is not None:
                stmt.canonical_id = self.id
            self.add_statement(stmt)

    @property
    def _properties(self) -> Dict[str, List[str]]:  # type: ignore
        # Compatibility shim for the base-class attribute: a plain mapping of
        # property name -> values (may contain duplicate values).
        return {p: [s.value for s in v] for p, v in self._statements.items()}

    def _iter_stmt(self) -> Generator[Statement, None, None]:
        # NOTE: this iterator mutates statements in place — missing entity IDs
        # are back-filled from the entity, and missing statement IDs are
        # (re-)generated so every yielded statement is fully keyed.
        for stmts in self._statements.values():
            for stmt in stmts:
                if stmt.entity_id is None and self.id is not None:
                    stmt.entity_id = self.id
                    stmt.id = stmt.generate_key()
                if stmt.id is None:
                    stmt.id = stmt.generate_key()
                yield stmt

    @property
    def statements(self) -> Generator[Statement, None, None]:
        """Return all statements for this entity, with extra ID statement."""
        ids: List[str] = []
        last_seen: Set[str] = set()
        first_seen: Set[str] = set()
        for stmt in self._iter_stmt():
            yield stmt
            if stmt.id is not None:
                ids.append(stmt.id)
            if stmt.last_seen is not None:
                last_seen.add(stmt.last_seen)
            if stmt.first_seen is not None:
                first_seen.add(stmt.first_seen)
        if self.id is not None:
            # Synthesize a BASE_ID statement whose value is a checksum over the
            # schema name and the sorted IDs of all other statements. Sorting
            # makes the checksum independent of iteration order.
            digest = sha1(self.schema.name.encode("utf-8"))
            for id in sorted(ids):
                digest.update(id.encode("utf-8"))
            checksum = digest.hexdigest()
            # This is to make the last_change value stable across
            # serialisation:
            first = self.last_change or min(first_seen, default=None)
            yield Statement(
                canonical_id=self.id,
                entity_id=self.id,
                prop=BASE_ID,
                schema=self.schema.name,
                value=checksum,
                dataset=self.dataset.name,
                first_seen=first,
                last_seen=max(last_seen, default=None),
            )

    @property
    def first_seen(self) -> Optional[str]:
        """Earliest ``first_seen`` timestamp across all statements, if any."""
        seen = (s.first_seen for s in self._iter_stmt() if s.first_seen is not None)
        return min(seen, default=None)

    @property
    def last_seen(self) -> Optional[str]:
        """Latest ``last_seen`` timestamp across all statements, if any."""
        seen = (s.last_seen for s in self._iter_stmt() if s.last_seen is not None)
        return max(seen, default=None)

    @property
    def datasets(self) -> Set[str]:
        """Names of all datasets that contributed statements to this entity."""
        datasets: Set[str] = set()
        for stmt in self._iter_stmt():
            datasets.add(stmt.dataset)
        return datasets

    @property
    def referents(self) -> Set[str]:
        """All source entity IDs merged into this canonical entity."""
        referents: Set[str] = set(self.extra_referents)
        for stmt in self._iter_stmt():
            if stmt.entity_id is not None and stmt.entity_id != self.id:
                referents.add(stmt.entity_id)
        return referents

    @property
    def key_prefix(self) -> Optional[str]:
        # The dataset name stands in for the key prefix used by the base class.
        return self.dataset.name

    @key_prefix.setter
    def key_prefix(self, dataset: Optional[str]) -> None:
        # Deliberately unsupported: the prefix is derived from the dataset.
        raise NotImplementedError()

    def add_statement(self, stmt: Statement) -> None:
        """Attach a statement, widening the schema to fit if necessary.

        BASE_ID statements are not stored as properties; they only update
        ``last_change`` from their ``first_seen`` timestamp.
        """
        schema = self.schema
        if schema.name != stmt.schema and not schema.is_a(stmt.schema):
            try:
                self.schema = schema.model.common_schema(schema, stmt.schema)
            except InvalidData as exc:
                raise InvalidData(f"{self.id}: {exc}") from exc

        if stmt.prop == BASE_ID:
            if stmt.first_seen is not None:
                # The last_change attribute describes the latest checksum change
                # of any emitted component of the entity, which is stored in the BASE
                # field.
                if self.last_change is None:
                    self.last_change = stmt.first_seen
                else:
                    self.last_change = max(self.last_change, stmt.first_seen)
        else:
            if stmt.prop not in self._statements:
                self._statements[stmt.prop] = set()
            self._statements[stmt.prop].add(stmt)

    def get(self, prop: P, quiet: bool = False) -> List[str]:
        """Return the distinct values of the given property."""
        prop_name = self._prop_name(prop, quiet=quiet)
        if prop_name is None or prop_name not in self._statements:
            return []
        return list({s.value for s in self._statements[prop_name]})

    def get_statements(self, prop: P, quiet: bool = False) -> List[Statement]:
        """Return the raw statements stored for the given property."""
        prop_name = self._prop_name(prop, quiet=quiet)
        if prop_name is None or prop_name not in self._statements:
            return []
        return list(self._statements[prop_name])

    def set(
        self,
        prop: P,
        values: Values,
        cleaned: bool = False,
        quiet: bool = False,
        fuzzy: bool = False,
        format: Optional[str] = None,
        lang: Optional[str] = None,
        original_value: Optional[str] = None,
        origin: Optional[str] = None,
    ) -> None:
        """Replace all statements of a property with the given values."""
        prop_name = self._prop_name(prop, quiet=quiet)
        if prop_name is None:
            return
        # Drop existing statements before re-adding; see add() for parameters.
        self._statements.pop(prop_name, None)
        return self.add(
            prop,
            values,
            cleaned=cleaned,
            quiet=quiet,
            fuzzy=fuzzy,
            format=format,
            lang=lang,
            original_value=original_value,
            origin=origin,
        )

    def add(
        self,
        prop: P,
        values: Values,
        cleaned: bool = False,
        quiet: bool = False,
        fuzzy: bool = False,
        format: Optional[str] = None,
        lang: Optional[str] = None,
        original_value: Optional[str] = None,
        origin: Optional[str] = None,
    ) -> None:
        """Add one or more values for a property, creating statements.

        :param cleaned: skip value cleaning/validation if True.
        :param quiet: silently ignore unknown or stub properties if True.
        """
        prop_name = self._prop_name(prop, quiet=quiet)
        if prop_name is None:
            return None
        prop = self.schema.properties[prop_name]
        for value in string_list(values, sanitize=not cleaned):
            self.unsafe_add(
                prop,
                value,
                cleaned=cleaned,
                fuzzy=fuzzy,
                format=format,
                quiet=quiet,
                lang=lang,
                original_value=original_value,
                origin=origin,
            )
        return None

    def unsafe_add(
        self,
        prop: Property,
        value: Optional[str],
        cleaned: bool = False,
        fuzzy: bool = False,
        format: Optional[str] = None,
        quiet: bool = False,
        schema: Optional[str] = None,
        dataset: Optional[str] = None,
        seen: Optional[str] = None,
        lang: Optional[str] = None,
        original_value: Optional[str] = None,
        origin: Optional[str] = None,
    ) -> Optional[str]:
        """Add a statement to the entity, possibly the value."""
        if value is None or len(value) == 0:
            return None

        # Don't allow setting the reverse properties:
        if prop.stub:
            if quiet:
                return None
            msg = gettext("Stub property (%s): %s")
            raise InvalidData(msg % (self.schema, prop))

        if lang is not None:
            lang = registry.language.clean_text(lang)

        clean: Optional[str] = value
        if not cleaned:
            clean = prop.type.clean_text(value, proxy=self, fuzzy=fuzzy, format=format)

        if clean is None:
            return None

        # Preserve the raw input whenever cleaning altered it.
        if original_value is None and clean != value:
            original_value = value

        if self.id is None:
            raise InvalidData("Cannot add statement to entity without ID!")
        stmt = Statement(
            entity_id=self.id,
            prop=prop.name,
            schema=schema or self.schema.name,
            value=clean,
            dataset=dataset or self.dataset.name,
            lang=lang,
            original_value=original_value,
            first_seen=seen,
            origin=origin,
        )
        self.add_statement(stmt)
        return clean

    def pop(self, prop: P, quiet: bool = True) -> List[str]:
        """Remove a property and return its distinct values."""
        prop_name = self._prop_name(prop, quiet=quiet)
        if prop_name is None or prop_name not in self._statements:
            return []
        return list({s.value for s in self._statements.pop(prop_name, [])})

    def remove(self, prop: P, value: str, quiet: bool = True) -> None:
        """Remove a single value from a property, keeping other statements."""
        prop_name = self._prop_name(prop, quiet=quiet)
        # NOTE(review): membership is tested against the computed _properties
        # dict rather than _statements directly; keys are the same either way.
        if prop_name is not None and prop_name in self._properties:
            stmts = {s for s in self._statements[prop_name] if s.value != value}
            self._statements[prop_name] = stmts

    def itervalues(self) -> Generator[Tuple[Property, str], None, None]:
        """Yield (property, value) pairs with duplicate values collapsed."""
        for name, statements in self._statements.items():
            prop = self.schema.properties[name]
            for value in set((s.value for s in statements)):
                yield (prop, value)

    def get_type_values(
        self, type_: PropertyType, matchable: bool = False
    ) -> List[str]:
        """Return distinct values of all properties of the given type."""
        combined: Set[str] = set()
        for stmt in self.get_type_statements(type_, matchable=matchable):
            combined.add(stmt.value)
        return list(combined)

    def get_type_statements(
        self, type_: PropertyType, matchable: bool = False
    ) -> List[Statement]:
        """Return statements for all properties of the given type.

        :param matchable: if True, restrict to matchable properties.
        """
        combined = []
        for prop_name, statements in self._statements.items():
            prop = self.schema.properties[prop_name]
            if matchable and not prop.matchable:
                continue
            if prop.type == type_:
                for statement in statements:
                    combined.append(statement)
        return combined

    @property
    def properties(self) -> Dict[str, List[str]]:
        """Plain mapping of property name to de-duplicated values."""
        return {p: list({s.value for s in vs}) for p, vs in self._statements.items()}

    def iterprops(self) -> List[Property]:
        """Return the Property objects that currently hold statements."""
        return [self.schema.properties[p] for p in self._statements.keys()]

    def clone(self: SE) -> SE:
        """Create a copy of this entity sharing the same statements."""
        data = {"schema": self.schema.name, "id": self.id}
        cloned = type(self)(self.dataset, data)
        for stmt in self._iter_stmt():
            cloned.add_statement(stmt)
        return cloned

    def merge(self: SE, other: EntityProxy) -> SE:
        """Absorb another entity's statements (or plain values) into this one.

        :raises InvalidData: if the two schemata have no common descendant.
        """
        try:
            self.schema = self.schema.model.common_schema(self.schema, other.schema)
        except InvalidData as e:
            msg = "Cannot merge entities with id %s: %s"
            raise InvalidData(msg % (self.id, e))

        # Plain proxies carry no statements — copy values onto this entity's
        # default dataset instead.
        if not isinstance(other, StatementEntity):
            for prop, values in other._properties.items():
                self.add(prop, values, cleaned=True, quiet=True)
            return self
        for stmt in other._iter_stmt():
            if self.id is not None:
                stmt.canonical_id = self.id
            self.add_statement(stmt)
        self.extra_referents.update(other.extra_referents)
        return self

    def to_dict(self) -> Dict[str, Any]:
        """Serialise to the plain (values-only) dictionary format."""
        data: Dict[str, Any] = {
            "id": self.id,
            "caption": self.caption,
            "schema": self.schema.name,
            "properties": self.properties,
            "referents": list(self.referents),
            "datasets": list(self.datasets),
        }
        if self.first_seen is not None:
            data["first_seen"] = self.first_seen
        if self.last_seen is not None:
            data["last_seen"] = self.last_seen
        if self.last_change is not None:
            data["last_change"] = self.last_change
        return data

    def to_statement_dict(self) -> Dict[str, Any]:
        """Return a dictionary representation of the entity's statements."""
        data: Dict[str, Any] = {
            "id": self.id,
            "caption": self.caption,
            "schema": self.schema.name,
            "statements": [stmt.to_dict() for stmt in self.statements],
            "referents": list(self.referents),
            "datasets": list(self.datasets),
        }
        if self.first_seen is not None:
            data["first_seen"] = self.first_seen
        if self.last_seen is not None:
            data["last_seen"] = self.last_seen
        if self.last_change is not None:
            data["last_change"] = self.last_change
        return data

    def __len__(self) -> int:
        # +1 accounts for the synthetic BASE_ID checksum statement emitted by
        # the `statements` property.
        return len(list(self._iter_stmt())) + 1

    @classmethod
    def from_dict(
        cls: Type[SE],
        data: Dict[str, Any],
        cleaned: bool = True,
        default_dataset: Optional[Dataset] = None,
    ) -> SE:
        # Exists only for backwards compatibility.
        dataset = default_dataset or DefaultDataset
        return cls(dataset, data, cleaned=cleaned)

    @classmethod
    def from_data(
        cls: Type[SE],
        dataset: Dataset,
        data: Dict[str, Any],
        cleaned: bool = True,
    ) -> SE:
        """Alternate constructor with an explicit dataset."""
        return cls(dataset, data, cleaned=cleaned)

    @classmethod
    def from_statements(
        cls: Type[SE],
        dataset: Dataset,
        statements: Iterable[Statement],
    ) -> SE:
        """Assemble an entity from a non-empty sequence of statements.

        Schema and ID are taken from the first statement; the schema is
        widened by `add_statement` as further statements arrive.

        :raises ValueError: if the iterable yields no statements.
        """
        obj: Optional[SE] = None
        for stmt in statements:
            if obj is None:
                data = {"schema": stmt.schema, "id": stmt.canonical_id}
                obj = cls(dataset, data)
            obj.add_statement(stmt)
        if obj is None:
            raise ValueError("No statements given!")
        return obj
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
import csv
|
|
2
|
+
import click
|
|
3
|
+
import orjson
|
|
4
|
+
import logging
|
|
5
|
+
from io import TextIOWrapper
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from types import TracebackType
|
|
8
|
+
from typing import cast
|
|
9
|
+
from typing import BinaryIO, Generator, Iterable, List, Optional, TextIO, Type
|
|
10
|
+
from rigour.boolean import text_bool
|
|
11
|
+
|
|
12
|
+
from followthemoney.statement.statement import Statement, StatementDict
|
|
13
|
+
from followthemoney.statement.util import unpack_prop
|
|
14
|
+
|
|
15
|
+
log = logging.getLogger(__name__)

# Supported statement serialisation formats:
JSON = "json"
CSV = "csv"
PACK = "pack"
FORMATS = [JSON, CSV, PACK]

# Number of rows buffered by the CSV/pack writers before a bulk writerows():
CSV_BATCH = 5000
# Column order of the full CSV statement format (with header row):
CSV_COLUMNS = [
    "canonical_id",
    "entity_id",
    "prop",
    "prop_type",
    "schema",
    "value",
    "dataset",
    "origin",
    "lang",
    "original_value",
    "external",
    "first_seen",
    "last_seen",
    "id",
]
# Column order assumed for header-less legacy "pack" files. Note: no
# "origin" or "id" columns, and a "target" column the current format dropped.
LEGACY_PACK_COLUMNS = [
    "entity_id",
    "prop",
    "value",
    "dataset",
    "lang",
    "original_value",
    "target",
    "external",
    "first_seen",
    "last_seen",
]
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def read_json_statements(
    fh: BinaryIO,
    max_line: int = 40 * 1024 * 1024,
) -> Generator[Statement, None, None]:
    """Read newline-delimited JSON statements from a binary stream.

    :param fh: binary stream of one JSON object per line.
    :param max_line: maximum number of bytes read for a single line.
    """
    # Iterate until readline() returns the empty-bytes EOF sentinel.
    for raw in iter(lambda: fh.readline(max_line), b""):
        yield Statement.from_dict(orjson.loads(raw))
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def read_csv_statements(fh: BinaryIO) -> Generator[Statement, None, None]:
    """Read statements from a CSV stream with a header row (see CSV_COLUMNS)."""
    wrapped = TextIOWrapper(fh, encoding="utf-8")
    for row in csv.DictReader(wrapped, dialect=csv.unix_dialect):
        data = cast(StatementDict, row)
        # CSV has no nulls: coerce the boolean column and map empty strings
        # back to None for the optional fields.
        data["external"] = text_bool(row.get("external")) or False
        if row.get("lang") == "":
            data["lang"] = None
        if row.get("original_value") == "":
            data["original_value"] = None
        yield Statement.from_dict(data)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def read_pack_statements(fh: BinaryIO) -> Generator[Statement, None, None]:
    """Decode a binary stream as UTF-8 and read "pack" CSV statements from it."""
    text_stream = TextIOWrapper(fh, encoding="utf-8")
    yield from read_pack_statements_decoded(text_stream)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def read_pack_statements_decoded(fh: TextIO) -> Generator[Statement, None, None]:
    """Read statements from a decoded "pack" CSV stream.

    The first row is inspected to decide between the current headered format
    and the header-less legacy column layout (LEGACY_PACK_COLUMNS). Rows with
    an invalid packed property are logged and skipped.
    """
    headers: Optional[List[str]] = None
    for row in csv.reader(fh, dialect=csv.unix_dialect):
        if headers is None:
            if "entity_id" in row and "prop" in row:
                headers = row
            else:
                # This is a legacy pack file, with no headers.
                headers = LEGACY_PACK_COLUMNS
            # NOTE(review): this also skips the first row of a legacy file,
            # which (having no header) is a data row — confirm against the
            # upstream intent.
            continue
        data = dict(zip(headers, row))
        try:
            # The pack format folds the schema into the prop column
            # ("Schema:prop"); unpack_prop splits it apart.
            schema, _, prop = unpack_prop(data["prop"])
        except TypeError:
            log.error("Invalid property in pack statement: %s" % data["prop"])
            continue
        yield Statement(
            entity_id=data["entity_id"],
            prop=prop,
            schema=schema,
            value=data["value"],
            dataset=data["dataset"],
            lang=data["lang"] or None,
            original_value=data["original_value"] or None,
            origin=data.get("origin"),
            first_seen=data["first_seen"],
            external=data["external"] == "t",
            canonical_id=data["entity_id"],
            last_seen=data["last_seen"],
            id=data.get("id"),
        )
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def read_statements(fh: BinaryIO, format: str) -> Generator[Statement, None, None]:
    """Read statements from *fh* in the given format (JSON when unrecognised)."""
    readers = {CSV: read_csv_statements, PACK: read_pack_statements}
    reader = readers.get(format, read_json_statements)
    yield from reader(fh)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def read_path_statements(path: Path, format: str) -> Generator[Statement, None, None]:
    """Read statements from a file path; "-" denotes standard input."""
    if str(path) != "-":
        with open(path, "rb") as handle:
            yield from read_statements(handle, format=format)
        return
    stdin = click.get_binary_stream("stdin")
    yield from read_statements(stdin, format=format)
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def get_statement_writer(fh: BinaryIO, format: str) -> "StatementWriter":
    """Construct the writer implementation matching *format*.

    :raises RuntimeError: for an unrecognised format name.
    """
    if format == JSON:
        return JSONStatementWriter(fh)
    if format in (CSV, PACK):
        # Both text-based writers expect a decoded stream.
        text_fh = TextIOWrapper(fh, encoding="utf-8")
        if format == CSV:
            return CSVStatementWriter(text_fh)
        return PackStatementWriter(text_fh)
    raise RuntimeError("Unknown statement format: %s" % format)
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def write_statements(
    fh: BinaryIO, format: str, statements: Iterable[Statement]
) -> None:
    """Write all *statements* to *fh* in the given format.

    Fix: the writer is used as a context manager so that close() — which
    flushes any buffered batch and closes the stream — runs even when the
    statements iterable raises mid-way. Previously an exception would leak
    the writer and silently drop buffered rows.
    """
    with get_statement_writer(fh, format) as writer:
        for stmt in statements:
            writer.write(stmt)
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
class StatementWriter(object):
    """Abstract base class for statement serialisers.

    Subclasses implement ``write`` and ``close``; instances can be used as
    context managers, which guarantees ``close`` on exit.
    """

    def write(self, stmt: Statement) -> None:
        """Serialise a single statement to the underlying stream."""
        raise NotImplementedError()

    def close(self) -> None:
        """Flush any buffered output and close the underlying stream."""
        raise NotImplementedError()

    def __enter__(self) -> "StatementWriter":
        return self

    def __exit__(
        self,
        type: Optional[Type[BaseException]],
        value: Optional[BaseException],
        traceback: Optional[TracebackType],
    ) -> None:
        self.close()
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
class JSONStatementWriter(StatementWriter):
    """Emit statements as newline-delimited JSON to a binary stream."""

    def __init__(self, fh: BinaryIO) -> None:
        self.fh = fh

    def write(self, stmt: Statement) -> None:
        """Serialise one statement as a single JSON line."""
        encoded = orjson.dumps(stmt.to_dict(), option=orjson.OPT_APPEND_NEWLINE)
        self.fh.write(encoded)

    def close(self) -> None:
        self.fh.close()
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
class CSVStatementWriter(StatementWriter):
    """Write statements as CSV rows (CSV_COLUMNS order), batching output."""

    def __init__(self, fh: TextIO) -> None:
        self.fh = fh
        self.writer = csv.writer(self.fh, dialect=csv.unix_dialect)
        # Emit the header row immediately so even an empty output is valid.
        self.writer.writerow(CSV_COLUMNS)
        self._batch: List[List[Optional[str]]] = []

    def write(self, stmt: Statement) -> None:
        """Buffer one statement; flush when the batch is full."""
        row_data = stmt.to_csv_row()
        self._batch.append([row_data[column] for column in CSV_COLUMNS])
        if len(self._batch) < CSV_BATCH:
            return
        self.writer.writerows(self._batch)
        self._batch.clear()

    def close(self) -> None:
        """Flush the remaining batch and close the stream."""
        if self._batch:
            self.writer.writerows(self._batch)
        self.fh.close()
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
class PackStatementWriter(StatementWriter):
    """Write statements in the compact "pack" CSV format.

    The schema is folded into the property column as ``Schema:prop`` and the
    canonical_id/prop_type columns are omitted; rows are batched before being
    written out.
    """

    # Header row of the pack format (written once on construction).
    HEADERS = [
        "entity_id",
        "prop",
        "value",
        "dataset",
        "lang",
        "original_value",
        "origin",
        "external",
        "first_seen",
        "last_seen",
        "id",
    ]

    def __init__(self, fh: TextIO) -> None:
        self.fh = fh
        self.writer = csv.writer(
            self.fh,
            dialect=csv.unix_dialect,
            quoting=csv.QUOTE_MINIMAL,
        )
        self.writer.writerow(self.HEADERS)
        self._batch: List[List[Optional[str]]] = []

    def write(self, stmt: Statement) -> None:
        """Buffer one statement; flush when the batch is full."""
        # HACK: deliberately mirrors CSVStatementWriter inline — this is the
        # innermost loop of serialisation, so no shared helper is used.
        packed = [
            stmt.entity_id,
            f"{stmt.schema}:{stmt.prop}",
            stmt.value,
            stmt.dataset,
            stmt.lang,
            stmt.original_value,
            stmt.origin,
            "t" if stmt.external else None,
            stmt.first_seen,
            stmt.last_seen,
            stmt.id,
        ]
        self._batch.append(packed)
        if len(self._batch) >= CSV_BATCH:
            self.writer.writerows(self._batch)
            self._batch.clear()

    def close(self) -> None:
        """Flush the remaining batch and close the stream."""
        if self._batch:
            self.writer.writerows(self._batch)
        self.fh.close()
|