followthemoney 3.8.5__py3-none-any.whl → 4.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. followthemoney/__init__.py +30 -10
  2. followthemoney/cli/cli.py +1 -1
  3. followthemoney/cli/exports.py +6 -2
  4. followthemoney/cli/statement.py +62 -0
  5. followthemoney/cli/util.py +2 -3
  6. followthemoney/compare.py +26 -16
  7. followthemoney/dataset/__init__.py +17 -0
  8. followthemoney/dataset/catalog.py +77 -0
  9. followthemoney/dataset/coverage.py +29 -0
  10. followthemoney/dataset/dataset.py +146 -0
  11. followthemoney/dataset/publisher.py +25 -0
  12. followthemoney/dataset/resource.py +30 -0
  13. followthemoney/dataset/util.py +55 -0
  14. followthemoney/entity.py +73 -0
  15. followthemoney/exc.py +6 -0
  16. followthemoney/export/rdf.py +57 -5
  17. followthemoney/graph.py +1 -2
  18. followthemoney/model.py +38 -11
  19. followthemoney/names.py +33 -0
  20. followthemoney/ontology.py +18 -16
  21. followthemoney/property.py +12 -15
  22. followthemoney/proxy.py +43 -64
  23. followthemoney/schema/Analyzable.yaml +2 -3
  24. followthemoney/schema/BankAccount.yaml +2 -3
  25. followthemoney/schema/Company.yaml +0 -6
  26. followthemoney/schema/Contract.yaml +0 -1
  27. followthemoney/schema/CryptoWallet.yaml +1 -1
  28. followthemoney/schema/Document.yaml +0 -6
  29. followthemoney/schema/Interval.yaml +7 -0
  30. followthemoney/schema/LegalEntity.yaml +6 -0
  31. followthemoney/schema/License.yaml +2 -0
  32. followthemoney/schema/Page.yaml +0 -1
  33. followthemoney/schema/Person.yaml +0 -5
  34. followthemoney/schema/Sanction.yaml +1 -0
  35. followthemoney/schema/Thing.yaml +0 -2
  36. followthemoney/schema/UserAccount.yaml +6 -3
  37. followthemoney/schema.py +30 -42
  38. followthemoney/statement/__init__.py +19 -0
  39. followthemoney/statement/entity.py +438 -0
  40. followthemoney/statement/serialize.py +251 -0
  41. followthemoney/statement/statement.py +256 -0
  42. followthemoney/statement/util.py +31 -0
  43. followthemoney/types/__init__.py +66 -23
  44. followthemoney/types/address.py +3 -3
  45. followthemoney/types/checksum.py +3 -7
  46. followthemoney/types/common.py +9 -14
  47. followthemoney/types/country.py +3 -7
  48. followthemoney/types/date.py +21 -11
  49. followthemoney/types/email.py +0 -4
  50. followthemoney/types/entity.py +5 -11
  51. followthemoney/types/gender.py +6 -10
  52. followthemoney/types/identifier.py +9 -3
  53. followthemoney/types/ip.py +5 -9
  54. followthemoney/types/json.py +2 -2
  55. followthemoney/types/language.py +3 -7
  56. followthemoney/types/mimetype.py +4 -8
  57. followthemoney/types/name.py +7 -8
  58. followthemoney/types/number.py +88 -6
  59. followthemoney/types/phone.py +4 -11
  60. followthemoney/types/string.py +4 -4
  61. followthemoney/types/topic.py +3 -7
  62. followthemoney/types/url.py +5 -10
  63. followthemoney/util.py +12 -13
  64. followthemoney/value.py +67 -0
  65. {followthemoney-3.8.5.dist-info → followthemoney-4.0.1.dist-info}/METADATA +23 -8
  66. {followthemoney-3.8.5.dist-info → followthemoney-4.0.1.dist-info}/RECORD +69 -59
  67. {followthemoney-3.8.5.dist-info → followthemoney-4.0.1.dist-info}/entry_points.txt +1 -0
  68. followthemoney/offshore.py +0 -48
  69. followthemoney/rdf.py +0 -9
  70. followthemoney/schema/Assessment.yaml +0 -32
  71. followthemoney/schema/Post.yaml +0 -42
  72. followthemoney/types/iban.py +0 -58
  73. followthemoney/types/registry.py +0 -52
  74. {followthemoney-3.8.5.dist-info → followthemoney-4.0.1.dist-info}/WHEEL +0 -0
  75. {followthemoney-3.8.5.dist-info → followthemoney-4.0.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,438 @@
1
+ from hashlib import sha1
2
+ from collections.abc import Mapping
3
+ from typing import Any, Dict, List, Optional, Set, Type
4
+ from typing import Generator, Iterable, Tuple, TypeVar
5
+
6
+ from followthemoney.model import Model
7
+ from followthemoney.exc import InvalidData
8
+ from followthemoney.types.common import PropertyType
9
+ from followthemoney.property import Property
10
+ from followthemoney.util import gettext
11
+ from followthemoney.proxy import P
12
+ from followthemoney.types import registry
13
+ from followthemoney.value import string_list, Values
14
+ from followthemoney.proxy import EntityProxy
15
+ from followthemoney.dataset import Dataset, DefaultDataset
16
+ from followthemoney.statement.statement import Statement
17
+ from followthemoney.statement.util import BASE_ID
18
+
19
+ SE = TypeVar("SE", bound="StatementEntity")
20
+
21
+
22
class StatementEntity(EntityProxy):
    """An entity object that can link to a set of datasets that it is sourced from.

    Unlike a plain ``EntityProxy``, property values are stored as
    :class:`Statement` objects, so each value carries provenance
    (dataset, language, first/last seen, origin).
    """

    __slots__ = (
        "schema",
        "id",
        "_caption",
        "extra_referents",
        "dataset",
        "last_change",
        "_statements",
    )

    def __init__(self, dataset: Dataset, data: Dict[str, Any], cleaned: bool = True):
        """Build an entity from a dict carrying ``schema``, ``id`` and either
        plain ``properties`` or serialised ``statements``.

        Raises ``InvalidData`` when no valid schema name is given.
        """
        data = dict(data or {})
        schema = Model.instance().get(data.pop("schema", None))
        if schema is None:
            raise InvalidData(gettext("No schema for entity."))
        self.schema = schema

        self._caption: Optional[str] = None
        """A pre-computed label for this entity."""

        self.extra_referents: Set[str] = set(data.pop("referents", []))
        """The IDs of all entities which are included in this canonical entity."""

        # NOTE(review): read with .get() rather than .pop(), unlike the other
        # keys — confirm that leaving "last_change" inside `data` is intended.
        self.last_change: Optional[str] = data.get("last_change", None)
        """The last time this entity was changed."""

        self.dataset = dataset
        """The default dataset for new statements."""

        self.id: Optional[str] = data.pop("id", None)
        # Statements for this entity, keyed by property name.
        self._statements: Dict[str, Set[Statement]] = {}

        properties = data.pop("properties", None)
        if isinstance(properties, Mapping):
            for key, value in properties.items():
                self.add(key, value, cleaned=cleaned, quiet=True)

        for stmt_data in data.pop("statements", []):
            stmt = Statement.from_dict(stmt_data)
            if self.id is not None:
                stmt.canonical_id = self.id
            self.add_statement(stmt)

    @property
    def _properties(self) -> Dict[str, List[str]]:  # type: ignore
        # Compatibility shim for the EntityProxy API: flatten statements into
        # a plain property -> values mapping (rebuilt on every access).
        return {p: [s.value for s in v] for p, v in self._statements.items()}

    def _iter_stmt(self) -> Generator[Statement, None, None]:
        """Iterate all stored statements, lazily filling in missing
        ``entity_id`` and ``id`` fields in place as they are yielded."""
        for stmts in self._statements.values():
            for stmt in stmts:
                if stmt.entity_id is None and self.id is not None:
                    stmt.entity_id = self.id
                    stmt.id = stmt.generate_key()
                if stmt.id is None:
                    stmt.id = stmt.generate_key()
                yield stmt

    @property
    def statements(self) -> Generator[Statement, None, None]:
        """Return all statements for this entity, with extra ID statement."""
        ids: List[str] = []
        last_seen: Set[str] = set()
        first_seen: Set[str] = set()
        for stmt in self._iter_stmt():
            yield stmt
            if stmt.id is not None:
                ids.append(stmt.id)
            if stmt.last_seen is not None:
                last_seen.add(stmt.last_seen)
            if stmt.first_seen is not None:
                first_seen.add(stmt.first_seen)
        if self.id is not None:
            # The trailing BASE_ID statement carries a checksum over the
            # schema name and all (sorted) statement IDs.
            digest = sha1(self.schema.name.encode("utf-8"))
            for id in sorted(ids):
                digest.update(id.encode("utf-8"))
            checksum = digest.hexdigest()
            # This is to make the last_change value stable across
            # serialisation:
            first = self.last_change or min(first_seen, default=None)
            yield Statement(
                canonical_id=self.id,
                entity_id=self.id,
                prop=BASE_ID,
                schema=self.schema.name,
                value=checksum,
                dataset=self.dataset.name,
                first_seen=first,
                last_seen=max(last_seen, default=None),
            )

    @property
    def first_seen(self) -> Optional[str]:
        """Earliest ``first_seen`` across all statements, if any."""
        seen = (s.first_seen for s in self._iter_stmt() if s.first_seen is not None)
        return min(seen, default=None)

    @property
    def last_seen(self) -> Optional[str]:
        """Latest ``last_seen`` across all statements, if any."""
        seen = (s.last_seen for s in self._iter_stmt() if s.last_seen is not None)
        return max(seen, default=None)

    @property
    def datasets(self) -> Set[str]:
        """Names of all datasets that contributed statements."""
        datasets: Set[str] = set()
        for stmt in self._iter_stmt():
            datasets.add(stmt.dataset)
        return datasets

    @property
    def referents(self) -> Set[str]:
        """All entity IDs merged into this canonical entity (statement
        entity IDs other than this entity's own, plus extra referents)."""
        referents: Set[str] = set(self.extra_referents)
        for stmt in self._iter_stmt():
            if stmt.entity_id is not None and stmt.entity_id != self.id:
                referents.add(stmt.entity_id)
        return referents

    @property
    def key_prefix(self) -> Optional[str]:
        """The dataset name is used as the ID key prefix; it cannot be set."""
        return self.dataset.name

    @key_prefix.setter
    def key_prefix(self, dataset: Optional[str]) -> None:
        raise NotImplementedError()

    def add_statement(self, stmt: Statement) -> None:
        """Add a raw statement, widening the schema to a common ancestor if
        needed. BASE_ID statements only update ``last_change`` and are not
        stored as property values."""
        schema = self.schema
        if schema.name != stmt.schema and not schema.is_a(stmt.schema):
            try:
                self.schema = schema.model.common_schema(schema, stmt.schema)
            except InvalidData as exc:
                raise InvalidData(f"{self.id}: {exc}") from exc

        if stmt.prop == BASE_ID:
            if stmt.first_seen is not None:
                # The last_change attribute describes the latest checksum change
                # of any emitted component of the entity, which is stored in the BASE
                # field.
                if self.last_change is None:
                    self.last_change = stmt.first_seen
                else:
                    self.last_change = max(self.last_change, stmt.first_seen)
        else:
            if stmt.prop not in self._statements:
                self._statements[stmt.prop] = set()
            self._statements[stmt.prop].add(stmt)

    def get(self, prop: P, quiet: bool = False) -> List[str]:
        """Return the distinct values for a property."""
        prop_name = self._prop_name(prop, quiet=quiet)
        if prop_name is None or prop_name not in self._statements:
            return []
        return list({s.value for s in self._statements[prop_name]})

    def get_statements(self, prop: P, quiet: bool = False) -> List[Statement]:
        """Return the raw statements for a property."""
        prop_name = self._prop_name(prop, quiet=quiet)
        if prop_name is None or prop_name not in self._statements:
            return []
        return list(self._statements[prop_name])

    def set(
        self,
        prop: P,
        values: Values,
        cleaned: bool = False,
        quiet: bool = False,
        fuzzy: bool = False,
        format: Optional[str] = None,
        lang: Optional[str] = None,
        original_value: Optional[str] = None,
        origin: Optional[str] = None,
    ) -> None:
        """Replace all existing statements for a property with new values."""
        prop_name = self._prop_name(prop, quiet=quiet)
        if prop_name is None:
            return
        self._statements.pop(prop_name, None)
        return self.add(
            prop,
            values,
            cleaned=cleaned,
            quiet=quiet,
            fuzzy=fuzzy,
            format=format,
            lang=lang,
            original_value=original_value,
            origin=origin,
        )

    def add(
        self,
        prop: P,
        values: Values,
        cleaned: bool = False,
        quiet: bool = False,
        fuzzy: bool = False,
        format: Optional[str] = None,
        lang: Optional[str] = None,
        original_value: Optional[str] = None,
        origin: Optional[str] = None,
    ) -> None:
        """Add one or more values for a property, creating a statement for
        each. Values are sanitised unless ``cleaned`` is True."""
        prop_name = self._prop_name(prop, quiet=quiet)
        if prop_name is None:
            return None
        prop = self.schema.properties[prop_name]
        for value in string_list(values, sanitize=not cleaned):
            self.unsafe_add(
                prop,
                value,
                cleaned=cleaned,
                fuzzy=fuzzy,
                format=format,
                quiet=quiet,
                lang=lang,
                original_value=original_value,
                origin=origin,
            )
        return None

    def unsafe_add(
        self,
        prop: Property,
        value: Optional[str],
        cleaned: bool = False,
        fuzzy: bool = False,
        format: Optional[str] = None,
        quiet: bool = False,
        schema: Optional[str] = None,
        dataset: Optional[str] = None,
        seen: Optional[str] = None,
        lang: Optional[str] = None,
        original_value: Optional[str] = None,
        origin: Optional[str] = None,
    ) -> Optional[str]:
        """Add a single pre-resolved property value as a statement.

        Returns the cleaned value that was stored, or ``None`` if the value
        was empty, failed cleaning, or the property is a stub (in quiet mode).
        Raises ``InvalidData`` for stub properties (when not quiet) or when
        the entity has no ID yet.
        """
        if value is None or len(value) == 0:
            return None

        # Don't allow setting the reverse properties:
        if prop.stub:
            if quiet:
                return None
            msg = gettext("Stub property (%s): %s")
            raise InvalidData(msg % (self.schema, prop))

        if lang is not None:
            lang = registry.language.clean_text(lang)

        clean: Optional[str] = value
        if not cleaned:
            clean = prop.type.clean_text(value, proxy=self, fuzzy=fuzzy, format=format)

        if clean is None:
            return None

        # Preserve the raw input when cleaning altered it.
        if original_value is None and clean != value:
            original_value = value

        if self.id is None:
            raise InvalidData("Cannot add statement to entity without ID!")
        stmt = Statement(
            entity_id=self.id,
            prop=prop.name,
            schema=schema or self.schema.name,
            value=clean,
            dataset=dataset or self.dataset.name,
            lang=lang,
            original_value=original_value,
            first_seen=seen,
            origin=origin,
        )
        self.add_statement(stmt)
        return clean

    def pop(self, prop: P, quiet: bool = True) -> List[str]:
        """Remove a property entirely and return its distinct values."""
        prop_name = self._prop_name(prop, quiet=quiet)
        if prop_name is None or prop_name not in self._statements:
            return []
        return list({s.value for s in self._statements.pop(prop_name, [])})

    def remove(self, prop: P, value: str, quiet: bool = True) -> None:
        """Drop all statements for a property that carry the given value."""
        prop_name = self._prop_name(prop, quiet=quiet)
        # NOTE(review): membership is checked against the derived
        # `_properties` dict (rebuilt on each access) although the same keys
        # exist on `_statements` — confirm this indirection is intentional.
        if prop_name is not None and prop_name in self._properties:
            stmts = {s for s in self._statements[prop_name] if s.value != value}
            self._statements[prop_name] = stmts

    def itervalues(self) -> Generator[Tuple[Property, str], None, None]:
        """Yield (property, value) pairs with values de-duplicated."""
        for name, statements in self._statements.items():
            prop = self.schema.properties[name]
            for value in set((s.value for s in statements)):
                yield (prop, value)

    def get_type_values(
        self, type_: PropertyType, matchable: bool = False
    ) -> List[str]:
        """Distinct values across all properties of the given type."""
        combined: Set[str] = set()
        for stmt in self.get_type_statements(type_, matchable=matchable):
            combined.add(stmt.value)
        return list(combined)

    def get_type_statements(
        self, type_: PropertyType, matchable: bool = False
    ) -> List[Statement]:
        """All statements whose property has the given type, optionally
        restricted to matchable properties."""
        combined = []
        for prop_name, statements in self._statements.items():
            prop = self.schema.properties[prop_name]
            if matchable and not prop.matchable:
                continue
            if prop.type == type_:
                for statement in statements:
                    combined.append(statement)
        return combined

    @property
    def properties(self) -> Dict[str, List[str]]:
        """Flattened mapping of property name to distinct values."""
        return {p: list({s.value for s in vs}) for p, vs in self._statements.items()}

    def iterprops(self) -> List[Property]:
        """Return the Property objects for which statements exist."""
        return [self.schema.properties[p] for p in self._statements.keys()]

    def clone(self: SE) -> SE:
        """Create a copy of this entity sharing the same statements."""
        data = {"schema": self.schema.name, "id": self.id}
        cloned = type(self)(self.dataset, data)
        for stmt in self._iter_stmt():
            cloned.add_statement(stmt)
        return cloned

    def merge(self: SE, other: EntityProxy) -> SE:
        """Fold another entity's properties/statements into this one,
        widening the schema to a common ancestor. Raises ``InvalidData``
        when the schemata cannot be reconciled."""
        try:
            self.schema = self.schema.model.common_schema(self.schema, other.schema)
        except InvalidData as e:
            msg = "Cannot merge entities with id %s: %s"
            raise InvalidData(msg % (self.id, e))

        # Plain EntityProxy objects have no statements, only values:
        if not isinstance(other, StatementEntity):
            for prop, values in other._properties.items():
                self.add(prop, values, cleaned=True, quiet=True)
            return self
        for stmt in other._iter_stmt():
            if self.id is not None:
                stmt.canonical_id = self.id
            self.add_statement(stmt)
        self.extra_referents.update(other.extra_referents)
        return self

    def to_dict(self) -> Dict[str, Any]:
        """Serialise the entity with flattened property values."""
        data: Dict[str, Any] = {
            "id": self.id,
            "caption": self.caption,
            "schema": self.schema.name,
            "properties": self.properties,
            "referents": list(self.referents),
            "datasets": list(self.datasets),
        }
        if self.first_seen is not None:
            data["first_seen"] = self.first_seen
        if self.last_seen is not None:
            data["last_seen"] = self.last_seen
        if self.last_change is not None:
            data["last_change"] = self.last_change
        return data

    def to_statement_dict(self) -> Dict[str, Any]:
        """Return a dictionary representation of the entity's statements."""
        data: Dict[str, Any] = {
            "id": self.id,
            "caption": self.caption,
            "schema": self.schema.name,
            "statements": [stmt.to_dict() for stmt in self.statements],
            "referents": list(self.referents),
            "datasets": list(self.datasets),
        }
        if self.first_seen is not None:
            data["first_seen"] = self.first_seen
        if self.last_seen is not None:
            data["last_seen"] = self.last_seen
        if self.last_change is not None:
            data["last_change"] = self.last_change
        return data

    def __len__(self) -> int:
        # +1 accounts for the synthetic BASE_ID checksum statement.
        return len(list(self._iter_stmt())) + 1

    @classmethod
    def from_dict(
        cls: Type[SE],
        data: Dict[str, Any],
        cleaned: bool = True,
        default_dataset: Optional[Dataset] = None,
    ) -> SE:
        # Exists only for backwards compatibility.
        dataset = default_dataset or DefaultDataset
        return cls(dataset, data, cleaned=cleaned)

    @classmethod
    def from_data(
        cls: Type[SE],
        dataset: Dataset,
        data: Dict[str, Any],
        cleaned: bool = True,
    ) -> SE:
        """Alternate constructor making the dataset explicit."""
        return cls(dataset, data, cleaned=cleaned)

    @classmethod
    def from_statements(
        cls: Type[SE],
        dataset: Dataset,
        statements: Iterable[Statement],
    ) -> SE:
        """Assemble an entity from statements; schema and ID come from the
        first statement. Raises ``ValueError`` on an empty iterable."""
        obj: Optional[SE] = None
        for stmt in statements:
            if obj is None:
                data = {"schema": stmt.schema, "id": stmt.canonical_id}
                obj = cls(dataset, data)
            obj.add_statement(stmt)
        if obj is None:
            raise ValueError("No statements given!")
        return obj
@@ -0,0 +1,251 @@
1
+ import csv
2
+ import click
3
+ import orjson
4
+ import logging
5
+ from io import TextIOWrapper
6
+ from pathlib import Path
7
+ from types import TracebackType
8
+ from typing import cast
9
+ from typing import BinaryIO, Generator, Iterable, List, Optional, TextIO, Type
10
+ from rigour.boolean import text_bool
11
+
12
+ from followthemoney.statement.statement import Statement, StatementDict
13
+ from followthemoney.statement.util import unpack_prop
14
+
15
log = logging.getLogger(__name__)

# Supported statement serialisation formats.
JSON = "json"
CSV = "csv"
PACK = "pack"
FORMATS = [JSON, CSV, PACK]

# Number of rows buffered before the CSV/pack writers flush in one batch.
CSV_BATCH = 5000
# Column order of the full CSV statement layout (header row is written).
CSV_COLUMNS = [
    "canonical_id",
    "entity_id",
    "prop",
    "prop_type",
    "schema",
    "value",
    "dataset",
    "origin",
    "lang",
    "original_value",
    "external",
    "first_seen",
    "last_seen",
    "id",
]
# Column order assumed for header-less "pack" files from older versions.
LEGACY_PACK_COLUMNS = [
    "entity_id",
    "prop",
    "value",
    "dataset",
    "lang",
    "original_value",
    "target",
    "external",
    "first_seen",
    "last_seen",
]
51
+
52
+
53
def read_json_statements(
    fh: BinaryIO,
    max_line: int = 40 * 1024 * 1024,
) -> Generator[Statement, None, None]:
    """Parse newline-delimited JSON statements from a binary stream.

    Lines longer than ``max_line`` bytes are truncated by ``readline``.
    """
    while True:
        line = fh.readline(max_line)
        if not line:
            break
        yield Statement.from_dict(orjson.loads(line))
60
+
61
+
62
def read_csv_statements(fh: BinaryIO) -> Generator[Statement, None, None]:
    """Read statements from a CSV stream with a header row.

    The ``external`` flag is parsed as a boolean; empty ``lang`` and
    ``original_value`` fields are normalised to ``None``.
    """
    reader = csv.DictReader(
        TextIOWrapper(fh, encoding="utf-8"), dialect=csv.unix_dialect
    )
    for row in reader:
        record = cast(StatementDict, row)
        record["external"] = text_bool(row.get("external")) or False
        if row.get("lang") == "":
            record["lang"] = None
        if row.get("original_value") == "":
            record["original_value"] = None
        yield Statement.from_dict(record)
72
+
73
+
74
def read_pack_statements(fh: BinaryIO) -> Generator[Statement, None, None]:
    """Decode the binary stream as UTF-8 and read packed statements from it."""
    decoded = TextIOWrapper(fh, encoding="utf-8")
    for stmt in read_pack_statements_decoded(decoded):
        yield stmt
77
+
78
+
79
def read_pack_statements_decoded(fh: TextIO) -> Generator[Statement, None, None]:
    """Read statements from a "pack" CSV text stream.

    The first row is sniffed: if it contains ``entity_id`` and ``prop`` it
    is taken as the header; otherwise the legacy, header-less column layout
    (``LEGACY_PACK_COLUMNS``) is assumed. Rows whose packed property cannot
    be unpacked are logged and skipped.
    """
    headers: Optional[List[str]] = None
    for row in csv.reader(fh, dialect=csv.unix_dialect):
        if headers is None:
            if "entity_id" in row and "prop" in row:
                headers = row
            else:
                # This is a legacy pack file, with no headers.
                headers = LEGACY_PACK_COLUMNS
            # NOTE(review): in the legacy branch this `continue` also skips
            # the first *data* row — confirm that is intended.
            continue
        data = dict(zip(headers, row))
        try:
            # The "prop" column packs "schema:prop" into one field.
            schema, _, prop = unpack_prop(data["prop"])
        except TypeError:
            log.error("Invalid property in pack statement: %s" % data["prop"])
            continue
        yield Statement(
            entity_id=data["entity_id"],
            prop=prop,
            schema=schema,
            value=data["value"],
            dataset=data["dataset"],
            lang=data["lang"] or None,
            original_value=data["original_value"] or None,
            origin=data.get("origin"),
            first_seen=data["first_seen"],
            external=data["external"] == "t",
            canonical_id=data["entity_id"],
            last_seen=data["last_seen"],
            id=data.get("id"),
        )
110
+
111
+
112
def read_statements(fh: BinaryIO, format: str) -> Generator[Statement, None, None]:
    """Dispatch to the reader for *format*; JSON is the fallback."""
    if format == PACK:
        yield from read_pack_statements(fh)
        return
    if format == CSV:
        yield from read_csv_statements(fh)
        return
    yield from read_json_statements(fh)
119
+
120
+
121
def read_path_statements(path: Path, format: str) -> Generator[Statement, None, None]:
    """Stream statements from a file path; ``-`` reads from stdin."""
    if str(path) == "-":
        stdin = click.get_binary_stream("stdin")
        yield from read_statements(stdin, format=format)
    else:
        with open(path, "rb") as handle:
            yield from read_statements(handle, format=format)
128
+
129
+
130
def get_statement_writer(fh: BinaryIO, format: str) -> "StatementWriter":
    """Construct the writer for *format*, wrapping text formats in UTF-8.

    Raises ``RuntimeError`` for an unknown format.
    """
    if format == JSON:
        return JSONStatementWriter(fh)
    if format == CSV:
        return CSVStatementWriter(TextIOWrapper(fh, encoding="utf-8"))
    if format == PACK:
        return PackStatementWriter(TextIOWrapper(fh, encoding="utf-8"))
    raise RuntimeError("Unknown statement format: %s" % format)
140
+
141
+
142
def write_statements(
    fh: BinaryIO, format: str, statements: Iterable[Statement]
) -> None:
    """Serialise *statements* to *fh* in the given *format*.

    Uses the writer as a context manager so that buffered rows are flushed
    and the underlying stream is closed even when iterating *statements*
    raises — the previous version leaked the writer on error.
    """
    with get_statement_writer(fh, format) as writer:
        for stmt in statements:
            writer.write(stmt)
149
+
150
+
151
class StatementWriter(object):
    """Abstract base class for statement serialisers.

    Subclasses must implement ``write`` and ``close``. Instances can be
    used as context managers; ``close()`` is invoked on exit.
    """

    def write(self, stmt: Statement) -> None:
        # Serialise a single statement; must be overridden.
        raise NotImplementedError()

    def close(self) -> None:
        # Flush pending output and close the stream; must be overridden.
        raise NotImplementedError()

    def __enter__(self) -> "StatementWriter":
        return self

    def __exit__(
        self,
        type: Optional[Type[BaseException]],
        value: Optional[BaseException],
        traceback: Optional[TracebackType],
    ) -> None:
        self.close()
169
+
170
class JSONStatementWriter(StatementWriter):
    """Write statements as newline-delimited JSON to a binary stream."""

    def __init__(self, fh: BinaryIO) -> None:
        self.fh = fh

    def write(self, stmt: Statement) -> None:
        """Serialise one statement as a JSON line."""
        serialized = orjson.dumps(stmt.to_dict(), option=orjson.OPT_APPEND_NEWLINE)
        self.fh.write(serialized)

    def close(self) -> None:
        """Close the underlying stream."""
        self.fh.close()
181
+
182
+
183
class CSVStatementWriter(StatementWriter):
    """Write statements as CSV rows, buffering output in batches."""

    def __init__(self, fh: TextIO) -> None:
        self.fh = fh
        self.writer = csv.writer(self.fh, dialect=csv.unix_dialect)
        self.writer.writerow(CSV_COLUMNS)
        # Rows accumulate here and are flushed CSV_BATCH at a time.
        self._batch: List[List[Optional[str]]] = []

    def write(self, stmt: Statement) -> None:
        """Buffer one statement row, flushing when the batch is full."""
        row = stmt.to_csv_row()
        self._batch.append([row[column] for column in CSV_COLUMNS])
        if len(self._batch) < CSV_BATCH:
            return
        self.writer.writerows(self._batch)
        self._batch.clear()

    def close(self) -> None:
        """Flush any buffered rows and close the stream."""
        if self._batch:
            self.writer.writerows(self._batch)
        self.fh.close()
201
+
202
+
203
class PackStatementWriter(StatementWriter):
    """Write statements in the compact "pack" CSV layout.

    Schema and property are fused into one ``schema:prop`` column and the
    ``external`` flag is shortened to ``"t"``/empty — the inverse of what
    ``read_pack_statements_decoded`` expects.
    """

    def __init__(self, fh: TextIO) -> None:
        self.fh = fh
        self.writer = csv.writer(
            self.fh,
            dialect=csv.unix_dialect,
            quoting=csv.QUOTE_MINIMAL,
        )
        columns = [
            "entity_id",
            "prop",
            "value",
            "dataset",
            "lang",
            "original_value",
            "origin",
            "external",
            "first_seen",
            "last_seen",
            "id",
        ]
        self.writer.writerow(columns)
        # Rows accumulate here and are flushed CSV_BATCH at a time.
        self._batch: List[List[Optional[str]]] = []

    def write(self, stmt: Statement) -> None:
        """Buffer one packed statement row, flushing full batches."""
        # HACK: This is very similar to the CSV writer, but at the very inner
        # loop of the application, so we're duplicating code here.
        row = [
            stmt.entity_id,
            f"{stmt.schema}:{stmt.prop}",
            stmt.value,
            stmt.dataset,
            stmt.lang,
            stmt.original_value,
            stmt.origin,
            "t" if stmt.external else None,
            stmt.first_seen,
            stmt.last_seen,
            stmt.id,
        ]
        self._batch.append(row)
        if len(self._batch) >= CSV_BATCH:
            self.writer.writerows(self._batch)
            self._batch.clear()

    def close(self) -> None:
        """Flush any buffered rows and close the stream."""
        if len(self._batch) > 0:
            self.writer.writerows(self._batch)
        self.fh.close()