followthemoney 3.8.4__py3-none-any.whl → 4.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- followthemoney/__init__.py +30 -10
- followthemoney/cli/__init__.py +3 -12
- followthemoney/cli/aggregate.py +1 -1
- followthemoney/cli/cli.py +1 -1
- followthemoney/cli/exports.py +6 -2
- followthemoney/cli/mapping.py +6 -4
- followthemoney/cli/sieve.py +1 -1
- followthemoney/cli/statement.py +62 -0
- followthemoney/cli/util.py +2 -3
- followthemoney/compare.py +26 -16
- followthemoney/dataset/__init__.py +17 -0
- followthemoney/dataset/catalog.py +77 -0
- followthemoney/dataset/coverage.py +29 -0
- followthemoney/dataset/dataset.py +137 -0
- followthemoney/dataset/publisher.py +25 -0
- followthemoney/dataset/resource.py +30 -0
- followthemoney/dataset/util.py +58 -0
- followthemoney/entity.py +73 -0
- followthemoney/exc.py +6 -0
- followthemoney/export/common.py +3 -3
- followthemoney/export/csv.py +10 -12
- followthemoney/export/neo4j.py +1 -1
- followthemoney/export/rdf.py +57 -5
- followthemoney/graph.py +6 -4
- followthemoney/mapping/csv.py +6 -18
- followthemoney/mapping/sql.py +3 -4
- followthemoney/model.py +36 -9
- followthemoney/namespace.py +3 -1
- followthemoney/ontology.py +18 -16
- followthemoney/property.py +12 -15
- followthemoney/proxy.py +44 -65
- followthemoney/schema/Analyzable.yaml +2 -3
- followthemoney/schema/BankAccount.yaml +2 -3
- followthemoney/schema/Company.yaml +0 -6
- followthemoney/schema/Contract.yaml +0 -1
- followthemoney/schema/CryptoWallet.yaml +1 -1
- followthemoney/schema/Document.yaml +0 -6
- followthemoney/schema/Interval.yaml +7 -0
- followthemoney/schema/LegalEntity.yaml +6 -0
- followthemoney/schema/License.yaml +2 -0
- followthemoney/schema/Page.yaml +0 -1
- followthemoney/schema/Person.yaml +0 -5
- followthemoney/schema/Sanction.yaml +1 -0
- followthemoney/schema/Thing.yaml +0 -2
- followthemoney/schema/UserAccount.yaml +6 -3
- followthemoney/schema.py +27 -39
- followthemoney/statement/__init__.py +19 -0
- followthemoney/statement/entity.py +437 -0
- followthemoney/statement/serialize.py +245 -0
- followthemoney/statement/statement.py +256 -0
- followthemoney/statement/util.py +31 -0
- followthemoney/types/__init__.py +66 -23
- followthemoney/types/address.py +3 -3
- followthemoney/types/checksum.py +3 -7
- followthemoney/types/common.py +9 -14
- followthemoney/types/country.py +3 -7
- followthemoney/types/date.py +21 -11
- followthemoney/types/email.py +0 -4
- followthemoney/types/entity.py +5 -11
- followthemoney/types/gender.py +6 -10
- followthemoney/types/identifier.py +9 -3
- followthemoney/types/ip.py +5 -9
- followthemoney/types/json.py +2 -2
- followthemoney/types/language.py +3 -7
- followthemoney/types/mimetype.py +4 -8
- followthemoney/types/name.py +7 -8
- followthemoney/types/number.py +88 -6
- followthemoney/types/phone.py +4 -11
- followthemoney/types/string.py +4 -4
- followthemoney/types/topic.py +3 -7
- followthemoney/types/url.py +5 -10
- followthemoney/util.py +12 -13
- followthemoney/value.py +67 -0
- {followthemoney-3.8.4.dist-info → followthemoney-4.0.0.dist-info}/METADATA +38 -34
- {followthemoney-3.8.4.dist-info → followthemoney-4.0.0.dist-info}/RECORD +78 -69
- {followthemoney-3.8.4.dist-info → followthemoney-4.0.0.dist-info}/entry_points.txt +1 -0
- {followthemoney-3.8.4.dist-info → followthemoney-4.0.0.dist-info}/licenses/LICENSE +1 -0
- followthemoney/offshore.py +0 -48
- followthemoney/rdf.py +0 -9
- followthemoney/schema/Assessment.yaml +0 -32
- followthemoney/schema/Post.yaml +0 -42
- followthemoney/types/iban.py +0 -58
- followthemoney/types/registry.py +0 -52
- {followthemoney-3.8.4.dist-info → followthemoney-4.0.0.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
import csv
|
|
2
|
+
import click
|
|
3
|
+
import orjson
|
|
4
|
+
from io import TextIOWrapper
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from types import TracebackType
|
|
7
|
+
from typing import cast
|
|
8
|
+
from typing import BinaryIO, Generator, Iterable, List, Optional, TextIO, Type
|
|
9
|
+
from rigour.boolean import text_bool
|
|
10
|
+
|
|
11
|
+
from followthemoney.statement.statement import Statement, StatementDict
|
|
12
|
+
from followthemoney.statement.util import unpack_prop
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Identifiers of the supported statement serialization formats.
JSON = "json"
CSV = "csv"
PACK = "pack"
FORMATS = [JSON, CSV, PACK]

# Number of rows the CSV and pack writers buffer before writing them out.
CSV_BATCH = 5000

# Column order of the CSV format; the CSV writer emits this as its header row.
CSV_COLUMNS = [
    "canonical_id", "entity_id", "prop", "prop_type", "schema",
    "value", "dataset", "origin", "lang", "original_value",
    "external", "first_seen", "last_seen", "id",
]

# Column order assumed for legacy pack files, which carry no header row.
LEGACY_PACK_COLUMNS = [
    "entity_id", "prop", "value", "dataset", "lang",
    "original_value", "target", "external", "first_seen", "last_seen",
]
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def read_json_statements(
    fh: BinaryIO,
    max_line: int = 40 * 1024 * 1024,
) -> Generator[Statement, None, None]:
    """Yield one statement per line of a line-delimited JSON stream.

    Each read is capped at ``max_line`` bytes via ``fh.readline(max_line)``;
    iteration ends at the first empty read. Every line is decoded with
    ``orjson.loads`` and turned into a statement with ``Statement.from_dict``.
    """
    while True:
        raw = fh.readline(max_line)
        if not raw:
            break
        yield Statement.from_dict(orjson.loads(raw))
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def read_csv_statements(fh: BinaryIO) -> Generator[Statement, None, None]:
    """Yield the statements stored in a UTF-8 encoded CSV stream with a header row.

    The ``external`` column is parsed into a boolean with ``text_bool``
    (falling back to ``False``), and empty ``lang`` / ``original_value``
    cells are converted to ``None`` before the row is handed to
    ``Statement.from_dict``.
    """
    # newline="" leaves line-ending handling to the csv module, as its
    # documentation requires; otherwise the text layer rewrites "\r" and
    # "\r\n" inside quoted values before the reader sees them.
    wrapped = TextIOWrapper(fh, encoding="utf-8", newline="")
    for row in csv.DictReader(wrapped, dialect=csv.unix_dialect):
        # The row dict is reused (not copied) as the statement payload.
        data = cast(StatementDict, row)
        data["external"] = text_bool(row.get("external")) or False
        if row.get("lang") == "":
            data["lang"] = None
        if row.get("original_value") == "":
            data["original_value"] = None
        yield Statement.from_dict(data)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def read_pack_statements(fh: BinaryIO) -> Generator[Statement, None, None]:
    """Yield the statements stored in a UTF-8 encoded binary pack stream.

    The stream is wrapped in a text layer and delegated to
    ``read_pack_statements_decoded``.
    """
    # newline="" disables newline translation so the csv reader receives
    # line breaks embedded in quoted values unchanged (see the csv module docs).
    wrapped = TextIOWrapper(fh, encoding="utf-8", newline="")
    yield from read_pack_statements_decoded(wrapped)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def read_pack_statements_decoded(fh: TextIO) -> Generator[Statement, None, None]:
    """Yield the statements stored in an already-decoded pack stream.

    The first row is treated as a header when it contains both ``entity_id``
    and ``prop``. Otherwise the file is taken to be a legacy pack file without
    headers, ``LEGACY_PACK_COLUMNS`` is assumed, and that first row is parsed
    as data. The ``prop`` column holds a packed ``schema:prop`` identifier,
    which is split with ``unpack_prop``.
    """
    headers: Optional[List[str]] = None
    for row in csv.reader(fh, dialect=csv.unix_dialect):
        if headers is None:
            if "entity_id" in row and "prop" in row:
                headers = row
                # Only a real header row is consumed without yielding.
                continue
            # This is a legacy pack file, with no headers: the current row is
            # already data and must not be skipped.
            headers = LEGACY_PACK_COLUMNS
        data = dict(zip(headers, row))
        schema, _, prop = unpack_prop(data["prop"])
        yield Statement(
            entity_id=data["entity_id"],
            prop=prop,
            schema=schema,
            value=data["value"],
            dataset=data["dataset"],
            # Empty cells stand for missing values.
            lang=data.get("lang") or None,
            original_value=data.get("original_value") or None,
            origin=data.get("origin"),
            first_seen=data["first_seen"],
            # The pack writer stores a true flag as "t" and leaves it empty otherwise.
            external=data["external"] == "t",
            # Pack files carry no canonical ID column; the entity ID is used.
            canonical_id=data["entity_id"],
            last_seen=data["last_seen"],
            id=data.get("id"),
        )
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def read_statements(fh: BinaryIO, format: str) -> Generator[Statement, None, None]:
    """Yield the statements in ``fh``, decoded according to ``format``.

    ``CSV`` and ``PACK`` select their dedicated readers; any other value is
    read as line-delimited JSON.
    """
    readers = {
        CSV: read_csv_statements,
        PACK: read_pack_statements,
    }
    reader = readers.get(format, read_json_statements)
    yield from reader(fh)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def read_path_statements(path: Path, format: str) -> Generator[Statement, None, None]:
    """Yield the statements found at ``path``, decoded according to ``format``.

    A path of ``-`` reads from the binary standard input stream provided by
    click; any other path is opened in binary mode for the duration of the
    iteration.
    """
    if str(path) != "-":
        with open(path, "rb") as fh:
            yield from read_statements(fh, format=format)
        return
    stdin = click.get_binary_stream("stdin")
    yield from read_statements(stdin, format=format)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def get_statement_writer(fh: BinaryIO, format: str) -> "StatementWriter":
    """Return a statement writer for ``format`` that writes to ``fh``.

    The CSV and pack writers operate on text, so ``fh`` is wrapped in a UTF-8
    text layer for them; the JSON writer receives the binary stream as is.

    Raises:
        RuntimeError: if ``format`` is not one of ``CSV``, ``PACK`` or ``JSON``.
    """
    if format == CSV:
        # newline="" stops the text layer from translating the "\n" written
        # by the csv module into the platform line separator.
        wrapped = TextIOWrapper(fh, encoding="utf-8", newline="")
        return CSVStatementWriter(wrapped)
    elif format == PACK:
        wrapped = TextIOWrapper(fh, encoding="utf-8", newline="")
        return PackStatementWriter(wrapped)
    elif format == JSON:
        return JSONStatementWriter(fh)
    raise RuntimeError("Unknown statement format: %s" % format)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def write_statements(
    fh: BinaryIO, format: str, statements: Iterable[Statement]
) -> None:
    """Write all ``statements`` to ``fh`` in the given ``format``.

    The writer is closed when the function returns, which also closes ``fh``.
    It is used as a context manager so that it is closed even when iterating
    ``statements`` or writing one of them raises.

    Raises:
        RuntimeError: if ``format`` is not a known statement format.
    """
    with get_statement_writer(fh, format) as writer:
        for stmt in statements:
            writer.write(stmt)
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
class StatementWriter:
    """Base class of the statement writers.

    Subclasses implement ``write`` and ``close``. Instances can be used as
    context managers; leaving the ``with`` block calls ``close``.
    """

    def write(self, stmt: Statement) -> None:
        """Write a single statement. Must be implemented by subclasses."""
        raise NotImplementedError()

    def close(self) -> None:
        """Finish writing. Must be implemented by subclasses."""
        raise NotImplementedError()

    def __enter__(self) -> "StatementWriter":
        return self

    def __exit__(
        self,
        type: Optional[Type[BaseException]],
        value: Optional[BaseException],
        traceback: Optional[TracebackType],
    ) -> None:
        # Always close, whether or not the block raised; returning None lets
        # any exception propagate.
        self.close()
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
class JSONStatementWriter(StatementWriter):
    """Writer that emits statements as line-delimited JSON to a binary stream."""

    def __init__(self, fh: BinaryIO) -> None:
        self.fh = fh

    def write(self, stmt: Statement) -> None:
        """Serialize ``stmt.to_dict()`` with orjson and write it as one line."""
        encoded = orjson.dumps(stmt.to_dict(), option=orjson.OPT_APPEND_NEWLINE)
        self.fh.write(encoded)

    def close(self) -> None:
        """Close the underlying stream."""
        self.fh.close()
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
class CSVStatementWriter(StatementWriter):
    """Writer that emits statements as CSV rows in ``CSV_COLUMNS`` order.

    The header row is written on construction. Rows are buffered and written
    in batches of ``CSV_BATCH``; ``close`` writes the remainder.
    """

    def __init__(self, fh: TextIO) -> None:
        self.fh = fh
        self.writer = csv.writer(self.fh, dialect=csv.unix_dialect)
        self.writer.writerow(CSV_COLUMNS)
        self._batch: List[List[Optional[str]]] = []

    def write(self, stmt: Statement) -> None:
        """Buffer ``stmt`` and flush the buffer once it holds ``CSV_BATCH`` rows."""
        values = stmt.to_csv_row()
        self._batch.append([values[column] for column in CSV_COLUMNS])
        if len(self._batch) < CSV_BATCH:
            return
        self.writer.writerows(self._batch)
        self._batch.clear()

    def close(self) -> None:
        """Write any buffered rows and close the underlying stream."""
        if self._batch:
            self.writer.writerows(self._batch)
        self.fh.close()
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
class PackStatementWriter(StatementWriter):
    """Writer that emits statements in the pack format.

    The pack format is a minimally-quoted CSV whose ``prop`` column holds the
    schema and property joined as ``schema:prop``. The header row is written
    on construction. Rows are buffered and written in batches of
    ``CSV_BATCH``; ``close`` writes the remainder.
    """

    def __init__(self, fh: TextIO) -> None:
        self.fh = fh
        self.writer = csv.writer(
            self.fh,
            dialect=csv.unix_dialect,
            quoting=csv.QUOTE_MINIMAL,
        )
        self.writer.writerow(
            [
                "entity_id",
                "prop",
                "value",
                "dataset",
                "lang",
                "original_value",
                "origin",
                "external",
                "first_seen",
                "last_seen",
                "id",
            ]
        )
        self._batch: List[List[Optional[str]]] = []

    def write(self, stmt: Statement) -> None:
        """Buffer ``stmt`` and flush the buffer once it holds ``CSV_BATCH`` rows."""
        # HACK: This is very similar to the CSV writer, but at the very inner
        # loop of the application, so we're duplicating code here.
        packed_prop = f"{stmt.schema}:{stmt.prop}"
        # A true external flag is stored as "t"; a false one is left empty.
        external = "t" if stmt.external else None
        self._batch.append(
            [
                stmt.entity_id,
                packed_prop,
                stmt.value,
                stmt.dataset,
                stmt.lang,
                stmt.original_value,
                stmt.origin,
                external,
                stmt.first_seen,
                stmt.last_seen,
                stmt.id,
            ]
        )
        if len(self._batch) < CSV_BATCH:
            return
        self.writer.writerows(self._batch)
        self._batch.clear()

    def close(self) -> None:
        """Write any buffered rows and close the underlying stream."""
        if self._batch:
            self.writer.writerows(self._batch)
        self.fh.close()
|
|
@@ -0,0 +1,256 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import warnings
|
|
3
|
+
from sqlalchemy.engine import Row
|
|
4
|
+
from typing import cast
|
|
5
|
+
from typing import Any, Dict, Generator, Optional
|
|
6
|
+
from typing_extensions import TypedDict, Self
|
|
7
|
+
from rigour.time import datetime_iso, iso_datetime
|
|
8
|
+
from rigour.boolean import bool_text
|
|
9
|
+
|
|
10
|
+
from followthemoney.proxy import EntityProxy
|
|
11
|
+
from followthemoney.statement.util import get_prop_type, BASE_ID
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class StatementDict(TypedDict):
    """Dictionary form of a statement, as produced by ``Statement.to_dict``."""

    # Identity of the statement and of the entity it belongs to.
    id: Optional[str]
    entity_id: str
    canonical_id: str

    # The property being stated and its value.
    prop: str
    schema: str
    value: str

    # Provenance and qualifiers.
    dataset: str
    lang: Optional[str]
    original_value: Optional[str]
    external: bool
    first_seen: Optional[str]
    last_seen: Optional[str]
    origin: Optional[str]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class Statement(object):
    """A single statement about a property relevant to an entity.

    For example, this could be used to say: "In dataset A, entity X has the
    property `name` set to 'John Smith'. I first observed this at K, and last
    saw it at L."

    Null property values are not supported. This might need to change if we
    want to support making property-less entities.
    """

    # Name of the pseudo-property that carries the entity ID itself.
    BASE = BASE_ID

    __slots__ = [
        "id",
        "entity_id",
        "canonical_id",
        "prop",
        "schema",
        "value",
        "dataset",
        "lang",
        "original_value",
        "external",
        "first_seen",
        "last_seen",
        "origin",
    ]

    def __init__(
        self,
        entity_id: str,
        prop: str,
        schema: str,
        value: str,
        dataset: str,
        lang: Optional[str] = None,
        original_value: Optional[str] = None,
        first_seen: Optional[str] = None,
        external: bool = False,
        id: Optional[str] = None,
        canonical_id: Optional[str] = None,
        last_seen: Optional[str] = None,
        origin: Optional[str] = None,
    ):
        """Create a statement.

        ``canonical_id`` defaults to ``entity_id`` and ``last_seen`` defaults
        to ``first_seen``. When no ``id`` is given, one is derived from the
        statement's key fields with ``generate_key``.
        """
        self.entity_id = entity_id
        self.canonical_id = canonical_id or entity_id
        self.prop = prop
        self.schema = schema
        self.value = value
        self.dataset = dataset
        self.lang = lang
        self.original_value = original_value
        self.first_seen = first_seen
        self.last_seen = last_seen or first_seen
        self.external = external
        self.origin = origin
        # The key is generated last because it reads the fields set above.
        if id is None:
            id = self.generate_key()
        self.id = id

    @property
    def prop_type(self) -> str:
        """The type of the property, e.g. 'string', 'number', 'url'."""
        return get_prop_type(self.schema, self.prop)

    def to_dict(self) -> StatementDict:
        """Return all fields of the statement as a new dictionary."""
        return {
            "canonical_id": self.canonical_id,
            "entity_id": self.entity_id,
            "prop": self.prop,
            "schema": self.schema,
            "value": self.value,
            "dataset": self.dataset,
            "lang": self.lang,
            "original_value": self.original_value,
            "first_seen": self.first_seen,
            "last_seen": self.last_seen,
            "external": self.external,
            "origin": self.origin,
            "id": self.id,
        }

    def to_csv_row(self) -> Dict[str, Optional[str]]:
        """Return the statement as a CSV row dictionary.

        Compared to ``to_dict``, ``external`` is rendered as text with
        ``bool_text`` and a ``prop_type`` entry is added.
        """
        data = cast(Dict[str, Optional[str]], self.to_dict())
        data["external"] = bool_text(self.external)
        data["prop_type"] = get_prop_type(self.schema, self.prop)
        return data

    def to_db_row(self) -> Dict[str, Any]:
        """Return the statement as a database row dictionary.

        Compared to ``to_dict``, ``first_seen`` and ``last_seen`` are passed
        through ``iso_datetime`` and a ``prop_type`` entry is added.
        """
        data = cast(Dict[str, Any], self.to_dict())
        data["first_seen"] = iso_datetime(self.first_seen)
        data["last_seen"] = iso_datetime(self.last_seen)
        data["prop_type"] = get_prop_type(self.schema, self.prop)
        return data

    def __hash__(self) -> int:
        """Hash the statement by its ``id``, warning if the ID is missing."""
        if self.id is None:
            warnings.warn(
                "Hashing a statement without an ID results in undefined behaviour",
                RuntimeWarning,
            )
        return hash(self.id)

    def __repr__(self) -> str:
        return "<Statement(%r, %r, %r)>" % (self.entity_id, self.prop, self.value)

    def __eq__(self, other: Any) -> bool:
        """Compare two statements by their ``id``.

        Any object exposing an ``id`` attribute is compared by that ID. For
        objects without one, ``NotImplemented`` is returned so that Python
        applies its default handling (e.g. ``stmt == None`` is ``False``)
        instead of raising ``AttributeError``.
        """
        try:
            other_id = other.id
        except AttributeError:
            return NotImplemented
        return bool(self.id == other_id)

    def __lt__(self, other: Any) -> bool:
        """Order statements so that ``BASE_ID`` statements sort first, then by ID."""
        # False sorts before True, so statements whose prop is BASE_ID lead.
        self_key = (self.prop != BASE_ID, self.id or "")
        other_key = (other.prop != BASE_ID, other.id or "")
        return self_key < other_key

    def clone(self: Self) -> "Statement":
        """Make a deep copy of the given statement."""
        return Statement.from_dict(self.to_dict())

    def generate_key(self) -> Optional[str]:
        """Derive the statement ID from this statement's key fields."""
        return self.make_key(
            self.dataset,
            self.entity_id,
            self.prop,
            self.value,
            self.external,
        )

    @classmethod
    def make_key(
        cls,
        dataset: str,
        entity_id: str,
        prop: str,
        value: str,
        external: Optional[bool],
    ) -> Optional[str]:
        """Hash the key properties of a statement record to make a unique ID.

        Returns ``None`` when ``prop`` or ``value`` is ``None``; otherwise the
        SHA-1 hex digest of the joined key fields.
        """
        if prop is None or value is None:
            return None
        key = f"{dataset}.{entity_id}.{prop}.{value}"
        if external:
            # We consider the external flag in key composition to avoid race conditions
            # where a certain entity might be emitted as external while it is already
            # linked in to the graph via another route.
            key = f"{key}.ext"
        return hashlib.sha1(key.encode("utf-8")).hexdigest()

    @classmethod
    def from_dict(cls, data: StatementDict) -> "Statement":
        """Build a statement from its dictionary form.

        ``entity_id``, ``prop``, ``schema``, ``value`` and ``dataset`` are
        required keys; all other fields fall back to their defaults.
        """
        return cls(
            entity_id=data["entity_id"],
            prop=data["prop"],
            schema=data["schema"],
            value=data["value"],
            dataset=data["dataset"],
            lang=data.get("lang", None),
            original_value=data.get("original_value", None),
            first_seen=data.get("first_seen", None),
            external=data.get("external", False),
            id=data.get("id", None),
            canonical_id=data.get("canonical_id", None),
            last_seen=data.get("last_seen", None),
            origin=data.get("origin", None),
        )

    @classmethod
    def from_db_row(cls, row: Row[Any]) -> "Statement":
        """Build a statement from a database row.

        ``first_seen`` and ``last_seen`` are passed through ``datetime_iso``;
        all other columns are copied as they are.
        """
        return cls(
            id=row.id,
            canonical_id=row.canonical_id,
            entity_id=row.entity_id,
            prop=row.prop,
            schema=row.schema,
            value=row.value,
            dataset=row.dataset,
            lang=row.lang,
            original_value=row.original_value,
            first_seen=datetime_iso(row.first_seen),
            external=row.external,
            last_seen=datetime_iso(row.last_seen),
            origin=row.origin,
        )

    @classmethod
    def from_entity(
        cls,
        entity: "EntityProxy",
        dataset: str,
        first_seen: Optional[str] = None,
        last_seen: Optional[str] = None,
        external: bool = False,
        origin: Optional[str] = None,
    ) -> Generator["Statement", None, None]:
        """Yield the statements that describe ``entity``.

        A ``StatementEntity`` yields its own statements unchanged. For any
        other entity, one ``BASE_ID`` statement carrying the entity ID is
        yielded first, followed by one statement per property value; all of
        them share the given dataset, timestamps, external flag and origin.

        Raises:
            ValueError: if the entity has no ID.
        """
        # NOTE(review): imported locally, presumably to avoid an import
        # cycle with the entity module - confirm.
        from followthemoney.statement.entity import StatementEntity

        if entity.id is None:
            raise ValueError("Cannot create statements for entity without ID!")

        # If the entity is already a StatementEntity, we return its statements directly.
        if isinstance(entity, StatementEntity):
            yield from entity.statements
            return

        yield cls(
            entity_id=entity.id,
            prop=BASE_ID,
            schema=entity.schema.name,
            value=entity.id,
            dataset=dataset,
            external=external,
            first_seen=first_seen,
            last_seen=last_seen,
            origin=origin,
        )
        for prop, value in entity.itervalues():
            yield cls(
                entity_id=entity.id,
                prop=prop.name,
                schema=entity.schema.name,
                value=value,
                dataset=dataset,
                external=external,
                first_seen=first_seen,
                last_seen=last_seen,
                origin=origin,
            )
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from functools import cache
|
|
3
|
+
from typing import Tuple
|
|
4
|
+
|
|
5
|
+
from followthemoney.model import Model
|
|
6
|
+
|
|
7
|
+
# Name of the pseudo-property that identifies the entity itself; it is also
# returned as its own property type by get_prop_type.
BASE_ID = "id"
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def pack_prop(schema: str, prop: str) -> str:
    """Join a schema name and a property name into a ``schema:prop`` identifier."""
    return "{}:{}".format(schema, prop)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@cache
def get_prop_type(schema: str, prop: str) -> str:
    """Return the name of the type of property ``prop`` on schema ``schema``.

    The ``BASE_ID`` pseudo-property is reported as its own type without
    consulting the model. Results are memoized per ``(schema, prop)`` pair.

    Raises:
        TypeError: if the schema or the property is not found in the model.
    """
    if prop == BASE_ID:
        return BASE_ID
    found_schema = Model.instance().get(schema)
    if found_schema is None:
        raise TypeError("Schema not found: %s" % schema)
    found_prop = found_schema.get(prop)
    if found_prop is None:
        raise TypeError("Property not found: %s" % prop)
    return found_prop.type.name
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@cache
def unpack_prop(id: str) -> Tuple[str, str, str]:
    """Split a packed ``schema:prop`` identifier.

    Returns a ``(schema, prop_type, prop)`` tuple in which the schema and
    property names are interned strings. Results are memoized per identifier.

    Raises:
        ValueError: if ``id`` contains no ``:`` separator.
        TypeError: if the schema or property is not found in the model.
    """
    # Split on the first colon only; anything after it is the property name.
    schema_name, prop_name = id.split(":", 1)
    type_name = get_prop_type(schema_name, prop_name)
    return sys.intern(schema_name), type_name, sys.intern(prop_name)
|
followthemoney/types/__init__.py
CHANGED
|
@@ -1,4 +1,6 @@
|
|
|
1
|
-
from
|
|
1
|
+
from banal import ensure_list
|
|
2
|
+
from typing import Dict, Iterable, List, Set, cast
|
|
3
|
+
|
|
2
4
|
from followthemoney.types.url import UrlType
|
|
3
5
|
from followthemoney.types.name import NameType
|
|
4
6
|
from followthemoney.types.email import EmailType
|
|
@@ -11,7 +13,6 @@ from followthemoney.types.language import LanguageType
|
|
|
11
13
|
from followthemoney.types.mimetype import MimeType
|
|
12
14
|
from followthemoney.types.checksum import ChecksumType
|
|
13
15
|
from followthemoney.types.identifier import IdentifierType
|
|
14
|
-
from followthemoney.types.iban import IbanType
|
|
15
16
|
from followthemoney.types.entity import EntityType
|
|
16
17
|
from followthemoney.types.topic import TopicType
|
|
17
18
|
from followthemoney.types.gender import GenderType
|
|
@@ -22,27 +23,69 @@ from followthemoney.types.string import StringType
|
|
|
22
23
|
from followthemoney.types.number import NumberType
|
|
23
24
|
from followthemoney.types.common import PropertyType
|
|
24
25
|
|
|
26
|
+
|
|
27
|
+
class Registry(object):
    """This registry keeps the processing helpers for all property types in the system. The
    registry can be used to get a type, which can itself then clean, validate or format values
    of that type."""

    # One shared instance per property type; each attribute name must equal
    # the type's own ``name`` (checked in ``__init__``).
    url = UrlType()
    name = NameType()
    email = EmailType()
    ip = IpType()
    address = AddressType()
    date = DateType()
    phone = PhoneType()
    country = CountryType()
    language = LanguageType()
    mimetype = MimeType()
    checksum = ChecksumType()
    identifier = IdentifierType()
    entity = EntityType()
    topic = TopicType()
    gender = GenderType()
    json = JsonType()
    text = TextType()
    html = HTMLType()
    string = StringType()
    number = NumberType()

    def __init__(self) -> None:
        """Index the type attributes into the lookup collections.

        ``types`` holds every property type, ``matchable`` and ``pivots`` the
        types flagged accordingly, and ``groups`` maps a group name to its type.
        """
        self.matchable: Set[PropertyType] = set()
        self.types: Set[PropertyType] = set()
        self.groups: Dict[str, PropertyType] = {}
        self.pivots: Set[PropertyType] = set()
        for attr in dir(self):
            candidate = getattr(self, attr)
            if isinstance(candidate, PropertyType):
                assert candidate.name == attr
                self.types.add(candidate)
                if candidate.matchable:
                    self.matchable.add(candidate)
                if candidate.pivot:
                    self.pivots.add(candidate)
                if candidate.group is not None:
                    self.groups[candidate.group] = candidate

    def get(self, name: str) -> PropertyType:
        """For a given property type name, get its type object. This can also
        be used via getattr, e.g. ``registry.phone``."""
        # Allow transparent re-checking: a type object is returned unchanged.
        if not isinstance(name, PropertyType):
            return cast(PropertyType, getattr(self, name))
        return name

    def get_types(self, names: Iterable[str]) -> List[PropertyType]:
        """Get a list of all property type objects linked to a set of names."""
        resolved = (self.get(n) for n in ensure_list(names))
        return [t for t in resolved if t is not None]

    def __getitem__(self, name: str) -> PropertyType:
        """Look up a type by name with subscript syntax, e.g. ``registry["phone"]``."""
        return cast(PropertyType, getattr(self, name))
|
|
87
|
+
|
|
88
|
+
|
|
25
89
|
registry = Registry()
|
|
26
|
-
registry.add(UrlType)
|
|
27
|
-
registry.add(NameType)
|
|
28
|
-
registry.add(EmailType)
|
|
29
|
-
registry.add(IpType)
|
|
30
|
-
registry.add(AddressType)
|
|
31
|
-
registry.add(DateType)
|
|
32
|
-
registry.add(PhoneType)
|
|
33
|
-
registry.add(CountryType)
|
|
34
|
-
registry.add(LanguageType)
|
|
35
|
-
registry.add(MimeType)
|
|
36
|
-
registry.add(ChecksumType)
|
|
37
|
-
registry.add(IdentifierType)
|
|
38
|
-
registry.add(IbanType) # TODO: remove
|
|
39
|
-
registry.add(EntityType)
|
|
40
|
-
registry.add(TopicType)
|
|
41
|
-
registry.add(GenderType)
|
|
42
|
-
registry.add(JsonType)
|
|
43
|
-
registry.add(TextType)
|
|
44
|
-
registry.add(HTMLType)
|
|
45
|
-
registry.add(StringType)
|
|
46
|
-
registry.add(NumberType)
|
|
47
90
|
|
|
48
91
|
__all__ = ["PropertyType", "registry"]
|
followthemoney/types/address.py
CHANGED
|
@@ -7,7 +7,7 @@ from rigour.text.distance import levenshtein_similarity
|
|
|
7
7
|
|
|
8
8
|
from followthemoney.types.common import PropertyType
|
|
9
9
|
from followthemoney.util import defer as _
|
|
10
|
-
from followthemoney.util import dampen
|
|
10
|
+
from followthemoney.util import dampen, const
|
|
11
11
|
|
|
12
12
|
if TYPE_CHECKING:
|
|
13
13
|
from followthemoney.proxy import EntityProxy
|
|
@@ -21,8 +21,8 @@ class AddressType(PropertyType):
|
|
|
21
21
|
|
|
22
22
|
LINE_BREAKS = re.compile(r"(\r\n|\n|<BR/>|<BR>|\t|ESQ\.,|ESQ,|;)")
|
|
23
23
|
COMMATA = re.compile(r"(,\s?[,\.])")
|
|
24
|
-
name = "address"
|
|
25
|
-
group = "addresses"
|
|
24
|
+
name = const("address")
|
|
25
|
+
group = const("addresses")
|
|
26
26
|
label = _("Address")
|
|
27
27
|
plural = _("Addresses")
|
|
28
28
|
matchable = True
|
followthemoney/types/checksum.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
|
1
|
-
from followthemoney.rdf import URIRef, Identifier
|
|
2
1
|
from followthemoney.types.common import PropertyType
|
|
3
|
-
from followthemoney.util import defer as _
|
|
2
|
+
from followthemoney.util import const, defer as _
|
|
4
3
|
|
|
5
4
|
|
|
6
5
|
class ChecksumType(PropertyType):
|
|
@@ -13,13 +12,10 @@ class ChecksumType(PropertyType):
|
|
|
13
12
|
of this type are scrubbed when submitted via the normal API. Checksums can only
|
|
14
13
|
be defined by uploading a document to be ingested."""
|
|
15
14
|
|
|
16
|
-
name = "checksum"
|
|
17
|
-
group = "checksums"
|
|
15
|
+
name = const("checksum")
|
|
16
|
+
group = const("checksums")
|
|
18
17
|
label = _("Checksum")
|
|
19
18
|
plural = _("Checksums")
|
|
20
19
|
matchable = True
|
|
21
20
|
pivot = True
|
|
22
21
|
max_length = 40
|
|
23
|
-
|
|
24
|
-
def rdf(self, value: str) -> Identifier:
|
|
25
|
-
return URIRef(f"hash:{value}")
|