followthemoney 4.3.0__py3-none-any.whl → 4.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- followthemoney/__init__.py +1 -1
- followthemoney/compare.py +6 -0
- followthemoney/dataset/dataset.py +18 -0
- followthemoney/entity.py +29 -15
- followthemoney/mapping/csv.py +3 -1
- followthemoney/model.py +6 -5
- followthemoney/property.py +23 -4
- followthemoney/proxy.py +32 -11
- followthemoney/schema/Company.yaml +5 -0
- followthemoney/schema/CryptoWallet.yaml +4 -0
- followthemoney/schema/Image.yaml +7 -0
- followthemoney/schema/LegalEntity.yaml +10 -0
- followthemoney/schema/Organization.yaml +5 -0
- followthemoney/schema/Person.yaml +4 -0
- followthemoney/schema/PublicBody.yaml +4 -0
- followthemoney/schema/Thing.yaml +3 -2
- followthemoney/schema.py +16 -2
- followthemoney/settings.py +19 -0
- followthemoney/statement/entity.py +31 -7
- followthemoney/statement/serialize.py +18 -13
- followthemoney/statement/statement.py +151 -42
- followthemoney/statement/util.py +23 -2
- followthemoney/types/address.py +3 -3
- followthemoney/types/checksum.py +3 -3
- followthemoney/types/country.py +19 -4
- followthemoney/types/date.py +13 -3
- followthemoney/types/entity.py +3 -3
- followthemoney/types/gender.py +6 -6
- followthemoney/types/identifier.py +8 -8
- followthemoney/types/ip.py +3 -3
- followthemoney/types/json.py +2 -2
- followthemoney/types/language.py +3 -3
- followthemoney/types/mimetype.py +3 -3
- followthemoney/types/name.py +3 -3
- followthemoney/types/number.py +2 -2
- followthemoney/types/phone.py +3 -3
- followthemoney/types/string.py +2 -2
- followthemoney/types/topic.py +6 -3
- followthemoney/types/url.py +3 -3
- followthemoney/util.py +6 -14
- {followthemoney-4.3.0.dist-info → followthemoney-4.5.0.dist-info}/METADATA +3 -3
- {followthemoney-4.3.0.dist-info → followthemoney-4.5.0.dist-info}/RECORD +45 -44
- {followthemoney-4.3.0.dist-info → followthemoney-4.5.0.dist-info}/WHEEL +1 -1
- {followthemoney-4.3.0.dist-info → followthemoney-4.5.0.dist-info}/entry_points.txt +0 -0
- {followthemoney-4.3.0.dist-info → followthemoney-4.5.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -5,9 +5,10 @@ import logging
|
|
|
5
5
|
from io import TextIOWrapper
|
|
6
6
|
from pathlib import Path
|
|
7
7
|
from types import TracebackType
|
|
8
|
-
from typing import cast
|
|
8
|
+
from typing import Dict, Tuple, cast
|
|
9
9
|
from typing import BinaryIO, Generator, Iterable, List, Optional, TextIO, Type
|
|
10
10
|
from rigour.boolean import text_bool
|
|
11
|
+
from rigour.env import ENCODING
|
|
11
12
|
|
|
12
13
|
from followthemoney.statement.statement import Statement, StatementDict
|
|
13
14
|
from followthemoney.statement.util import unpack_prop
|
|
@@ -60,7 +61,7 @@ def read_json_statements(
|
|
|
60
61
|
|
|
61
62
|
|
|
62
63
|
def read_csv_statements(fh: BinaryIO) -> Generator[Statement, None, None]:
|
|
63
|
-
wrapped = TextIOWrapper(fh, encoding=
|
|
64
|
+
wrapped = TextIOWrapper(fh, encoding=ENCODING)
|
|
64
65
|
for row in csv.DictReader(wrapped, dialect=csv.unix_dialect):
|
|
65
66
|
data = cast(StatementDict, row)
|
|
66
67
|
data["external"] = text_bool(row.get("external")) or False
|
|
@@ -72,7 +73,7 @@ def read_csv_statements(fh: BinaryIO) -> Generator[Statement, None, None]:
|
|
|
72
73
|
|
|
73
74
|
|
|
74
75
|
def read_pack_statements(fh: BinaryIO) -> Generator[Statement, None, None]:
    """Yield the statements stored in a binary pack-format stream.

    The binary handle is wrapped in a text layer using the shared
    ``ENCODING`` constant; parsing itself is delegated to
    ``read_pack_statements_decoded``.
    """
    text_fh = TextIOWrapper(fh, encoding=ENCODING)
    yield from read_pack_statements_decoded(text_fh)
|
|
77
78
|
|
|
78
79
|
|
|
@@ -129,10 +130,10 @@ def read_path_statements(path: Path, format: str) -> Generator[Statement, None,
|
|
|
129
130
|
|
|
130
131
|
def get_statement_writer(fh: BinaryIO, format: str) -> "StatementWriter":
|
|
131
132
|
if format == CSV:
|
|
132
|
-
wrapped = TextIOWrapper(fh, encoding=
|
|
133
|
+
wrapped = TextIOWrapper(fh, encoding=ENCODING)
|
|
133
134
|
return CSVStatementWriter(wrapped)
|
|
134
135
|
elif format == PACK:
|
|
135
|
-
wrapped = TextIOWrapper(fh, encoding=
|
|
136
|
+
wrapped = TextIOWrapper(fh, encoding=ENCODING)
|
|
136
137
|
return PackStatementWriter(wrapped)
|
|
137
138
|
elif format == JSON:
|
|
138
139
|
return JSONStatementWriter(fh)
|
|
@@ -222,12 +223,14 @@ class PackStatementWriter(StatementWriter):
|
|
|
222
223
|
"id",
|
|
223
224
|
]
|
|
224
225
|
self.writer.writerow(columns)
|
|
225
|
-
self._batch:
|
|
226
|
+
self._batch: Dict[str, Tuple[Optional[str], ...]] = {}
|
|
226
227
|
|
|
227
228
|
def write(self, stmt: Statement) -> None:
|
|
228
229
|
# HACK: This is very similar to the CSV writer, but at the very inner
|
|
229
230
|
# loop of the application, so we're duplicating code here.
|
|
230
|
-
|
|
231
|
+
if stmt.id is None:
|
|
232
|
+
raise RuntimeError("Cannot write pack statement without ID")
|
|
233
|
+
row = (
|
|
231
234
|
stmt.entity_id,
|
|
232
235
|
f"{stmt.schema}:{stmt.prop}",
|
|
233
236
|
stmt.value,
|
|
@@ -239,13 +242,15 @@ class PackStatementWriter(StatementWriter):
|
|
|
239
242
|
stmt.first_seen,
|
|
240
243
|
stmt.last_seen,
|
|
241
244
|
stmt.id,
|
|
242
|
-
|
|
243
|
-
self._batch.
|
|
245
|
+
)
|
|
246
|
+
self._batch[stmt.id] = row
|
|
244
247
|
if len(self._batch) >= CSV_BATCH:
|
|
245
|
-
self.
|
|
246
|
-
|
|
248
|
+
self.flush()
|
|
249
|
+
|
|
250
|
+
def flush(self) -> None:
    """Write every buffered row to the CSV writer and empty the buffer."""
    pending = self._batch
    # The batch is keyed by statement ID, so only the row values are written.
    self.writer.writerows(pending.values())
    pending.clear()
|
|
247
253
|
|
|
248
254
|
def close(self) -> None:
    """Flush any buffered rows, then close the underlying file handle.

    The handle is closed even when flushing raises, so a failed final write
    does not leak the open file; the original exception still propagates.
    """
    try:
        self.flush()
    finally:
        self.fh.close()
|
|
@@ -1,14 +1,22 @@
|
|
|
1
1
|
import hashlib
|
|
2
2
|
import warnings
|
|
3
3
|
from sqlalchemy.engine import Row
|
|
4
|
-
from typing import cast
|
|
5
|
-
from typing import Any, Dict, Generator, Optional
|
|
4
|
+
from typing import Union, cast
|
|
5
|
+
from typing import Any, Dict, Generator, Optional, TypeGuard
|
|
6
6
|
from typing_extensions import TypedDict, Self
|
|
7
7
|
from rigour.time import datetime_iso, iso_datetime
|
|
8
8
|
from rigour.boolean import bool_text
|
|
9
9
|
|
|
10
10
|
from followthemoney.proxy import EntityProxy
|
|
11
|
-
from followthemoney.statement.util import get_prop_type, BASE_ID
|
|
11
|
+
from followthemoney.statement.util import get_prop_type, BASE_ID, NON_LANG_TYPE_NAMES
|
|
12
|
+
from followthemoney.util import HASH_ENCODING
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
UNSET = object()
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def is_not_unset(value: str | None | object) -> TypeGuard[str | None]:
|
|
19
|
+
return value is not UNSET
|
|
12
20
|
|
|
13
21
|
|
|
14
22
|
class StatementDict(TypedDict):
|
|
@@ -42,15 +50,16 @@ class Statement(object):
|
|
|
42
50
|
|
|
43
51
|
__slots__ = [
|
|
44
52
|
"id",
|
|
45
|
-
"
|
|
53
|
+
"_entity_id",
|
|
46
54
|
"canonical_id",
|
|
47
|
-
"
|
|
48
|
-
"
|
|
49
|
-
"
|
|
50
|
-
"
|
|
51
|
-
"
|
|
55
|
+
"_prop",
|
|
56
|
+
"_schema",
|
|
57
|
+
"_value",
|
|
58
|
+
"_dataset",
|
|
59
|
+
"_lang",
|
|
60
|
+
"prop_type",
|
|
52
61
|
"original_value",
|
|
53
|
-
"
|
|
62
|
+
"_external",
|
|
54
63
|
"first_seen",
|
|
55
64
|
"last_seen",
|
|
56
65
|
"origin",
|
|
@@ -72,55 +81,95 @@ class Statement(object):
|
|
|
72
81
|
last_seen: Optional[str] = None,
|
|
73
82
|
origin: Optional[str] = None,
|
|
74
83
|
):
|
|
75
|
-
self.
|
|
84
|
+
self._entity_id = entity_id
|
|
76
85
|
self.canonical_id = canonical_id or entity_id
|
|
77
|
-
self.
|
|
78
|
-
self.
|
|
79
|
-
self.
|
|
80
|
-
self.
|
|
81
|
-
self.
|
|
86
|
+
self._prop = prop
|
|
87
|
+
self._schema = schema
|
|
88
|
+
self.prop_type = get_prop_type(schema, prop)
|
|
89
|
+
self._value = value
|
|
90
|
+
self._dataset = dataset
|
|
91
|
+
|
|
92
|
+
# Remove lang for non-linguistic property types. The goal here is to avoid
|
|
93
|
+
# duplicate statements because of language tags, but the language metadata
|
|
94
|
+
# may be relevant as context for how the original_value was parsed so it's
|
|
95
|
+
# a bit of information loss.
|
|
96
|
+
if lang is not None:
|
|
97
|
+
if self.prop_type in NON_LANG_TYPE_NAMES:
|
|
98
|
+
lang = None
|
|
99
|
+
self._lang = lang
|
|
100
|
+
|
|
82
101
|
self.original_value = original_value
|
|
83
102
|
self.first_seen = first_seen
|
|
84
103
|
self.last_seen = last_seen or first_seen
|
|
85
|
-
self.
|
|
104
|
+
self._external = external
|
|
86
105
|
self.origin = origin
|
|
87
106
|
if id is None:
|
|
88
107
|
id = self.generate_key()
|
|
89
108
|
self.id = id
|
|
90
109
|
|
|
91
110
|
@property
def entity_id(self) -> str:
    """ID of the entity the statement is about, as originally given."""
    return self._entity_id

@property
def dataset(self) -> str:
    """Name of the dataset in which the statement was observed."""
    return self._dataset

@property
def prop(self) -> str:
    """Name of the property the statement describes."""
    return self._prop

@property
def schema(self) -> str:
    """Schema of the entity the statement is about."""
    return self._schema

@property
def value(self) -> str:
    """Property value captured by the statement."""
    return self._value

@property
def lang(self) -> Optional[str]:
    """Language of the property value, or None when not applicable."""
    return self._lang

@property
def external(self) -> bool:
    """True when the statement was observed in an external dataset."""
    return self._external
|
|
95
144
|
|
|
96
145
|
def to_dict(self) -> "StatementDict":
    """Serialise the statement into a plain dictionary.

    Values are copied from the instance unchanged; the key order is fixed
    by the literal below.
    """
    data: "StatementDict" = {
        "canonical_id": self.canonical_id,
        "entity_id": self._entity_id,
        "prop": self._prop,
        "schema": self._schema,
        "value": self._value,
        "dataset": self._dataset,
        "lang": self._lang,
        "original_value": self.original_value,
        "first_seen": self.first_seen,
        "last_seen": self.last_seen,
        "external": self._external,
        "origin": self.origin,
        "id": self.id,
    }
    return data
|
|
112
161
|
|
|
113
162
|
def to_csv_row(self) -> Dict[str, Optional[str]]:
    """Return the statement as a flat dictionary suitable for a CSV row.

    Starts from ``to_dict()``, replaces the boolean ``external`` flag with
    its ``bool_text`` form and adds the ``prop_type`` column.
    """
    row = cast(Dict[str, Optional[str]], self.to_dict())
    row.update(
        external=bool_text(self._external),
        prop_type=self.prop_type,
    )
    return row
|
|
118
167
|
|
|
119
168
|
def to_db_row(self) -> Dict[str, Any]:
    """Return the statement as a dictionary for a database row.

    Starts from ``to_dict()``, passes both timestamp fields through
    ``iso_datetime`` and adds the ``prop_type`` column.
    """
    row = cast(Dict[str, Any], self.to_dict())
    for column in ("first_seen", "last_seen"):
        row[column] = iso_datetime(getattr(self, column))
    row["prop_type"] = self.prop_type
    return row
|
|
125
174
|
|
|
126
175
|
def __hash__(self) -> int:
|
|
@@ -132,27 +181,83 @@ class Statement(object):
|
|
|
132
181
|
return hash(self.id)
|
|
133
182
|
|
|
134
183
|
def __repr__(self) -> str:
    """Debug representation showing the entity ID, property and value."""
    shown = (self._entity_id, self._prop, self._value)
    return "<Statement(%r, %r, %r)>" % shown
|
|
136
185
|
|
|
137
186
|
def __eq__(self, other: Any) -> bool:
    """Two statements are equal when their IDs are equal.

    Objects without an ``id`` attribute (``None``, strings, ...) are not
    comparable: ``NotImplemented`` is returned so Python falls back to its
    default handling and ``stmt == None`` evaluates to False instead of
    raising ``AttributeError``.
    """
    if not hasattr(other, "id"):
        return NotImplemented
    return bool(self.id == other.id)
|
|
139
188
|
|
|
140
189
|
def __lt__(self, other: Any) -> bool:
    """Sort statements on the ``BASE_ID`` property first, then by statement ID.

    The first key element is False for ``BASE_ID`` statements, and False
    sorts before True. A missing ID is treated as the empty string.
    """
    mine = (self._prop != BASE_ID, self.id or "")
    theirs = (other._prop != BASE_ID, other.id or "")
    return mine < theirs
|
|
144
193
|
|
|
145
|
-
def clone(
    self: Self,
    *,
    entity_id: Optional[str] = None,
    prop: Optional[str] = None,
    schema: Optional[str] = None,
    value: Optional[str] = None,
    dataset: Optional[str] = None,
    lang: Union[str, None, object] = UNSET,
    original_value: Union[str, None, object] = UNSET,
    first_seen: Union[str, None, object] = UNSET,
    external: Optional[bool] = None,
    canonical_id: Optional[str] = None,
    last_seen: Union[str, None, object] = UNSET,
    origin: Union[str, None, object] = UNSET,
) -> "Statement":
    """Return a copy of this statement, optionally overriding some fields.

    Fields for which ``None`` is a meaningful value (``lang``,
    ``original_value``, ``first_seen``, ``last_seen``, ``origin``) default to
    the ``UNSET`` sentinel, so an explicit ``None`` clears them. The other
    fields default to ``None``, meaning "keep the current value".

    The statement ID is carried over only when entity ID, property, schema,
    value, dataset, external flag and language are all unchanged; otherwise
    ``None`` is passed as the ID of the new statement.
    """
    # Sentinel-defaulted fields: fall back only when nothing was passed.
    new_lang = lang if is_not_unset(lang) else self._lang
    new_original = original_value if is_not_unset(original_value) else self.original_value
    new_first_seen = first_seen if is_not_unset(first_seen) else self.first_seen
    new_last_seen = last_seen if is_not_unset(last_seen) else self.last_seen
    new_origin = origin if is_not_unset(origin) else self.origin

    new_external = self._external if external is None else external

    # Keep an existing canonical ID only if it differs from the entity ID.
    new_canonical = canonical_id
    if new_canonical is None and self._entity_id != self.canonical_id:
        new_canonical = self.canonical_id

    # Decide if the statement ID can be kept the same:
    overrides = (
        (entity_id, self.entity_id),
        (prop, self._prop),
        (schema, self._schema),
        (value, self._value),
        (dataset, self._dataset),
    )
    changed = any(new is not None and new != old for new, old in overrides)
    changed = changed or new_external != self._external or new_lang != self._lang
    stmt_id = None if changed else self.id

    return Statement(
        id=stmt_id,
        entity_id=entity_id or self._entity_id,
        prop=prop or self._prop,
        schema=schema or self._schema,
        value=value or self._value,
        dataset=dataset or self._dataset,
        lang=new_lang,
        original_value=new_original,
        first_seen=new_first_seen,
        external=new_external,
        canonical_id=new_canonical,
        last_seen=new_last_seen,
        origin=new_origin,
    )
|
|
148
252
|
|
|
149
253
|
def generate_key(self) -> Optional[str]:
    """Compute this statement's key by calling ``make_key`` with its own fields."""
    key_fields = (
        self._dataset,
        self._entity_id,
        self._prop,
        self._value,
        self._external,
    )
    return self.make_key(*key_fields, lang=self._lang)
|
|
157
262
|
|
|
158
263
|
@classmethod
|
|
@@ -163,17 +268,21 @@ class Statement(object):
|
|
|
163
268
|
prop: str,
|
|
164
269
|
value: str,
|
|
165
270
|
external: Optional[bool],
|
|
271
|
+
lang: Optional[str] = None,
|
|
166
272
|
) -> Optional[str]:
|
|
167
273
|
"""Hash the key properties of a statement record to make a unique ID."""
|
|
168
274
|
if prop is None or value is None:
|
|
169
275
|
return None
|
|
170
|
-
|
|
276
|
+
if lang is None:
|
|
277
|
+
key = f"{dataset}.{entity_id}.{prop}.{value}"
|
|
278
|
+
else:
|
|
279
|
+
key = f"{dataset}.{entity_id}.{prop}.{value}@{lang}"
|
|
171
280
|
if external:
|
|
172
281
|
# We consider the external flag in key composition to avoid race conditions
|
|
173
282
|
# where a certain entity might be emitted as external while it is already
|
|
174
283
|
# linked in to the graph via another route.
|
|
175
284
|
key = f"{key}.ext"
|
|
176
|
-
return hashlib.sha1(key.encode(
|
|
285
|
+
return hashlib.sha1(key.encode(HASH_ENCODING)).hexdigest()
|
|
177
286
|
|
|
178
287
|
@classmethod
|
|
179
288
|
def from_dict(cls, data: StatementDict) -> "Statement":
|
followthemoney/statement/util.py
CHANGED
|
@@ -1,11 +1,32 @@
|
|
|
1
|
-
import sys
|
|
2
1
|
from functools import cache
|
|
3
2
|
from typing import Tuple
|
|
4
3
|
|
|
5
4
|
from followthemoney.model import Model
|
|
5
|
+
from followthemoney.types import registry
|
|
6
|
+
from followthemoney.util import const
|
|
6
7
|
|
|
7
8
|
BASE_ID = "id"

# Names of property types whose statements should not carry a `lang`
# attribute. These are typically non-linguistic types; one could argue that
# language metadata is still useful for dates and countries, where text
# parsing is likely to have taken place.
NON_LANG_TYPE_NAMES = {
    BASE_ID,
    registry.checksum.name,
    registry.country.name,
    registry.date.name,
    registry.email.name,
    registry.entity.name,
    registry.gender.name,
    registry.ip.name,
    registry.language.name,
    registry.mimetype.name,
    registry.phone.name,
    registry.topic.name,
    registry.url.name,
}
|
|
29
|
+
|
|
9
30
|
|
|
10
31
|
def pack_prop(schema: str, prop: str) -> str:
    """Join a schema name and a property name into ``schema:prop`` form."""
    return "{}:{}".format(schema, prop)
|
|
@@ -28,4 +49,4 @@ def get_prop_type(schema: str, prop: str) -> str:
|
|
|
28
49
|
def unpack_prop(id: str) -> Tuple[str, str, str]:
    """Split a packed ``schema:prop`` string into ``(schema, prop_type, prop)``.

    Only the first colon separates the two parts. The schema and property
    names are passed through ``const`` before being returned.
    """
    schema_name, prop_name = id.split(":", 1)
    type_name = get_prop_type(schema_name, prop_name)
    return const(schema_name), type_name, const(prop_name)
|
followthemoney/types/address.py
CHANGED
|
@@ -6,7 +6,7 @@ from rigour.text.distance import levenshtein_similarity
|
|
|
6
6
|
|
|
7
7
|
from followthemoney.types.common import PropertyType
|
|
8
8
|
from followthemoney.util import defer as _
|
|
9
|
-
from followthemoney.util import dampen
|
|
9
|
+
from followthemoney.util import dampen
|
|
10
10
|
|
|
11
11
|
if TYPE_CHECKING:
|
|
12
12
|
from followthemoney.proxy import EntityProxy
|
|
@@ -20,8 +20,8 @@ class AddressType(PropertyType):
|
|
|
20
20
|
|
|
21
21
|
LINE_BREAKS = re.compile(r"(\r\n|\n|<BR/>|<BR>|\t|ESQ\.,|ESQ,|;)")
|
|
22
22
|
COMMATA = re.compile(r"(,\s?[,\.])")
|
|
23
|
-
name =
|
|
24
|
-
group =
|
|
23
|
+
name = "address"
|
|
24
|
+
group = "addresses"
|
|
25
25
|
label = _("Address")
|
|
26
26
|
plural = _("Addresses")
|
|
27
27
|
matchable = True
|
followthemoney/types/checksum.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
from followthemoney.types.common import PropertyType
|
|
2
|
-
from followthemoney.util import
|
|
2
|
+
from followthemoney.util import defer as _
|
|
3
3
|
|
|
4
4
|
|
|
5
5
|
class ChecksumType(PropertyType):
|
|
@@ -12,8 +12,8 @@ class ChecksumType(PropertyType):
|
|
|
12
12
|
of this type are scrubbed when submitted via the normal API. Checksums can only
|
|
13
13
|
be defined by uploading a document to be ingested."""
|
|
14
14
|
|
|
15
|
-
name =
|
|
16
|
-
group =
|
|
15
|
+
name = "checksum"
|
|
16
|
+
group = "checksums"
|
|
17
17
|
label = _("Checksum")
|
|
18
18
|
plural = _("Checksums")
|
|
19
19
|
matchable = True
|
followthemoney/types/country.py
CHANGED
|
@@ -1,9 +1,10 @@
|
|
|
1
|
-
from typing import Optional, TYPE_CHECKING
|
|
1
|
+
from typing import Callable, Optional, TYPE_CHECKING, Sequence
|
|
2
2
|
from babel.core import Locale
|
|
3
3
|
from rigour.territories import get_ftm_countries, lookup_territory
|
|
4
|
+
from rigour.territories import territories_intersect
|
|
4
5
|
|
|
5
6
|
from followthemoney.types.common import EnumType, EnumValues
|
|
6
|
-
from followthemoney.util import
|
|
7
|
+
from followthemoney.util import defer as _
|
|
7
8
|
|
|
8
9
|
if TYPE_CHECKING:
|
|
9
10
|
from followthemoney.proxy import EntityProxy
|
|
@@ -15,8 +16,8 @@ class CountryType(EnumType):
|
|
|
15
16
|
a number of unusual and controversial designations (e.g. the Soviet Union,
|
|
16
17
|
Transnistria, Somaliland, Kosovo)."""
|
|
17
18
|
|
|
18
|
-
name =
|
|
19
|
-
group =
|
|
19
|
+
name = "country"
|
|
20
|
+
group = "countries"
|
|
20
21
|
label = _("Country")
|
|
21
22
|
plural = _("Countries")
|
|
22
23
|
matchable = True
|
|
@@ -25,6 +26,20 @@ class CountryType(EnumType):
|
|
|
25
26
|
def _locale_names(self, locale: Locale) -> EnumValues:
    """Map each territory code from ``get_ftm_countries()`` to its name.

    The ``locale`` argument is not used by this implementation.
    """
    names = {}
    for territory in get_ftm_countries():
        names[territory.code] = territory.name
    return names
|
|
27
28
|
|
|
29
|
+
def compare(self, left: str, right: str) -> float:
    """Score two country codes: 1.0 if their territories intersect, else 0.0."""
    shared = territories_intersect([left], [right])
    if len(shared):
        return 1.0
    return 0.0
|
|
32
|
+
|
|
33
|
+
def compare_sets(
    self,
    left: Sequence[str],
    right: Sequence[str],
    func: Callable[[Sequence[float]], float] = max,
) -> float:
    """Score two sets of country codes by territorial overlap.

    Returns 1.0 when ``territories_intersect`` reports any overlap between
    the two sides and 0.0 otherwise. ``func`` is accepted but not used by
    this implementation.
    """
    shared = territories_intersect(left, right)
    if len(shared):
        return 1.0
    return 0.0
|
|
42
|
+
|
|
28
43
|
def clean_text(
|
|
29
44
|
self,
|
|
30
45
|
text: str,
|
followthemoney/types/date.py
CHANGED
|
@@ -5,7 +5,7 @@ from prefixdate import parse, parse_format, Precision
|
|
|
5
5
|
|
|
6
6
|
from followthemoney.types.common import PropertyType
|
|
7
7
|
from followthemoney.util import defer as _
|
|
8
|
-
from followthemoney.util import dampen
|
|
8
|
+
from followthemoney.util import dampen
|
|
9
9
|
|
|
10
10
|
if TYPE_CHECKING:
|
|
11
11
|
from followthemoney.proxy import EntityProxy
|
|
@@ -20,13 +20,23 @@ class DateType(PropertyType):
|
|
|
20
20
|
The timezone is always expected to be UTC and cannot be specified otherwise. There is
|
|
21
21
|
no support for calendar weeks (`2021-W7`) and date ranges (`2021-2024`)."""
|
|
22
22
|
|
|
23
|
-
name =
|
|
24
|
-
group =
|
|
23
|
+
name = "date"
|
|
24
|
+
group = "dates"
|
|
25
25
|
label = _("Date")
|
|
26
26
|
plural = _("Dates")
|
|
27
27
|
matchable = True
|
|
28
28
|
max_length = 32
|
|
29
29
|
|
|
30
|
+
HISTORIC = "1001-01-01"
|
|
31
|
+
"""A sentinel date value representing a very old date, used to indicate historic (and often imprecise) dates
|
|
32
|
+
that can be assumed to be long in the past."""
|
|
33
|
+
|
|
34
|
+
RELEVANCE_MIN = "1900-01-01"
|
|
35
|
+
"""A cutoff date value representing the minimum relevant date for modern fincrime applications."""
|
|
36
|
+
|
|
37
|
+
RELEVANCE_MAX = "2100-12-31"
|
|
38
|
+
"""A cutoff date value representing the maximum relevant date for modern fincrime applications."""
|
|
39
|
+
|
|
30
40
|
def validate(
|
|
31
41
|
self, value: str, fuzzy: bool = False, format: Optional[str] = None
|
|
32
42
|
) -> bool:
|
followthemoney/types/entity.py
CHANGED
|
@@ -4,7 +4,7 @@ from typing import Any, Optional, TYPE_CHECKING
|
|
|
4
4
|
from followthemoney.types.common import PropertyType
|
|
5
5
|
from followthemoney.value import Value
|
|
6
6
|
from followthemoney.util import ENTITY_ID_LEN, get_entity_id, sanitize_text
|
|
7
|
-
from followthemoney.util import
|
|
7
|
+
from followthemoney.util import gettext, defer as _
|
|
8
8
|
from followthemoney.exc import InvalidData
|
|
9
9
|
|
|
10
10
|
if TYPE_CHECKING:
|
|
@@ -22,8 +22,8 @@ class EntityType(PropertyType):
|
|
|
22
22
|
|
|
23
23
|
REGEX_RAW = r"^[0-9a-zA-Z]([0-9a-zA-Z\.\-]*[0-9a-zA-Z])?$"
|
|
24
24
|
REGEX = re.compile(REGEX_RAW)
|
|
25
|
-
name =
|
|
26
|
-
group =
|
|
25
|
+
name = "entity"
|
|
26
|
+
group = "entities"
|
|
27
27
|
label = _("Entity")
|
|
28
28
|
plural = _("Entities")
|
|
29
29
|
matchable = True
|
followthemoney/types/gender.py
CHANGED
|
@@ -2,7 +2,7 @@ from typing import Optional, TYPE_CHECKING
|
|
|
2
2
|
from babel.core import Locale
|
|
3
3
|
|
|
4
4
|
from followthemoney.types.common import EnumType, EnumValues
|
|
5
|
-
from followthemoney.util import
|
|
5
|
+
from followthemoney.util import gettext, defer as _
|
|
6
6
|
|
|
7
7
|
if TYPE_CHECKING:
|
|
8
8
|
from followthemoney.proxy import EntityProxy
|
|
@@ -14,9 +14,9 @@ class GenderType(EnumType):
|
|
|
14
14
|
government databases and represent it in a way that can be used by
|
|
15
15
|
structured tools. I'm not sure this justifies the simplification."""
|
|
16
16
|
|
|
17
|
-
MALE =
|
|
18
|
-
FEMALE =
|
|
19
|
-
OTHER =
|
|
17
|
+
MALE = "male"
|
|
18
|
+
FEMALE = "female"
|
|
19
|
+
OTHER = "other"
|
|
20
20
|
|
|
21
21
|
LOOKUP = {
|
|
22
22
|
"m": MALE,
|
|
@@ -34,8 +34,8 @@ class GenderType(EnumType):
|
|
|
34
34
|
"divers": OTHER,
|
|
35
35
|
}
|
|
36
36
|
|
|
37
|
-
name =
|
|
38
|
-
group =
|
|
37
|
+
name = "gender"
|
|
38
|
+
group = "genders"
|
|
39
39
|
label = _("Gender")
|
|
40
40
|
plural = _("Genders")
|
|
41
41
|
matchable = False
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
import re
|
|
2
2
|
from typing import Optional, TYPE_CHECKING
|
|
3
|
-
from rigour.ids import
|
|
3
|
+
from rigour.ids import get_identifier_format
|
|
4
4
|
|
|
5
5
|
from followthemoney.types.common import PropertyType
|
|
6
6
|
from followthemoney.util import dampen, shortest, longest
|
|
7
|
-
from followthemoney.util import
|
|
7
|
+
from followthemoney.util import defer as _
|
|
8
8
|
|
|
9
9
|
if TYPE_CHECKING:
|
|
10
10
|
from followthemoney.proxy import EntityProxy
|
|
@@ -20,8 +20,8 @@ class IdentifierType(PropertyType):
|
|
|
20
20
|
Four- or five-digit industry classifiers create more noise than value."""
|
|
21
21
|
|
|
22
22
|
COMPARE_CLEAN = re.compile(r"[\W_]+")
|
|
23
|
-
name =
|
|
24
|
-
group =
|
|
23
|
+
name = "identifier"
|
|
24
|
+
group = "identifiers"
|
|
25
25
|
label = _("Identifier")
|
|
26
26
|
plural = _("Identifiers")
|
|
27
27
|
matchable = True
|
|
@@ -35,8 +35,8 @@ class IdentifierType(PropertyType):
|
|
|
35
35
|
format: Optional[str] = None,
|
|
36
36
|
proxy: Optional["EntityProxy"] = None,
|
|
37
37
|
) -> Optional[str]:
|
|
38
|
-
|
|
39
|
-
|
|
38
|
+
format_ = get_identifier_format(format)
|
|
39
|
+
if format_ is not None:
|
|
40
40
|
return format_.normalize(text)
|
|
41
41
|
return text
|
|
42
42
|
|
|
@@ -61,7 +61,7 @@ class IdentifierType(PropertyType):
|
|
|
61
61
|
return f"id:{value}"
|
|
62
62
|
|
|
63
63
|
def caption(self, value: str, format: Optional[str] = None) -> str:
    """Return a display form of an identifier value.

    When ``get_identifier_format`` returns a format object for ``format``,
    its ``format()`` method produces the caption; otherwise the value is
    returned unchanged.
    """
    formatter = get_identifier_format(format)
    return value if formatter is None else formatter.format(value)
|
followthemoney/types/ip.py
CHANGED
|
@@ -2,7 +2,7 @@ from typing import Optional, TYPE_CHECKING
|
|
|
2
2
|
from ipaddress import ip_address
|
|
3
3
|
|
|
4
4
|
from followthemoney.types.common import PropertyType
|
|
5
|
-
from followthemoney.util import
|
|
5
|
+
from followthemoney.util import defer as _
|
|
6
6
|
|
|
7
7
|
if TYPE_CHECKING:
|
|
8
8
|
from followthemoney.proxy import EntityProxy
|
|
@@ -13,8 +13,8 @@ class IpType(PropertyType):
|
|
|
13
13
|
by the protocol versions 4 (e.g. `192.168.1.143`) and 6
|
|
14
14
|
(e.g. `0:0:0:0:0:ffff:c0a8:18f`)."""
|
|
15
15
|
|
|
16
|
-
name =
|
|
17
|
-
group =
|
|
16
|
+
name = "ip"
|
|
17
|
+
group = "ips"
|
|
18
18
|
label = _("IP Address")
|
|
19
19
|
plural = _("IP Addresses")
|
|
20
20
|
matchable = True
|
followthemoney/types/json.py
CHANGED
|
@@ -3,7 +3,7 @@ from typing import Any, Optional, Sequence, TYPE_CHECKING
|
|
|
3
3
|
from banal import ensure_list
|
|
4
4
|
|
|
5
5
|
from followthemoney.types.common import PropertyType
|
|
6
|
-
from followthemoney.util import
|
|
6
|
+
from followthemoney.util import sanitize_text, defer as _
|
|
7
7
|
|
|
8
8
|
if TYPE_CHECKING:
|
|
9
9
|
from followthemoney.proxy import EntityProxy
|
|
@@ -14,7 +14,7 @@ class JsonType(PropertyType):
|
|
|
14
14
|
and some other edge cases. It's a really bad idea and we should try to get rid
|
|
15
15
|
of JSON properties."""
|
|
16
16
|
|
|
17
|
-
name =
|
|
17
|
+
name = "json"
|
|
18
18
|
group = None
|
|
19
19
|
label = _("Nested data")
|
|
20
20
|
plural = _("Nested data")
|