followthemoney 4.3.4__py3-none-any.whl → 4.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- followthemoney/__init__.py +5 -4
- followthemoney/cli/statement.py +13 -7
- followthemoney/cli/util.py +3 -3
- followthemoney/compare.py +6 -19
- followthemoney/dataset/__init__.py +2 -2
- followthemoney/dataset/dataset.py +20 -0
- followthemoney/entity.py +14 -0
- followthemoney/mapping/csv.py +3 -1
- followthemoney/model.py +4 -5
- followthemoney/proxy.py +27 -3
- followthemoney/schema/Company.yaml +1 -0
- followthemoney/schema/CryptoWallet.yaml +4 -0
- followthemoney/schema/Image.yaml +7 -0
- followthemoney/schema/LegalEntity.yaml +7 -0
- followthemoney/schema/Organization.yaml +1 -0
- followthemoney/schema/Person.yaml +2 -1
- followthemoney/schema/PublicBody.yaml +1 -0
- followthemoney/settings.py +19 -0
- followthemoney/statement/entity.py +39 -10
- followthemoney/statement/serialize.py +23 -14
- followthemoney/statement/statement.py +151 -42
- followthemoney/statement/util.py +21 -0
- followthemoney/types/country.py +16 -1
- followthemoney/types/date.py +10 -0
- followthemoney/types/language.py +1 -1
- followthemoney/util.py +6 -14
- {followthemoney-4.3.4.dist-info → followthemoney-4.5.1.dist-info}/METADATA +3 -3
- {followthemoney-4.3.4.dist-info → followthemoney-4.5.1.dist-info}/RECORD +31 -30
- {followthemoney-4.3.4.dist-info → followthemoney-4.5.1.dist-info}/WHEEL +0 -0
- {followthemoney-4.3.4.dist-info → followthemoney-4.5.1.dist-info}/entry_points.txt +0 -0
- {followthemoney-4.3.4.dist-info → followthemoney-4.5.1.dist-info}/licenses/LICENSE +0 -0
followthemoney/statement/serialize.py
CHANGED
|
@@ -1,13 +1,15 @@
|
|
|
1
1
|
import csv
|
|
2
|
+
import sys
|
|
2
3
|
import click
|
|
3
4
|
import orjson
|
|
4
5
|
import logging
|
|
5
6
|
from io import TextIOWrapper
|
|
6
7
|
from pathlib import Path
|
|
7
8
|
from types import TracebackType
|
|
8
|
-
from typing import cast
|
|
9
|
+
from typing import Dict, Tuple, cast
|
|
9
10
|
from typing import BinaryIO, Generator, Iterable, List, Optional, TextIO, Type
|
|
10
11
|
from rigour.boolean import text_bool
|
|
12
|
+
from rigour.env import ENCODING
|
|
11
13
|
|
|
12
14
|
from followthemoney.statement.statement import Statement, StatementDict
|
|
13
15
|
from followthemoney.statement.util import unpack_prop
|
|
@@ -48,6 +50,7 @@ LEGACY_PACK_COLUMNS = [
|
|
|
48
50
|
"first_seen",
|
|
49
51
|
"last_seen",
|
|
50
52
|
]
|
|
53
|
+
csv.field_size_limit(sys.maxsize)
|
|
51
54
|
|
|
52
55
|
|
|
53
56
|
def read_json_statements(
|
|
@@ -60,7 +63,7 @@ def read_json_statements(
|
|
|
60
63
|
|
|
61
64
|
|
|
62
65
|
def read_csv_statements(fh: BinaryIO) -> Generator[Statement, None, None]:
|
|
63
|
-
wrapped = TextIOWrapper(fh, encoding="utf-8")
|
|
66
|
+
wrapped = TextIOWrapper(fh, encoding=ENCODING)
|
|
64
67
|
for row in csv.DictReader(wrapped, dialect=csv.unix_dialect):
|
|
65
68
|
data = cast(StatementDict, row)
|
|
66
69
|
data["external"] = text_bool(row.get("external")) or False
|
|
@@ -68,11 +71,13 @@ def read_csv_statements(fh: BinaryIO) -> Generator[Statement, None, None]:
|
|
|
68
71
|
data["lang"] = None
|
|
69
72
|
if row.get("original_value") == "":
|
|
70
73
|
data["original_value"] = None
|
|
74
|
+
if row.get("origin") == "":
|
|
75
|
+
data["origin"] = None
|
|
71
76
|
yield Statement.from_dict(data)
|
|
72
77
|
|
|
73
78
|
|
|
74
79
|
def read_pack_statements(fh: BinaryIO) -> Generator[Statement, None, None]:
|
|
75
|
-
wrapped = TextIOWrapper(fh, encoding="utf-8")
|
|
80
|
+
wrapped = TextIOWrapper(fh, encoding=ENCODING)
|
|
76
81
|
yield from read_pack_statements_decoded(wrapped)
|
|
77
82
|
|
|
78
83
|
|
|
@@ -100,7 +105,7 @@ def read_pack_statements_decoded(fh: TextIO) -> Generator[Statement, None, None]
|
|
|
100
105
|
dataset=data["dataset"],
|
|
101
106
|
lang=data["lang"] or None,
|
|
102
107
|
original_value=data["original_value"] or None,
|
|
103
|
-
origin=data.get("origin"),
|
|
108
|
+
origin=data.get("origin") or None,
|
|
104
109
|
first_seen=data["first_seen"],
|
|
105
110
|
external=data["external"] == "t",
|
|
106
111
|
canonical_id=data["entity_id"],
|
|
@@ -129,10 +134,10 @@ def read_path_statements(path: Path, format: str) -> Generator[Statement, None,
|
|
|
129
134
|
|
|
130
135
|
def get_statement_writer(fh: BinaryIO, format: str) -> "StatementWriter":
|
|
131
136
|
if format == CSV:
|
|
132
|
-
wrapped = TextIOWrapper(fh, encoding="utf-8")
|
|
137
|
+
wrapped = TextIOWrapper(fh, encoding=ENCODING)
|
|
133
138
|
return CSVStatementWriter(wrapped)
|
|
134
139
|
elif format == PACK:
|
|
135
|
-
wrapped = TextIOWrapper(fh, encoding="utf-8")
|
|
140
|
+
wrapped = TextIOWrapper(fh, encoding=ENCODING)
|
|
136
141
|
return PackStatementWriter(wrapped)
|
|
137
142
|
elif format == JSON:
|
|
138
143
|
return JSONStatementWriter(fh)
|
|
@@ -222,12 +227,14 @@ class PackStatementWriter(StatementWriter):
|
|
|
222
227
|
"id",
|
|
223
228
|
]
|
|
224
229
|
self.writer.writerow(columns)
|
|
225
|
-
self._batch:
|
|
230
|
+
self._batch: Dict[str, Tuple[Optional[str], ...]] = {}
|
|
226
231
|
|
|
227
232
|
def write(self, stmt: Statement) -> None:
|
|
228
233
|
# HACK: This is very similar to the CSV writer, but at the very inner
|
|
229
234
|
# loop of the application, so we're duplicating code here.
|
|
230
|
-
|
|
235
|
+
if stmt.id is None:
|
|
236
|
+
raise RuntimeError("Cannot write pack statement without ID")
|
|
237
|
+
row = (
|
|
231
238
|
stmt.entity_id,
|
|
232
239
|
f"{stmt.schema}:{stmt.prop}",
|
|
233
240
|
stmt.value,
|
|
@@ -239,13 +246,15 @@ class PackStatementWriter(StatementWriter):
|
|
|
239
246
|
stmt.first_seen,
|
|
240
247
|
stmt.last_seen,
|
|
241
248
|
stmt.id,
|
|
242
|
-
|
|
243
|
-
self._batch.
|
|
249
|
+
)
|
|
250
|
+
self._batch[stmt.id] = row
|
|
244
251
|
if len(self._batch) >= CSV_BATCH:
|
|
245
|
-
self.
|
|
246
|
-
|
|
252
|
+
self.flush()
|
|
253
|
+
|
|
254
|
+
def flush(self) -> None:
|
|
255
|
+
self.writer.writerows(self._batch.values())
|
|
256
|
+
self._batch.clear()
|
|
247
257
|
|
|
248
258
|
def close(self) -> None:
|
|
249
|
-
|
|
250
|
-
self.writer.writerows(self._batch)
|
|
259
|
+
self.flush()
|
|
251
260
|
self.fh.close()
|
followthemoney/statement/statement.py
CHANGED
|
@@ -1,14 +1,22 @@
|
|
|
1
1
|
import hashlib
|
|
2
2
|
import warnings
|
|
3
3
|
from sqlalchemy.engine import Row
|
|
4
|
-
from typing import cast
|
|
5
|
-
from typing import Any, Dict, Generator, Optional
|
|
4
|
+
from typing import Union, cast
|
|
5
|
+
from typing import Any, Dict, Generator, Optional, TypeGuard
|
|
6
6
|
from typing_extensions import TypedDict, Self
|
|
7
7
|
from rigour.time import datetime_iso, iso_datetime
|
|
8
8
|
from rigour.boolean import bool_text
|
|
9
9
|
|
|
10
10
|
from followthemoney.proxy import EntityProxy
|
|
11
|
-
from followthemoney.statement.util import get_prop_type, BASE_ID
|
|
11
|
+
from followthemoney.statement.util import get_prop_type, BASE_ID, NON_LANG_TYPE_NAMES
|
|
12
|
+
from followthemoney.util import HASH_ENCODING
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
UNSET = object()
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def is_not_unset(value: str | None | object) -> TypeGuard[str | None]:
|
|
19
|
+
return value is not UNSET
|
|
12
20
|
|
|
13
21
|
|
|
14
22
|
class StatementDict(TypedDict):
|
|
@@ -42,15 +50,16 @@ class Statement(object):
|
|
|
42
50
|
|
|
43
51
|
__slots__ = [
|
|
44
52
|
"id",
|
|
45
|
-
"entity_id",
|
|
53
|
+
"_entity_id",
|
|
46
54
|
"canonical_id",
|
|
47
|
-
"prop",
|
|
48
|
-
"schema",
|
|
49
|
-
"value",
|
|
50
|
-
"dataset",
|
|
51
|
-
"lang",
|
|
55
|
+
"_prop",
|
|
56
|
+
"_schema",
|
|
57
|
+
"_value",
|
|
58
|
+
"_dataset",
|
|
59
|
+
"_lang",
|
|
60
|
+
"prop_type",
|
|
52
61
|
"original_value",
|
|
53
|
-
"external",
|
|
62
|
+
"_external",
|
|
54
63
|
"first_seen",
|
|
55
64
|
"last_seen",
|
|
56
65
|
"origin",
|
|
@@ -72,55 +81,95 @@ class Statement(object):
|
|
|
72
81
|
last_seen: Optional[str] = None,
|
|
73
82
|
origin: Optional[str] = None,
|
|
74
83
|
):
|
|
75
|
-
self.entity_id = entity_id
|
|
84
|
+
self._entity_id = entity_id
|
|
76
85
|
self.canonical_id = canonical_id or entity_id
|
|
77
|
-
self.prop = prop
|
|
78
|
-
self.schema = schema
|
|
79
|
-
self.value = value
|
|
80
|
-
self.dataset = dataset
|
|
81
|
-
self.lang = lang
|
|
86
|
+
self._prop = prop
|
|
87
|
+
self._schema = schema
|
|
88
|
+
self.prop_type = get_prop_type(schema, prop)
|
|
89
|
+
self._value = value
|
|
90
|
+
self._dataset = dataset
|
|
91
|
+
|
|
92
|
+
# Remove lang for non-linguistic property types. The goal here is to avoid
|
|
93
|
+
# duplicate statements because of language tags, but the language metadata
|
|
94
|
+
# may be relevant as context for how the original_value was parsed so it's
|
|
95
|
+
# a bit of information loss.
|
|
96
|
+
if lang is not None:
|
|
97
|
+
if self.prop_type in NON_LANG_TYPE_NAMES:
|
|
98
|
+
lang = None
|
|
99
|
+
self._lang = lang
|
|
100
|
+
|
|
82
101
|
self.original_value = original_value
|
|
83
102
|
self.first_seen = first_seen
|
|
84
103
|
self.last_seen = last_seen or first_seen
|
|
85
|
-
self.external = external
|
|
104
|
+
self._external = external
|
|
86
105
|
self.origin = origin
|
|
87
106
|
if id is None:
|
|
88
107
|
id = self.generate_key()
|
|
89
108
|
self.id = id
|
|
90
109
|
|
|
91
110
|
@property
|
|
92
|
-
def
|
|
93
|
-
"""The
|
|
94
|
-
return
|
|
111
|
+
def entity_id(self) -> str:
|
|
112
|
+
"""The (original) ID of the entity this statement is about."""
|
|
113
|
+
return self._entity_id
|
|
114
|
+
|
|
115
|
+
@property
|
|
116
|
+
def dataset(self) -> str:
|
|
117
|
+
"""The dataset this statement was observed in."""
|
|
118
|
+
return self._dataset
|
|
119
|
+
|
|
120
|
+
@property
|
|
121
|
+
def prop(self) -> str:
|
|
122
|
+
"""The property name this statement is about."""
|
|
123
|
+
return self._prop
|
|
124
|
+
|
|
125
|
+
@property
|
|
126
|
+
def schema(self) -> str:
|
|
127
|
+
"""The schema of the entity this statement is about."""
|
|
128
|
+
return self._schema
|
|
129
|
+
|
|
130
|
+
@property
|
|
131
|
+
def value(self) -> str:
|
|
132
|
+
"""The value of the property captured by this statement."""
|
|
133
|
+
return self._value
|
|
134
|
+
|
|
135
|
+
@property
|
|
136
|
+
def lang(self) -> Optional[str]:
|
|
137
|
+
"""The language of the property value, if applicable."""
|
|
138
|
+
return self._lang
|
|
139
|
+
|
|
140
|
+
@property
|
|
141
|
+
def external(self) -> bool:
|
|
142
|
+
"""Whether this statement was observed in an external dataset."""
|
|
143
|
+
return self._external
|
|
95
144
|
|
|
96
145
|
def to_dict(self) -> StatementDict:
|
|
97
146
|
return {
|
|
98
147
|
"canonical_id": self.canonical_id,
|
|
99
|
-
"entity_id": self.entity_id,
|
|
100
|
-
"prop": self.prop,
|
|
101
|
-
"schema": self.schema,
|
|
102
|
-
"value": self.value,
|
|
103
|
-
"dataset": self.dataset,
|
|
104
|
-
"lang": self.lang,
|
|
148
|
+
"entity_id": self._entity_id,
|
|
149
|
+
"prop": self._prop,
|
|
150
|
+
"schema": self._schema,
|
|
151
|
+
"value": self._value,
|
|
152
|
+
"dataset": self._dataset,
|
|
153
|
+
"lang": self._lang,
|
|
105
154
|
"original_value": self.original_value,
|
|
106
155
|
"first_seen": self.first_seen,
|
|
107
156
|
"last_seen": self.last_seen,
|
|
108
|
-
"external": self.external,
|
|
157
|
+
"external": self._external,
|
|
109
158
|
"origin": self.origin,
|
|
110
159
|
"id": self.id,
|
|
111
160
|
}
|
|
112
161
|
|
|
113
162
|
def to_csv_row(self) -> Dict[str, Optional[str]]:
|
|
114
163
|
data = cast(Dict[str, Optional[str]], self.to_dict())
|
|
115
|
-
data["external"] = bool_text(self.external)
|
|
116
|
-
data["prop_type"] =
|
|
164
|
+
data["external"] = bool_text(self._external)
|
|
165
|
+
data["prop_type"] = self.prop_type
|
|
117
166
|
return data
|
|
118
167
|
|
|
119
168
|
def to_db_row(self) -> Dict[str, Any]:
|
|
120
169
|
data = cast(Dict[str, Any], self.to_dict())
|
|
121
170
|
data["first_seen"] = iso_datetime(self.first_seen)
|
|
122
171
|
data["last_seen"] = iso_datetime(self.last_seen)
|
|
123
|
-
data["prop_type"] =
|
|
172
|
+
data["prop_type"] = self.prop_type
|
|
124
173
|
return data
|
|
125
174
|
|
|
126
175
|
def __hash__(self) -> int:
|
|
@@ -132,27 +181,83 @@ class Statement(object):
|
|
|
132
181
|
return hash(self.id)
|
|
133
182
|
|
|
134
183
|
def __repr__(self) -> str:
|
|
135
|
-
return "<Statement(%r, %r, %r)>" % (self.entity_id, self.prop, self.value)
|
|
184
|
+
return "<Statement(%r, %r, %r)>" % (self._entity_id, self._prop, self._value)
|
|
136
185
|
|
|
137
186
|
def __eq__(self, other: Any) -> bool:
|
|
138
187
|
return not self.id != other.id
|
|
139
188
|
|
|
140
189
|
def __lt__(self, other: Any) -> bool:
|
|
141
|
-
self_key = (self.prop != BASE_ID, self.id or "")
|
|
142
|
-
other_key = (other.prop != BASE_ID, other.id or "")
|
|
190
|
+
self_key = (self._prop != BASE_ID, self.id or "")
|
|
191
|
+
other_key = (other._prop != BASE_ID, other.id or "")
|
|
143
192
|
return self_key < other_key
|
|
144
193
|
|
|
145
|
-
def clone(
|
|
194
|
+
def clone(
|
|
195
|
+
self: Self,
|
|
196
|
+
*,
|
|
197
|
+
entity_id: Optional[str] = None,
|
|
198
|
+
prop: Optional[str] = None,
|
|
199
|
+
schema: Optional[str] = None,
|
|
200
|
+
value: Optional[str] = None,
|
|
201
|
+
dataset: Optional[str] = None,
|
|
202
|
+
lang: Union[str, None, object] = UNSET,
|
|
203
|
+
original_value: Union[str, None, object] = UNSET,
|
|
204
|
+
first_seen: Union[str, None, object] = UNSET,
|
|
205
|
+
external: Optional[bool] = None,
|
|
206
|
+
canonical_id: Optional[str] = None,
|
|
207
|
+
last_seen: Union[str, None, object] = UNSET,
|
|
208
|
+
origin: Union[str, None, object] = UNSET,
|
|
209
|
+
) -> "Statement":
|
|
146
210
|
"""Make a deep copy of the given statement."""
|
|
147
|
-
|
|
211
|
+
lang = lang if is_not_unset(lang) else self._lang
|
|
212
|
+
ov = original_value if is_not_unset(original_value) else self.original_value
|
|
213
|
+
first_seen = first_seen if is_not_unset(first_seen) else self.first_seen
|
|
214
|
+
last_seen = last_seen if is_not_unset(last_seen) else self.last_seen
|
|
215
|
+
origin = origin if is_not_unset(origin) else self.origin
|
|
216
|
+
if external is None:
|
|
217
|
+
external = self._external
|
|
218
|
+
if canonical_id is None and self._entity_id != self.canonical_id:
|
|
219
|
+
canonical_id = self.canonical_id
|
|
220
|
+
|
|
221
|
+
# Decide if the statement ID can be kept the same:
|
|
222
|
+
stmt_id = self.id
|
|
223
|
+
if entity_id is not None and entity_id != self.entity_id:
|
|
224
|
+
stmt_id = None
|
|
225
|
+
if prop is not None and prop != self._prop:
|
|
226
|
+
stmt_id = None
|
|
227
|
+
if schema is not None and schema != self._schema:
|
|
228
|
+
stmt_id = None
|
|
229
|
+
if value is not None and value != self._value:
|
|
230
|
+
stmt_id = None
|
|
231
|
+
if dataset is not None and dataset != self._dataset:
|
|
232
|
+
stmt_id = None
|
|
233
|
+
if external != self._external:
|
|
234
|
+
stmt_id = None
|
|
235
|
+
if lang != self._lang:
|
|
236
|
+
stmt_id = None
|
|
237
|
+
return Statement(
|
|
238
|
+
id=stmt_id,
|
|
239
|
+
entity_id=entity_id or self._entity_id,
|
|
240
|
+
prop=prop or self._prop,
|
|
241
|
+
schema=schema or self._schema,
|
|
242
|
+
value=value or self._value,
|
|
243
|
+
dataset=dataset or self._dataset,
|
|
244
|
+
lang=lang,
|
|
245
|
+
original_value=ov,
|
|
246
|
+
first_seen=first_seen,
|
|
247
|
+
external=external,
|
|
248
|
+
canonical_id=canonical_id,
|
|
249
|
+
last_seen=last_seen,
|
|
250
|
+
origin=origin,
|
|
251
|
+
)
|
|
148
252
|
|
|
149
253
|
def generate_key(self) -> Optional[str]:
|
|
150
254
|
return self.make_key(
|
|
151
|
-
self.dataset,
|
|
152
|
-
self.entity_id,
|
|
153
|
-
self.prop,
|
|
154
|
-
self.value,
|
|
155
|
-
self.external,
|
|
255
|
+
self._dataset,
|
|
256
|
+
self._entity_id,
|
|
257
|
+
self._prop,
|
|
258
|
+
self._value,
|
|
259
|
+
self._external,
|
|
260
|
+
lang=self._lang,
|
|
156
261
|
)
|
|
157
262
|
|
|
158
263
|
@classmethod
|
|
@@ -163,17 +268,21 @@ class Statement(object):
|
|
|
163
268
|
prop: str,
|
|
164
269
|
value: str,
|
|
165
270
|
external: Optional[bool],
|
|
271
|
+
lang: Optional[str] = None,
|
|
166
272
|
) -> Optional[str]:
|
|
167
273
|
"""Hash the key properties of a statement record to make a unique ID."""
|
|
168
274
|
if prop is None or value is None:
|
|
169
275
|
return None
|
|
170
|
-
|
|
276
|
+
if lang is None:
|
|
277
|
+
key = f"{dataset}.{entity_id}.{prop}.{value}"
|
|
278
|
+
else:
|
|
279
|
+
key = f"{dataset}.{entity_id}.{prop}.{value}@{lang}"
|
|
171
280
|
if external:
|
|
172
281
|
# We consider the external flag in key composition to avoid race conditions
|
|
173
282
|
# where a certain entity might be emitted as external while it is already
|
|
174
283
|
# linked in to the graph via another route.
|
|
175
284
|
key = f"{key}.ext"
|
|
176
|
-
return hashlib.sha1(key.encode("utf-8")).hexdigest()
|
|
285
|
+
return hashlib.sha1(key.encode(HASH_ENCODING)).hexdigest()
|
|
177
286
|
|
|
178
287
|
@classmethod
|
|
179
288
|
def from_dict(cls, data: StatementDict) -> "Statement":
|
followthemoney/statement/util.py
CHANGED
|
@@ -2,10 +2,31 @@ from functools import cache
|
|
|
2
2
|
from typing import Tuple
|
|
3
3
|
|
|
4
4
|
from followthemoney.model import Model
|
|
5
|
+
from followthemoney.types import registry
|
|
5
6
|
from followthemoney.util import const
|
|
6
7
|
|
|
7
8
|
BASE_ID = "id"
|
|
8
9
|
|
|
10
|
+
# Some property types should not set the `lang` attribute on statements.
|
|
11
|
+
# These are typically non-linguistic types, although there's an argument
|
|
12
|
+
# that language metadata could be useful for dates and countries, where
|
|
13
|
+
# text parsing is likely to have taken place.
|
|
14
|
+
NON_LANG_TYPE_NAMES = {
|
|
15
|
+
registry.entity.name,
|
|
16
|
+
registry.date.name,
|
|
17
|
+
registry.checksum.name,
|
|
18
|
+
registry.email.name,
|
|
19
|
+
registry.phone.name,
|
|
20
|
+
registry.gender.name,
|
|
21
|
+
registry.mimetype.name,
|
|
22
|
+
registry.topic.name,
|
|
23
|
+
registry.url.name,
|
|
24
|
+
registry.country.name,
|
|
25
|
+
registry.language.name,
|
|
26
|
+
registry.ip.name,
|
|
27
|
+
BASE_ID,
|
|
28
|
+
}
|
|
29
|
+
|
|
9
30
|
|
|
10
31
|
def pack_prop(schema: str, prop: str) -> str:
|
|
11
32
|
return f"{schema}:{prop}"
|
followthemoney/types/country.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
|
-
from typing import Optional, TYPE_CHECKING
|
|
1
|
+
from typing import Callable, Optional, TYPE_CHECKING, Sequence
|
|
2
2
|
from babel.core import Locale
|
|
3
3
|
from rigour.territories import get_ftm_countries, lookup_territory
|
|
4
|
+
from rigour.territories import territories_intersect
|
|
4
5
|
|
|
5
6
|
from followthemoney.types.common import EnumType, EnumValues
|
|
6
7
|
from followthemoney.util import defer as _
|
|
@@ -25,6 +26,20 @@ class CountryType(EnumType):
|
|
|
25
26
|
def _locale_names(self, locale: Locale) -> EnumValues:
|
|
26
27
|
return {t.code: t.name for t in get_ftm_countries()}
|
|
27
28
|
|
|
29
|
+
def compare(self, left: str, right: str) -> float:
|
|
30
|
+
overlap = territories_intersect([left], [right])
|
|
31
|
+
return 1.0 if len(overlap) else 0.0
|
|
32
|
+
|
|
33
|
+
def compare_sets(
|
|
34
|
+
self,
|
|
35
|
+
left: Sequence[str],
|
|
36
|
+
right: Sequence[str],
|
|
37
|
+
func: Callable[[Sequence[float]], float] = max,
|
|
38
|
+
) -> float:
|
|
39
|
+
"""Compare two sets of values and select the highest-scored result."""
|
|
40
|
+
overlap = territories_intersect(left, right)
|
|
41
|
+
return 1.0 if len(overlap) else 0.0
|
|
42
|
+
|
|
28
43
|
def clean_text(
|
|
29
44
|
self,
|
|
30
45
|
text: str,
|
followthemoney/types/date.py
CHANGED
|
@@ -27,6 +27,16 @@ class DateType(PropertyType):
|
|
|
27
27
|
matchable = True
|
|
28
28
|
max_length = 32
|
|
29
29
|
|
|
30
|
+
HISTORIC = "1001-01-01"
|
|
31
|
+
"""A sentinel date value representing a very old date, used to indicate historic (and often imprecise) dates
|
|
32
|
+
that can be assumed to be long in the past."""
|
|
33
|
+
|
|
34
|
+
RELEVANCE_MIN = "1900-01-01"
|
|
35
|
+
"""A cutoff date value representing the minimum relevant date for modern fincrime applications."""
|
|
36
|
+
|
|
37
|
+
RELEVANCE_MAX = "2100-12-31"
|
|
38
|
+
"""A cutoff date value representing the maximum relevant date for modern fincrime applications."""
|
|
39
|
+
|
|
30
40
|
def validate(
|
|
31
41
|
self, value: str, fuzzy: bool = False, format: Optional[str] = None
|
|
32
42
|
) -> bool:
|
followthemoney/types/language.py
CHANGED
|
@@ -4,7 +4,7 @@ from rigour.langs import iso_639_alpha3
|
|
|
4
4
|
|
|
5
5
|
from followthemoney.types.common import EnumType, EnumValues
|
|
6
6
|
from followthemoney.util import defer as _, gettext
|
|
7
|
-
from followthemoney.util import get_env_list
|
|
7
|
+
from followthemoney.settings import get_env_list
|
|
8
8
|
|
|
9
9
|
if TYPE_CHECKING:
|
|
10
10
|
from followthemoney.proxy import EntityProxy
|
followthemoney/util.py
CHANGED
|
@@ -10,10 +10,11 @@ from threading import local
|
|
|
10
10
|
from typing import cast, Dict, Any, List, Optional, TypeVar, Union
|
|
11
11
|
from normality import stringify
|
|
12
12
|
from normality.cleaning import remove_unsafe_chars
|
|
13
|
-
from normality.encoding import DEFAULT_ENCODING
|
|
13
|
+
from rigour.env import ENCODING
|
|
14
14
|
from banal import is_mapping, unique_list, ensure_list
|
|
15
15
|
|
|
16
16
|
MEGABYTE = 1024 * 1024
|
|
17
|
+
HASH_ENCODING = "utf-8"
|
|
17
18
|
DEFAULT_LOCALE = "en"
|
|
18
19
|
ENTITY_ID_LEN = 200
|
|
19
20
|
|
|
@@ -55,16 +56,7 @@ def get_locale() -> Locale:
|
|
|
55
56
|
return Locale.parse(state.locale)
|
|
56
57
|
|
|
57
58
|
|
|
58
|
-
def
|
|
59
|
-
value = stringify(os.environ.get(name))
|
|
60
|
-
if value is not None:
|
|
61
|
-
values = value.split(":")
|
|
62
|
-
if len(values):
|
|
63
|
-
return values
|
|
64
|
-
return default
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
def sanitize_text(value: Any, encoding: str = DEFAULT_ENCODING) -> Optional[str]:
|
|
59
|
+
def sanitize_text(value: Any, encoding: str = ENCODING) -> Optional[str]:
|
|
68
60
|
text = stringify(value, encoding_default=encoding)
|
|
69
61
|
if text is None:
|
|
70
62
|
return None
|
|
@@ -74,8 +66,8 @@ def sanitize_text(value: Any, encoding: str = DEFAULT_ENCODING) -> Optional[str]
|
|
|
74
66
|
log.warning("Cannot NFC text: %s", ex)
|
|
75
67
|
return None
|
|
76
68
|
text = remove_unsafe_chars(text)
|
|
77
|
-
byte_text = text.encode(DEFAULT_ENCODING, "replace")
|
|
78
|
-
text = byte_text.decode(DEFAULT_ENCODING, "replace")
|
|
69
|
+
byte_text = text.encode("utf-8", "replace")
|
|
70
|
+
text = byte_text.decode("utf-8", "replace")
|
|
79
71
|
if len(text) == 0:
|
|
80
72
|
return None
|
|
81
73
|
return text
|
|
@@ -88,7 +80,7 @@ def key_bytes(key: Any) -> bytes:
|
|
|
88
80
|
text = stringify(key)
|
|
89
81
|
if text is None:
|
|
90
82
|
return b""
|
|
91
|
-
return text.encode(
|
|
83
|
+
return text.encode(ENCODING)
|
|
92
84
|
|
|
93
85
|
|
|
94
86
|
def join_text(*parts: Any, sep: str = " ") -> Optional[str]:
|
{followthemoney-4.3.4.dist-info → followthemoney-4.5.1.dist-info}/METADATA
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: followthemoney
|
|
3
|
-
Version: 4.3.4
|
|
3
|
+
Version: 4.5.1
|
|
4
4
|
Summary: A data model for anti corruption data modeling and analysis.
|
|
5
5
|
Project-URL: Documentation, https://followthemoney.tech/
|
|
6
6
|
Project-URL: Repository, https://github.com/opensanctions/followthemoney.git
|
|
@@ -48,9 +48,9 @@ Requires-Dist: prefixdate<1.0.0,>=0.5.0
|
|
|
48
48
|
Requires-Dist: pydantic<3.0.0,>=2.11.0
|
|
49
49
|
Requires-Dist: pytz>=2021.1
|
|
50
50
|
Requires-Dist: pyyaml<7.0.0,>=5.0.0
|
|
51
|
-
Requires-Dist: rdflib<7.
|
|
51
|
+
Requires-Dist: rdflib<7.6.0,>=6.2.0
|
|
52
52
|
Requires-Dist: requests<3.0.0,>=2.21.0
|
|
53
|
-
Requires-Dist: rigour<2.0.0,>=1.
|
|
53
|
+
Requires-Dist: rigour<2.0.0,>=1.6.0
|
|
54
54
|
Requires-Dist: sqlalchemy[mypy]<3.0.0,>=2.0.0
|
|
55
55
|
Provides-Extra: dev
|
|
56
56
|
Requires-Dist: build; extra == 'dev'
|