followthemoney 3.8.5__py3-none-any.whl → 4.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- followthemoney/__init__.py +30 -10
- followthemoney/cli/cli.py +1 -1
- followthemoney/cli/exports.py +6 -2
- followthemoney/cli/statement.py +62 -0
- followthemoney/cli/util.py +2 -3
- followthemoney/compare.py +26 -16
- followthemoney/dataset/__init__.py +17 -0
- followthemoney/dataset/catalog.py +77 -0
- followthemoney/dataset/coverage.py +29 -0
- followthemoney/dataset/dataset.py +146 -0
- followthemoney/dataset/publisher.py +25 -0
- followthemoney/dataset/resource.py +30 -0
- followthemoney/dataset/util.py +55 -0
- followthemoney/entity.py +73 -0
- followthemoney/exc.py +6 -0
- followthemoney/export/rdf.py +57 -5
- followthemoney/graph.py +1 -2
- followthemoney/model.py +38 -11
- followthemoney/names.py +33 -0
- followthemoney/ontology.py +18 -16
- followthemoney/property.py +12 -15
- followthemoney/proxy.py +43 -64
- followthemoney/schema/Analyzable.yaml +2 -3
- followthemoney/schema/BankAccount.yaml +2 -3
- followthemoney/schema/Company.yaml +0 -6
- followthemoney/schema/Contract.yaml +0 -1
- followthemoney/schema/CryptoWallet.yaml +1 -1
- followthemoney/schema/Document.yaml +0 -6
- followthemoney/schema/Interval.yaml +7 -0
- followthemoney/schema/LegalEntity.yaml +6 -0
- followthemoney/schema/License.yaml +2 -0
- followthemoney/schema/Page.yaml +0 -1
- followthemoney/schema/Person.yaml +0 -5
- followthemoney/schema/Sanction.yaml +1 -0
- followthemoney/schema/Thing.yaml +0 -2
- followthemoney/schema/UserAccount.yaml +6 -3
- followthemoney/schema.py +30 -42
- followthemoney/statement/__init__.py +19 -0
- followthemoney/statement/entity.py +438 -0
- followthemoney/statement/serialize.py +251 -0
- followthemoney/statement/statement.py +256 -0
- followthemoney/statement/util.py +31 -0
- followthemoney/types/__init__.py +66 -23
- followthemoney/types/address.py +3 -3
- followthemoney/types/checksum.py +3 -7
- followthemoney/types/common.py +9 -14
- followthemoney/types/country.py +3 -7
- followthemoney/types/date.py +21 -11
- followthemoney/types/email.py +0 -4
- followthemoney/types/entity.py +5 -11
- followthemoney/types/gender.py +6 -10
- followthemoney/types/identifier.py +9 -3
- followthemoney/types/ip.py +5 -9
- followthemoney/types/json.py +2 -2
- followthemoney/types/language.py +3 -7
- followthemoney/types/mimetype.py +4 -8
- followthemoney/types/name.py +7 -8
- followthemoney/types/number.py +88 -6
- followthemoney/types/phone.py +4 -11
- followthemoney/types/string.py +4 -4
- followthemoney/types/topic.py +3 -7
- followthemoney/types/url.py +5 -10
- followthemoney/util.py +12 -13
- followthemoney/value.py +67 -0
- {followthemoney-3.8.5.dist-info → followthemoney-4.0.1.dist-info}/METADATA +23 -8
- {followthemoney-3.8.5.dist-info → followthemoney-4.0.1.dist-info}/RECORD +69 -59
- {followthemoney-3.8.5.dist-info → followthemoney-4.0.1.dist-info}/entry_points.txt +1 -0
- followthemoney/offshore.py +0 -48
- followthemoney/rdf.py +0 -9
- followthemoney/schema/Assessment.yaml +0 -32
- followthemoney/schema/Post.yaml +0 -42
- followthemoney/types/iban.py +0 -58
- followthemoney/types/registry.py +0 -52
- {followthemoney-3.8.5.dist-info → followthemoney-4.0.1.dist-info}/WHEEL +0 -0
- {followthemoney-3.8.5.dist-info → followthemoney-4.0.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,256 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import warnings
|
|
3
|
+
from sqlalchemy.engine import Row
|
|
4
|
+
from typing import cast
|
|
5
|
+
from typing import Any, Dict, Generator, Optional
|
|
6
|
+
from typing_extensions import TypedDict, Self
|
|
7
|
+
from rigour.time import datetime_iso, iso_datetime
|
|
8
|
+
from rigour.boolean import bool_text
|
|
9
|
+
|
|
10
|
+
from followthemoney.proxy import EntityProxy
|
|
11
|
+
from followthemoney.statement.util import get_prop_type, BASE_ID
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class StatementDict(TypedDict):
    # Plain-dictionary form of a statement: this is the shape returned by
    # `Statement.to_dict` and accepted by `Statement.from_dict`.

    # Statement identifier; `Statement.__init__` generates one when it is None.
    id: Optional[str]
    # ID of the entity the statement is about.
    entity_id: str
    # `Statement.__init__` falls back to `entity_id` when no canonical ID is given.
    canonical_id: str
    # Property name; the special `BASE_ID` property carries the entity ID itself.
    prop: str
    # Schema name; used together with `prop` to look up the property type.
    schema: str
    # The property value being asserted.
    value: str
    # Name of the dataset in which the statement was made.
    dataset: str
    # NOTE(review): presumably a language code for `value` - confirm.
    lang: Optional[str]
    # NOTE(review): presumably the raw value before cleaning - confirm.
    original_value: Optional[str]
    # External statements get a distinct ID suffix (see `Statement.make_key`).
    external: bool
    # When the statement was first observed (string form).
    first_seen: Optional[str]
    # When the statement was last observed (string form).
    last_seen: Optional[str]
    # NOTE(review): provenance marker; exact semantics not visible here - confirm.
    origin: Optional[str]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class Statement(object):
    """A single statement about a property relevant to an entity.

    For example, this could be used to say: "In dataset A, entity X has the
    property `name` set to 'John Smith'. I first observed this at K, and last
    saw it at L."

    Null property values are not supported. This might need to change if we
    want to support making property-less entities.
    """

    BASE = BASE_ID

    __slots__ = [
        "id",
        "entity_id",
        "canonical_id",
        "prop",
        "schema",
        "value",
        "dataset",
        "lang",
        "original_value",
        "external",
        "first_seen",
        "last_seen",
        "origin",
    ]

    def __init__(
        self,
        entity_id: str,
        prop: str,
        schema: str,
        value: str,
        dataset: str,
        lang: Optional[str] = None,
        original_value: Optional[str] = None,
        first_seen: Optional[str] = None,
        external: bool = False,
        id: Optional[str] = None,
        canonical_id: Optional[str] = None,
        last_seen: Optional[str] = None,
        origin: Optional[str] = None,
    ):
        """Create a statement.

        `canonical_id` falls back to `entity_id`, `last_seen` falls back to
        `first_seen`, and a missing `id` is derived via `generate_key()`.
        """
        self.entity_id = entity_id
        self.canonical_id = canonical_id or entity_id
        self.prop = prop
        self.schema = schema
        self.value = value
        self.dataset = dataset
        self.lang = lang
        self.original_value = original_value
        self.first_seen = first_seen
        self.last_seen = last_seen or first_seen
        self.external = external
        self.origin = origin
        # The key is computed last because it reads the fields assigned above.
        if id is None:
            id = self.generate_key()
        self.id = id

    @property
    def prop_type(self) -> str:
        """The type of the property, e.g. 'string', 'number', 'url'."""
        return get_prop_type(self.schema, self.prop)

    def to_dict(self) -> StatementDict:
        """Return all fields of the statement as a plain dictionary."""
        return {
            "canonical_id": self.canonical_id,
            "entity_id": self.entity_id,
            "prop": self.prop,
            "schema": self.schema,
            "value": self.value,
            "dataset": self.dataset,
            "lang": self.lang,
            "original_value": self.original_value,
            "first_seen": self.first_seen,
            "last_seen": self.last_seen,
            "external": self.external,
            "origin": self.origin,
            "id": self.id,
        }

    def to_csv_row(self) -> Dict[str, Optional[str]]:
        """Return the statement as a string-valued row.

        The `external` flag is rendered with `bool_text` and a `prop_type`
        column is added.
        """
        data = cast(Dict[str, Optional[str]], self.to_dict())
        data["external"] = bool_text(self.external)
        data["prop_type"] = self.prop_type
        return data

    def to_db_row(self) -> Dict[str, Any]:
        """Return the statement as a row for database storage.

        `first_seen` and `last_seen` are passed through `iso_datetime`
        (presumably parsing the ISO strings into datetime objects - confirm
        against rigour), and a `prop_type` column is added.
        """
        data = cast(Dict[str, Any], self.to_dict())
        data["first_seen"] = iso_datetime(self.first_seen)
        data["last_seen"] = iso_datetime(self.last_seen)
        data["prop_type"] = self.prop_type
        return data

    def __hash__(self) -> int:
        """Hash the statement by its ID, warning if the ID is missing."""
        if self.id is None:
            warnings.warn(
                "Hashing a statement without an ID results in undefined behaviour",
                RuntimeWarning,
            )
        return hash(self.id)

    def __repr__(self) -> str:
        return "<Statement(%r, %r, %r)>" % (self.entity_id, self.prop, self.value)

    def __eq__(self, other: Any) -> bool:
        """Two statements are equal when their IDs are equal.

        Objects without an `id` attribute are not comparable; returning
        `NotImplemented` lets Python fall back to its default handling
        instead of raising `AttributeError` (e.g. for `stmt == None`).
        """
        try:
            other_id = other.id
        except AttributeError:
            return NotImplemented
        return not self.id != other_id

    def __lt__(self, other: Any) -> bool:
        """Order statements so that the `BASE_ID` statement sorts first, then by ID.

        Returns `NotImplemented` for objects that lack `prop` or `id`.
        """
        try:
            other_key = (other.prop != BASE_ID, other.id or "")
        except AttributeError:
            return NotImplemented
        # False sorts before True, so statements on the BASE_ID prop come first.
        self_key = (self.prop != BASE_ID, self.id or "")
        return self_key < other_key

    def clone(self: Self) -> "Statement":
        """Make a copy of the given statement by round-tripping it through a dict."""
        return Statement.from_dict(self.to_dict())

    def generate_key(self) -> Optional[str]:
        """Compute the statement ID from this statement's own key fields."""
        return self.make_key(
            self.dataset,
            self.entity_id,
            self.prop,
            self.value,
            self.external,
        )

    @classmethod
    def make_key(
        cls,
        dataset: str,
        entity_id: str,
        prop: str,
        value: str,
        external: Optional[bool],
    ) -> Optional[str]:
        """Hash the key properties of a statement record to make a unique ID.

        Returns the SHA-1 hex digest of `dataset.entity_id.prop.value` (with a
        `.ext` suffix for external statements), or None if `prop` or `value`
        is None.
        """
        if prop is None or value is None:
            return None
        key = f"{dataset}.{entity_id}.{prop}.{value}"
        if external:
            # We consider the external flag in key composition to avoid race conditions
            # where a certain entity might be emitted as external while it is already
            # linked in to the graph via another route.
            key = f"{key}.ext"
        return hashlib.sha1(key.encode("utf-8")).hexdigest()

    @classmethod
    def from_dict(cls, data: StatementDict) -> "Statement":
        """Build a statement from its dictionary form.

        `entity_id`, `prop`, `schema`, `value` and `dataset` are required keys;
        all other keys are optional.
        """
        return cls(
            entity_id=data["entity_id"],
            prop=data["prop"],
            schema=data["schema"],
            value=data["value"],
            dataset=data["dataset"],
            lang=data.get("lang", None),
            original_value=data.get("original_value", None),
            first_seen=data.get("first_seen", None),
            external=data.get("external", False),
            id=data.get("id", None),
            canonical_id=data.get("canonical_id", None),
            last_seen=data.get("last_seen", None),
            origin=data.get("origin", None),
        )

    @classmethod
    def from_db_row(cls, row: Row[Any]) -> "Statement":
        """Build a statement from a database row.

        `first_seen` and `last_seen` are passed through `datetime_iso`
        (presumably the inverse of the conversion in `to_db_row` - confirm).
        """
        return cls(
            id=row.id,
            canonical_id=row.canonical_id,
            entity_id=row.entity_id,
            prop=row.prop,
            schema=row.schema,
            value=row.value,
            dataset=row.dataset,
            lang=row.lang,
            original_value=row.original_value,
            first_seen=datetime_iso(row.first_seen),
            external=row.external,
            last_seen=datetime_iso(row.last_seen),
            origin=row.origin,
        )

    @classmethod
    def from_entity(
        cls,
        entity: "EntityProxy",
        dataset: str,
        first_seen: Optional[str] = None,
        last_seen: Optional[str] = None,
        external: bool = False,
        origin: Optional[str] = None,
    ) -> Generator["Statement", None, None]:
        """Generate the statements that describe the given entity.

        A `StatementEntity` yields its existing statements unchanged; in that
        case the other arguments are not applied. For any other entity, one
        statement on the `BASE_ID` property is emitted, followed by one
        statement per property value.

        Raises:
            ValueError: if the entity has no ID.
        """
        # Imported here rather than at module level (presumably to avoid a
        # circular import with the statement entity module).
        from followthemoney.statement.entity import StatementEntity

        if entity.id is None:
            raise ValueError("Cannot create statements for entity without ID!")

        # If the entity is already a StatementEntity, we return its statements directly.
        if isinstance(entity, StatementEntity):
            yield from entity.statements
            return

        yield cls(
            entity_id=entity.id,
            prop=BASE_ID,
            schema=entity.schema.name,
            value=entity.id,
            dataset=dataset,
            external=external,
            first_seen=first_seen,
            last_seen=last_seen,
            origin=origin,
        )
        for prop, value in entity.itervalues():
            yield cls(
                entity_id=entity.id,
                prop=prop.name,
                schema=entity.schema.name,
                value=value,
                dataset=dataset,
                external=external,
                first_seen=first_seen,
                last_seen=last_seen,
                origin=origin,
            )
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from functools import cache
|
|
3
|
+
from typing import Tuple
|
|
4
|
+
|
|
5
|
+
from followthemoney.model import Model
|
|
6
|
+
|
|
7
|
+
BASE_ID = "id"
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def pack_prop(schema: str, prop: str) -> str:
    """Join a schema name and a property name into a `schema:prop` identifier."""
    return "{}:{}".format(schema, prop)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@cache
def get_prop_type(schema: str, prop: str) -> str:
    """Look up the type name of a property on a schema (cached).

    The `BASE_ID` property is not looked up in the model; its type name is
    `BASE_ID` itself.

    Raises:
        TypeError: if the schema or the property cannot be found.
    """
    if prop != BASE_ID:
        schema_obj = Model.instance().get(schema)
        if schema_obj is None:
            raise TypeError("Schema not found: %s" % schema)
        prop_obj = schema_obj.get(prop)
        if prop_obj is None:
            raise TypeError("Property not found: %s" % prop)
        return prop_obj.type.name
    return BASE_ID
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@cache
def unpack_prop(id: str) -> Tuple[str, str, str]:
    """Split a `schema:prop` identifier into `(schema, prop_type, prop)` (cached).

    The identifier is split on the first colon only. The schema and property
    names in the result are interned strings.
    """
    schema_name, prop_name = id.split(":", 1)
    type_name = get_prop_type(schema_name, prop_name)
    return (sys.intern(schema_name), type_name, sys.intern(prop_name))
|
followthemoney/types/__init__.py
CHANGED
|
@@ -1,4 +1,6 @@
|
|
|
1
|
-
from
|
|
1
|
+
from banal import ensure_list
|
|
2
|
+
from typing import Dict, Iterable, List, Set, cast
|
|
3
|
+
|
|
2
4
|
from followthemoney.types.url import UrlType
|
|
3
5
|
from followthemoney.types.name import NameType
|
|
4
6
|
from followthemoney.types.email import EmailType
|
|
@@ -11,7 +13,6 @@ from followthemoney.types.language import LanguageType
|
|
|
11
13
|
from followthemoney.types.mimetype import MimeType
|
|
12
14
|
from followthemoney.types.checksum import ChecksumType
|
|
13
15
|
from followthemoney.types.identifier import IdentifierType
|
|
14
|
-
from followthemoney.types.iban import IbanType
|
|
15
16
|
from followthemoney.types.entity import EntityType
|
|
16
17
|
from followthemoney.types.topic import TopicType
|
|
17
18
|
from followthemoney.types.gender import GenderType
|
|
@@ -22,27 +23,69 @@ from followthemoney.types.string import StringType
|
|
|
22
23
|
from followthemoney.types.number import NumberType
|
|
23
24
|
from followthemoney.types.common import PropertyType
|
|
24
25
|
|
|
26
|
+
|
|
27
|
+
class Registry(object):
    """Holds one instance of every property type known to the system.

    Look a type up here (by attribute, by `get()` or by subscript) and use the
    returned type object to clean, validate or format values of that type.
    """

    url = UrlType()
    name = NameType()
    email = EmailType()
    ip = IpType()
    address = AddressType()
    date = DateType()
    phone = PhoneType()
    country = CountryType()
    language = LanguageType()
    mimetype = MimeType()
    checksum = ChecksumType()
    identifier = IdentifierType()
    entity = EntityType()
    topic = TopicType()
    gender = GenderType()
    json = JsonType()
    text = TextType()
    html = HTMLType()
    string = StringType()
    number = NumberType()

    def __init__(self) -> None:
        """Index the class-level type instances by their flags and groups."""
        self.matchable: Set[PropertyType] = set()
        self.types: Set[PropertyType] = set()
        self.groups: Dict[str, PropertyType] = {}
        self.pivots: Set[PropertyType] = set()
        for attr in dir(self):
            candidate = getattr(self, attr)
            if isinstance(candidate, PropertyType):
                # Each type must be registered under its own machine name.
                assert candidate.name == attr
                self.types.add(candidate)
                if candidate.matchable:
                    self.matchable.add(candidate)
                if candidate.pivot:
                    self.pivots.add(candidate)
                if candidate.group is not None:
                    self.groups[candidate.group] = candidate

    def get(self, name: str) -> PropertyType:
        """Return the type object registered under `name`.

        A value that already is a `PropertyType` is handed back unchanged, so
        callers may pass either a name or a type. Attribute access such as
        ``registry.phone`` is equivalent.
        """
        if not isinstance(name, PropertyType):
            return cast(PropertyType, getattr(self, name))
        return name

    def get_types(self, names: Iterable[str]) -> List[PropertyType]:
        """Return the type objects for the given names, in order."""
        resolved = [self.get(item) for item in ensure_list(names)]
        return [found for found in resolved if found is not None]

    def __getitem__(self, name: str) -> PropertyType:
        """Subscript access to a type, e.g. ``registry["phone"]``."""
        return cast(PropertyType, getattr(self, name))
|
|
87
|
+
|
|
88
|
+
|
|
25
89
|
registry = Registry()
|
|
26
|
-
registry.add(UrlType)
|
|
27
|
-
registry.add(NameType)
|
|
28
|
-
registry.add(EmailType)
|
|
29
|
-
registry.add(IpType)
|
|
30
|
-
registry.add(AddressType)
|
|
31
|
-
registry.add(DateType)
|
|
32
|
-
registry.add(PhoneType)
|
|
33
|
-
registry.add(CountryType)
|
|
34
|
-
registry.add(LanguageType)
|
|
35
|
-
registry.add(MimeType)
|
|
36
|
-
registry.add(ChecksumType)
|
|
37
|
-
registry.add(IdentifierType)
|
|
38
|
-
registry.add(IbanType) # TODO: remove
|
|
39
|
-
registry.add(EntityType)
|
|
40
|
-
registry.add(TopicType)
|
|
41
|
-
registry.add(GenderType)
|
|
42
|
-
registry.add(JsonType)
|
|
43
|
-
registry.add(TextType)
|
|
44
|
-
registry.add(HTMLType)
|
|
45
|
-
registry.add(StringType)
|
|
46
|
-
registry.add(NumberType)
|
|
47
90
|
|
|
48
91
|
__all__ = ["PropertyType", "registry"]
|
followthemoney/types/address.py
CHANGED
|
@@ -7,7 +7,7 @@ from rigour.text.distance import levenshtein_similarity
|
|
|
7
7
|
|
|
8
8
|
from followthemoney.types.common import PropertyType
|
|
9
9
|
from followthemoney.util import defer as _
|
|
10
|
-
from followthemoney.util import dampen
|
|
10
|
+
from followthemoney.util import dampen, const
|
|
11
11
|
|
|
12
12
|
if TYPE_CHECKING:
|
|
13
13
|
from followthemoney.proxy import EntityProxy
|
|
@@ -21,8 +21,8 @@ class AddressType(PropertyType):
|
|
|
21
21
|
|
|
22
22
|
LINE_BREAKS = re.compile(r"(\r\n|\n|<BR/>|<BR>|\t|ESQ\.,|ESQ,|;)")
|
|
23
23
|
COMMATA = re.compile(r"(,\s?[,\.])")
|
|
24
|
-
name = "address"
|
|
25
|
-
group = "addresses"
|
|
24
|
+
name = const("address")
|
|
25
|
+
group = const("addresses")
|
|
26
26
|
label = _("Address")
|
|
27
27
|
plural = _("Addresses")
|
|
28
28
|
matchable = True
|
followthemoney/types/checksum.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
|
1
|
-
from followthemoney.rdf import URIRef, Identifier
|
|
2
1
|
from followthemoney.types.common import PropertyType
|
|
3
|
-
from followthemoney.util import defer as _
|
|
2
|
+
from followthemoney.util import const, defer as _
|
|
4
3
|
|
|
5
4
|
|
|
6
5
|
class ChecksumType(PropertyType):
|
|
@@ -13,13 +12,10 @@ class ChecksumType(PropertyType):
|
|
|
13
12
|
of this type are scrubbed when submitted via the normal API. Checksums can only
|
|
14
13
|
be defined by uploading a document to be ingested."""
|
|
15
14
|
|
|
16
|
-
name = "checksum"
|
|
17
|
-
group = "checksums"
|
|
15
|
+
name = const("checksum")
|
|
16
|
+
group = const("checksums")
|
|
18
17
|
label = _("Checksum")
|
|
19
18
|
plural = _("Checksums")
|
|
20
19
|
matchable = True
|
|
21
20
|
pivot = True
|
|
22
21
|
max_length = 40
|
|
23
|
-
|
|
24
|
-
def rdf(self, value: str) -> Identifier:
|
|
25
|
-
return URIRef(f"hash:{value}")
|
followthemoney/types/common.py
CHANGED
|
@@ -5,8 +5,8 @@ from banal import ensure_list
|
|
|
5
5
|
from normality import stringify
|
|
6
6
|
from typing import Any, Dict, Optional, Sequence, Callable, TYPE_CHECKING, TypedDict
|
|
7
7
|
|
|
8
|
-
from followthemoney.
|
|
9
|
-
from followthemoney.util import get_locale
|
|
8
|
+
from followthemoney.value import Value
|
|
9
|
+
from followthemoney.util import get_locale, const
|
|
10
10
|
from followthemoney.util import gettext, sanitize_text
|
|
11
11
|
|
|
12
12
|
if TYPE_CHECKING:
|
|
@@ -29,7 +29,7 @@ class PropertyTypeToDict(TypedDict, total=False):
|
|
|
29
29
|
class PropertyType(object):
|
|
30
30
|
"""Base class for all property types."""
|
|
31
31
|
|
|
32
|
-
name: str = "any"
|
|
32
|
+
name: str = const("any")
|
|
33
33
|
"""A machine-facing, variable safe name for the given type."""
|
|
34
34
|
|
|
35
35
|
group: Optional[str] = None
|
|
@@ -87,7 +87,7 @@ class PropertyType(object):
|
|
|
87
87
|
|
|
88
88
|
def clean(
|
|
89
89
|
self,
|
|
90
|
-
raw:
|
|
90
|
+
raw: Value,
|
|
91
91
|
fuzzy: bool = False,
|
|
92
92
|
format: Optional[str] = None,
|
|
93
93
|
proxy: Optional["EntityProxy"] = None,
|
|
@@ -165,11 +165,6 @@ class PropertyType(object):
|
|
|
165
165
|
be related to (e.g. using a country prefix on a phone number or IBAN)."""
|
|
166
166
|
return None
|
|
167
167
|
|
|
168
|
-
def rdf(self, value: str) -> Identifier:
|
|
169
|
-
"""Return an RDF term to represent the given value - either a string
|
|
170
|
-
literal, or a URI reference."""
|
|
171
|
-
return Literal(value)
|
|
172
|
-
|
|
173
168
|
def pick(self, values: Sequence[str]) -> Optional[str]:
|
|
174
169
|
"""Pick the best value to show to the user."""
|
|
175
170
|
raise NotImplementedError
|
|
@@ -178,7 +173,7 @@ class PropertyType(object):
|
|
|
178
173
|
"""Return an ID suitable to identify this entity as a typed node in a
|
|
179
174
|
graph representation of some FtM data. It's usually the same as the the
|
|
180
175
|
RDF form."""
|
|
181
|
-
return
|
|
176
|
+
return f"{self.name}:{value}"
|
|
182
177
|
|
|
183
178
|
def node_id_safe(self, value: Optional[str]) -> Optional[str]:
|
|
184
179
|
"""Wrapper for node_id to handle None values."""
|
|
@@ -186,7 +181,7 @@ class PropertyType(object):
|
|
|
186
181
|
return None
|
|
187
182
|
return self.node_id(value)
|
|
188
183
|
|
|
189
|
-
def caption(self, value: str
|
|
184
|
+
def caption(self, value: str, format: Optional[str] = None) -> str:
|
|
190
185
|
"""Return a label for the given property value. This is often the same as the
|
|
191
186
|
value, but for types like countries or languages, it would return the label,
|
|
192
187
|
while other values like phone numbers can be formatted to be nicer to read."""
|
|
@@ -253,19 +248,19 @@ class EnumType(PropertyType):
|
|
|
253
248
|
|
|
254
249
|
def clean_text(
|
|
255
250
|
self,
|
|
256
|
-
|
|
251
|
+
text: str,
|
|
257
252
|
fuzzy: bool = False,
|
|
258
253
|
format: Optional[str] = None,
|
|
259
254
|
proxy: Optional["EntityProxy"] = None,
|
|
260
255
|
) -> Optional[str]:
|
|
261
256
|
"""All code values are cleaned to be lowercase and trailing whitespace is
|
|
262
257
|
removed."""
|
|
263
|
-
code =
|
|
258
|
+
code = text.lower().strip()
|
|
264
259
|
if code not in self.codes:
|
|
265
260
|
return None
|
|
266
261
|
return code
|
|
267
262
|
|
|
268
|
-
def caption(self, value: str) -> str:
|
|
263
|
+
def caption(self, value: str, format: Optional[str] = None) -> str:
|
|
269
264
|
"""Given a code value, return the label that should be shown to a user."""
|
|
270
265
|
return self.names.get(value, value)
|
|
271
266
|
|
followthemoney/types/country.py
CHANGED
|
@@ -3,9 +3,8 @@ from typing import Optional, TYPE_CHECKING
|
|
|
3
3
|
from babel.core import Locale
|
|
4
4
|
from rigour.territories import get_territory, get_ftm_countries
|
|
5
5
|
|
|
6
|
-
from followthemoney.rdf import URIRef, Identifier
|
|
7
6
|
from followthemoney.types.common import EnumType, EnumValues
|
|
8
|
-
from followthemoney.util import defer as _
|
|
7
|
+
from followthemoney.util import const, defer as _
|
|
9
8
|
|
|
10
9
|
if TYPE_CHECKING:
|
|
11
10
|
from followthemoney.proxy import EntityProxy
|
|
@@ -17,8 +16,8 @@ class CountryType(EnumType):
|
|
|
17
16
|
a number of unusual and controversial designations (e.g. the Soviet Union,
|
|
18
17
|
Transnistria, Somaliland, Kosovo)."""
|
|
19
18
|
|
|
20
|
-
name = "country"
|
|
21
|
-
group = "countries"
|
|
19
|
+
name = const("country")
|
|
20
|
+
group = const("countries")
|
|
22
21
|
label = _("Country")
|
|
23
22
|
plural = _("Countries")
|
|
24
23
|
matchable = True
|
|
@@ -52,6 +51,3 @@ class CountryType(EnumType):
|
|
|
52
51
|
|
|
53
52
|
def country_hint(self, value: str) -> str:
|
|
54
53
|
return value
|
|
55
|
-
|
|
56
|
-
def rdf(self, value: str) -> Identifier:
|
|
57
|
-
return URIRef(f"iso-3166:{value}")
|
followthemoney/types/date.py
CHANGED
|
@@ -4,9 +4,8 @@ from typing import Optional, TYPE_CHECKING
|
|
|
4
4
|
from prefixdate import parse, parse_format, Precision
|
|
5
5
|
|
|
6
6
|
from followthemoney.types.common import PropertyType
|
|
7
|
-
from followthemoney.rdf import XSD, Literal, Identifier
|
|
8
7
|
from followthemoney.util import defer as _
|
|
9
|
-
from followthemoney.util import dampen
|
|
8
|
+
from followthemoney.util import dampen, const
|
|
10
9
|
|
|
11
10
|
if TYPE_CHECKING:
|
|
12
11
|
from followthemoney.proxy import EntityProxy
|
|
@@ -21,8 +20,8 @@ class DateType(PropertyType):
|
|
|
21
20
|
The timezone is always expected to be UTC and cannot be specified otherwise. There is
|
|
22
21
|
no support for calendar weeks (`2021-W7`) and date ranges (`2021-2024`)."""
|
|
23
22
|
|
|
24
|
-
name = "date"
|
|
25
|
-
group = "dates"
|
|
23
|
+
name = const("date")
|
|
24
|
+
group = const("dates")
|
|
26
25
|
label = _("Date")
|
|
27
26
|
plural = _("Dates")
|
|
28
27
|
matchable = True
|
|
@@ -57,18 +56,29 @@ class DateType(PropertyType):
|
|
|
57
56
|
prefix = os.path.commonprefix([left, right])
|
|
58
57
|
return dampen(4, 10, prefix)
|
|
59
58
|
|
|
60
|
-
def
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
59
|
+
def to_datetime(self, value: str) -> Optional[datetime]:
|
|
60
|
+
"""Convert a date string to a datetime object in UTC for handling in Python. This
|
|
61
|
+
will convert the unset fields beyond the prefix to the first possible value, e.g.
|
|
62
|
+
`2021-02` will become `2021-02-01T00:00:00Z`.
|
|
64
63
|
|
|
65
|
-
|
|
66
|
-
|
|
64
|
+
Args:
|
|
65
|
+
value (str): The date string to convert.
|
|
67
66
|
|
|
68
|
-
|
|
67
|
+
Returns:
|
|
68
|
+
Optional[datetime]: The parsed datetime object in UTC, or None if parsing fails.
|
|
69
|
+
"""
|
|
69
70
|
return parse(value).dt
|
|
70
71
|
|
|
71
72
|
def to_number(self, value: str) -> Optional[float]:
|
|
73
|
+
"""Convert a date string to a number, which is the number of seconds since the epoch
|
|
74
|
+
(1970-01-01T00:00:00Z).
|
|
75
|
+
|
|
76
|
+
Args:
|
|
77
|
+
value (str): The date string to convert.
|
|
78
|
+
|
|
79
|
+
Returns:
|
|
80
|
+
Optional[float]: The timestamp as a float, or None if parsing fails.
|
|
81
|
+
"""
|
|
72
82
|
date = self.to_datetime(value)
|
|
73
83
|
if date is None:
|
|
74
84
|
return None
|
followthemoney/types/email.py
CHANGED
|
@@ -4,7 +4,6 @@ from typing import Optional, TYPE_CHECKING
|
|
|
4
4
|
from urllib.parse import urlparse
|
|
5
5
|
from normality.cleaning import strip_quotes
|
|
6
6
|
|
|
7
|
-
from followthemoney.rdf import URIRef, Identifier
|
|
8
7
|
from followthemoney.types.common import PropertyType
|
|
9
8
|
from followthemoney.util import sanitize_text, defer as _
|
|
10
9
|
|
|
@@ -80,6 +79,3 @@ class EmailType(PropertyType):
|
|
|
80
79
|
|
|
81
80
|
# def country_hint(self, value)
|
|
82
81
|
# TODO: do we want to use TLDs as country evidence?
|
|
83
|
-
|
|
84
|
-
def rdf(self, value: str) -> Identifier:
|
|
85
|
-
return URIRef("mailto:%s" % value.lower())
|
followthemoney/types/entity.py
CHANGED
|
@@ -2,9 +2,9 @@ import re
|
|
|
2
2
|
from typing import Any, Optional, TYPE_CHECKING
|
|
3
3
|
|
|
4
4
|
from followthemoney.types.common import PropertyType
|
|
5
|
-
from followthemoney.
|
|
5
|
+
from followthemoney.value import Value
|
|
6
6
|
from followthemoney.util import ENTITY_ID_LEN, get_entity_id, sanitize_text
|
|
7
|
-
from followthemoney.util import gettext, defer as _
|
|
7
|
+
from followthemoney.util import const, gettext, defer as _
|
|
8
8
|
from followthemoney.exc import InvalidData
|
|
9
9
|
|
|
10
10
|
if TYPE_CHECKING:
|
|
@@ -22,8 +22,8 @@ class EntityType(PropertyType):
|
|
|
22
22
|
|
|
23
23
|
REGEX_RAW = r"^[0-9a-zA-Z]([0-9a-zA-Z\.\-]*[0-9a-zA-Z])?$"
|
|
24
24
|
REGEX = re.compile(REGEX_RAW)
|
|
25
|
-
name = "entity"
|
|
26
|
-
group = "entities"
|
|
25
|
+
name = const("entity")
|
|
26
|
+
group = const("entities")
|
|
27
27
|
label = _("Entity")
|
|
28
28
|
plural = _("Entities")
|
|
29
29
|
matchable = True
|
|
@@ -31,7 +31,7 @@ class EntityType(PropertyType):
|
|
|
31
31
|
max_length = ENTITY_ID_LEN
|
|
32
32
|
|
|
33
33
|
def validate(
|
|
34
|
-
self, value:
|
|
34
|
+
self, value: Value, fuzzy: bool = False, format: Optional[str] = None
|
|
35
35
|
) -> bool:
|
|
36
36
|
text = sanitize_text(value)
|
|
37
37
|
if text is None:
|
|
@@ -66,9 +66,3 @@ class EntityType(PropertyType):
|
|
|
66
66
|
if self.REGEX.match(text) is not None:
|
|
67
67
|
return text
|
|
68
68
|
return None
|
|
69
|
-
|
|
70
|
-
def rdf(self, value: str) -> Identifier:
|
|
71
|
-
return URIRef(f"entity:{value}")
|
|
72
|
-
|
|
73
|
-
def caption(self, value: str) -> None:
|
|
74
|
-
return None
|