followthemoney 3.8.4__py3-none-any.whl → 4.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- followthemoney/__init__.py +30 -10
- followthemoney/cli/__init__.py +3 -12
- followthemoney/cli/aggregate.py +1 -1
- followthemoney/cli/cli.py +1 -1
- followthemoney/cli/exports.py +6 -2
- followthemoney/cli/mapping.py +6 -4
- followthemoney/cli/sieve.py +1 -1
- followthemoney/cli/statement.py +62 -0
- followthemoney/cli/util.py +2 -3
- followthemoney/compare.py +26 -16
- followthemoney/dataset/__init__.py +17 -0
- followthemoney/dataset/catalog.py +77 -0
- followthemoney/dataset/coverage.py +29 -0
- followthemoney/dataset/dataset.py +137 -0
- followthemoney/dataset/publisher.py +25 -0
- followthemoney/dataset/resource.py +30 -0
- followthemoney/dataset/util.py +58 -0
- followthemoney/entity.py +73 -0
- followthemoney/exc.py +6 -0
- followthemoney/export/common.py +3 -3
- followthemoney/export/csv.py +10 -12
- followthemoney/export/neo4j.py +1 -1
- followthemoney/export/rdf.py +57 -5
- followthemoney/graph.py +6 -4
- followthemoney/mapping/csv.py +6 -18
- followthemoney/mapping/sql.py +3 -4
- followthemoney/model.py +36 -9
- followthemoney/namespace.py +3 -1
- followthemoney/ontology.py +18 -16
- followthemoney/property.py +12 -15
- followthemoney/proxy.py +44 -65
- followthemoney/schema/Analyzable.yaml +2 -3
- followthemoney/schema/BankAccount.yaml +2 -3
- followthemoney/schema/Company.yaml +0 -6
- followthemoney/schema/Contract.yaml +0 -1
- followthemoney/schema/CryptoWallet.yaml +1 -1
- followthemoney/schema/Document.yaml +0 -6
- followthemoney/schema/Interval.yaml +7 -0
- followthemoney/schema/LegalEntity.yaml +6 -0
- followthemoney/schema/License.yaml +2 -0
- followthemoney/schema/Page.yaml +0 -1
- followthemoney/schema/Person.yaml +0 -5
- followthemoney/schema/Sanction.yaml +1 -0
- followthemoney/schema/Thing.yaml +0 -2
- followthemoney/schema/UserAccount.yaml +6 -3
- followthemoney/schema.py +27 -39
- followthemoney/statement/__init__.py +19 -0
- followthemoney/statement/entity.py +437 -0
- followthemoney/statement/serialize.py +245 -0
- followthemoney/statement/statement.py +256 -0
- followthemoney/statement/util.py +31 -0
- followthemoney/types/__init__.py +66 -23
- followthemoney/types/address.py +3 -3
- followthemoney/types/checksum.py +3 -7
- followthemoney/types/common.py +9 -14
- followthemoney/types/country.py +3 -7
- followthemoney/types/date.py +21 -11
- followthemoney/types/email.py +0 -4
- followthemoney/types/entity.py +5 -11
- followthemoney/types/gender.py +6 -10
- followthemoney/types/identifier.py +9 -3
- followthemoney/types/ip.py +5 -9
- followthemoney/types/json.py +2 -2
- followthemoney/types/language.py +3 -7
- followthemoney/types/mimetype.py +4 -8
- followthemoney/types/name.py +7 -8
- followthemoney/types/number.py +88 -6
- followthemoney/types/phone.py +4 -11
- followthemoney/types/string.py +4 -4
- followthemoney/types/topic.py +3 -7
- followthemoney/types/url.py +5 -10
- followthemoney/util.py +12 -13
- followthemoney/value.py +67 -0
- {followthemoney-3.8.4.dist-info → followthemoney-4.0.0.dist-info}/METADATA +38 -34
- {followthemoney-3.8.4.dist-info → followthemoney-4.0.0.dist-info}/RECORD +78 -69
- {followthemoney-3.8.4.dist-info → followthemoney-4.0.0.dist-info}/entry_points.txt +1 -0
- {followthemoney-3.8.4.dist-info → followthemoney-4.0.0.dist-info}/licenses/LICENSE +1 -0
- followthemoney/offshore.py +0 -48
- followthemoney/rdf.py +0 -9
- followthemoney/schema/Assessment.yaml +0 -32
- followthemoney/schema/Post.yaml +0 -42
- followthemoney/types/iban.py +0 -58
- followthemoney/types/registry.py +0 -52
- {followthemoney-3.8.4.dist-info → followthemoney-4.0.0.dist-info}/WHEEL +0 -0
followthemoney/types/common.py
CHANGED
|
@@ -5,8 +5,8 @@ from banal import ensure_list
|
|
|
5
5
|
from normality import stringify
|
|
6
6
|
from typing import Any, Dict, Optional, Sequence, Callable, TYPE_CHECKING, TypedDict
|
|
7
7
|
|
|
8
|
-
from followthemoney.
|
|
9
|
-
from followthemoney.util import get_locale
|
|
8
|
+
from followthemoney.value import Value
|
|
9
|
+
from followthemoney.util import get_locale, const
|
|
10
10
|
from followthemoney.util import gettext, sanitize_text
|
|
11
11
|
|
|
12
12
|
if TYPE_CHECKING:
|
|
@@ -29,7 +29,7 @@ class PropertyTypeToDict(TypedDict, total=False):
|
|
|
29
29
|
class PropertyType(object):
|
|
30
30
|
"""Base class for all property types."""
|
|
31
31
|
|
|
32
|
-
name: str = "any"
|
|
32
|
+
name: str = const("any")
|
|
33
33
|
"""A machine-facing, variable safe name for the given type."""
|
|
34
34
|
|
|
35
35
|
group: Optional[str] = None
|
|
@@ -87,7 +87,7 @@ class PropertyType(object):
|
|
|
87
87
|
|
|
88
88
|
def clean(
|
|
89
89
|
self,
|
|
90
|
-
raw:
|
|
90
|
+
raw: Value,
|
|
91
91
|
fuzzy: bool = False,
|
|
92
92
|
format: Optional[str] = None,
|
|
93
93
|
proxy: Optional["EntityProxy"] = None,
|
|
@@ -165,11 +165,6 @@ class PropertyType(object):
|
|
|
165
165
|
be related to (e.g. using a country prefix on a phone number or IBAN)."""
|
|
166
166
|
return None
|
|
167
167
|
|
|
168
|
-
def rdf(self, value: str) -> Identifier:
|
|
169
|
-
"""Return an RDF term to represent the given value - either a string
|
|
170
|
-
literal, or a URI reference."""
|
|
171
|
-
return Literal(value)
|
|
172
|
-
|
|
173
168
|
def pick(self, values: Sequence[str]) -> Optional[str]:
|
|
174
169
|
"""Pick the best value to show to the user."""
|
|
175
170
|
raise NotImplementedError
|
|
@@ -178,7 +173,7 @@ class PropertyType(object):
|
|
|
178
173
|
"""Return an ID suitable to identify this entity as a typed node in a
|
|
179
174
|
graph representation of some FtM data. It's usually the same as the the
|
|
180
175
|
RDF form."""
|
|
181
|
-
return
|
|
176
|
+
return f"{self.name}:{value}"
|
|
182
177
|
|
|
183
178
|
def node_id_safe(self, value: Optional[str]) -> Optional[str]:
|
|
184
179
|
"""Wrapper for node_id to handle None values."""
|
|
@@ -186,7 +181,7 @@ class PropertyType(object):
|
|
|
186
181
|
return None
|
|
187
182
|
return self.node_id(value)
|
|
188
183
|
|
|
189
|
-
def caption(self, value: str
|
|
184
|
+
def caption(self, value: str, format: Optional[str] = None) -> str:
|
|
190
185
|
"""Return a label for the given property value. This is often the same as the
|
|
191
186
|
value, but for types like countries or languages, it would return the label,
|
|
192
187
|
while other values like phone numbers can be formatted to be nicer to read."""
|
|
@@ -253,19 +248,19 @@ class EnumType(PropertyType):
|
|
|
253
248
|
|
|
254
249
|
def clean_text(
|
|
255
250
|
self,
|
|
256
|
-
|
|
251
|
+
text: str,
|
|
257
252
|
fuzzy: bool = False,
|
|
258
253
|
format: Optional[str] = None,
|
|
259
254
|
proxy: Optional["EntityProxy"] = None,
|
|
260
255
|
) -> Optional[str]:
|
|
261
256
|
"""All code values are cleaned to be lowercase and trailing whitespace is
|
|
262
257
|
removed."""
|
|
263
|
-
code =
|
|
258
|
+
code = text.lower().strip()
|
|
264
259
|
if code not in self.codes:
|
|
265
260
|
return None
|
|
266
261
|
return code
|
|
267
262
|
|
|
268
|
-
def caption(self, value: str) -> str:
|
|
263
|
+
def caption(self, value: str, format: Optional[str] = None) -> str:
|
|
269
264
|
"""Given a code value, return the label that should be shown to a user."""
|
|
270
265
|
return self.names.get(value, value)
|
|
271
266
|
|
followthemoney/types/country.py
CHANGED
|
@@ -3,9 +3,8 @@ from typing import Optional, TYPE_CHECKING
|
|
|
3
3
|
from babel.core import Locale
|
|
4
4
|
from rigour.territories import get_territory, get_ftm_countries
|
|
5
5
|
|
|
6
|
-
from followthemoney.rdf import URIRef, Identifier
|
|
7
6
|
from followthemoney.types.common import EnumType, EnumValues
|
|
8
|
-
from followthemoney.util import defer as _
|
|
7
|
+
from followthemoney.util import const, defer as _
|
|
9
8
|
|
|
10
9
|
if TYPE_CHECKING:
|
|
11
10
|
from followthemoney.proxy import EntityProxy
|
|
@@ -17,8 +16,8 @@ class CountryType(EnumType):
|
|
|
17
16
|
a number of unusual and controversial designations (e.g. the Soviet Union,
|
|
18
17
|
Transnistria, Somaliland, Kosovo)."""
|
|
19
18
|
|
|
20
|
-
name = "country"
|
|
21
|
-
group = "countries"
|
|
19
|
+
name = const("country")
|
|
20
|
+
group = const("countries")
|
|
22
21
|
label = _("Country")
|
|
23
22
|
plural = _("Countries")
|
|
24
23
|
matchable = True
|
|
@@ -52,6 +51,3 @@ class CountryType(EnumType):
|
|
|
52
51
|
|
|
53
52
|
def country_hint(self, value: str) -> str:
|
|
54
53
|
return value
|
|
55
|
-
|
|
56
|
-
def rdf(self, value: str) -> Identifier:
|
|
57
|
-
return URIRef(f"iso-3166:{value}")
|
followthemoney/types/date.py
CHANGED
|
@@ -4,9 +4,8 @@ from typing import Optional, TYPE_CHECKING
|
|
|
4
4
|
from prefixdate import parse, parse_format, Precision
|
|
5
5
|
|
|
6
6
|
from followthemoney.types.common import PropertyType
|
|
7
|
-
from followthemoney.rdf import XSD, Literal, Identifier
|
|
8
7
|
from followthemoney.util import defer as _
|
|
9
|
-
from followthemoney.util import dampen
|
|
8
|
+
from followthemoney.util import dampen, const
|
|
10
9
|
|
|
11
10
|
if TYPE_CHECKING:
|
|
12
11
|
from followthemoney.proxy import EntityProxy
|
|
@@ -21,8 +20,8 @@ class DateType(PropertyType):
|
|
|
21
20
|
The timezone is always expected to be UTC and cannot be specified otherwise. There is
|
|
22
21
|
no support for calendar weeks (`2021-W7`) and date ranges (`2021-2024`)."""
|
|
23
22
|
|
|
24
|
-
name = "date"
|
|
25
|
-
group = "dates"
|
|
23
|
+
name = const("date")
|
|
24
|
+
group = const("dates")
|
|
26
25
|
label = _("Date")
|
|
27
26
|
plural = _("Dates")
|
|
28
27
|
matchable = True
|
|
@@ -57,18 +56,29 @@ class DateType(PropertyType):
|
|
|
57
56
|
prefix = os.path.commonprefix([left, right])
|
|
58
57
|
return dampen(4, 10, prefix)
|
|
59
58
|
|
|
60
|
-
def
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
59
|
+
def to_datetime(self, value: str) -> Optional[datetime]:
|
|
60
|
+
"""Convert a date string to a datetime object in UTC for handling in Python. This
|
|
61
|
+
will convert the unset fields beyond the prefix to the first possible value, e.g.
|
|
62
|
+
`2021-02` will become `2021-02-01T00:00:00Z`.
|
|
64
63
|
|
|
65
|
-
|
|
66
|
-
|
|
64
|
+
Args:
|
|
65
|
+
value (str): The date string to convert.
|
|
67
66
|
|
|
68
|
-
|
|
67
|
+
Returns:
|
|
68
|
+
Optional[datetime]: The parsed datetime object in UTC, or None if parsing fails.
|
|
69
|
+
"""
|
|
69
70
|
return parse(value).dt
|
|
70
71
|
|
|
71
72
|
def to_number(self, value: str) -> Optional[float]:
|
|
73
|
+
"""Convert a date string to a number, which is the number of seconds since the epoch
|
|
74
|
+
(1970-01-01T00:00:00Z).
|
|
75
|
+
|
|
76
|
+
Args:
|
|
77
|
+
value (str): The date string to convert.
|
|
78
|
+
|
|
79
|
+
Returns:
|
|
80
|
+
Optional[float]: The timestamp as a float, or None if parsing fails.
|
|
81
|
+
"""
|
|
72
82
|
date = self.to_datetime(value)
|
|
73
83
|
if date is None:
|
|
74
84
|
return None
|
followthemoney/types/email.py
CHANGED
|
@@ -4,7 +4,6 @@ from typing import Optional, TYPE_CHECKING
|
|
|
4
4
|
from urllib.parse import urlparse
|
|
5
5
|
from normality.cleaning import strip_quotes
|
|
6
6
|
|
|
7
|
-
from followthemoney.rdf import URIRef, Identifier
|
|
8
7
|
from followthemoney.types.common import PropertyType
|
|
9
8
|
from followthemoney.util import sanitize_text, defer as _
|
|
10
9
|
|
|
@@ -80,6 +79,3 @@ class EmailType(PropertyType):
|
|
|
80
79
|
|
|
81
80
|
# def country_hint(self, value)
|
|
82
81
|
# TODO: do we want to use TLDs as country evidence?
|
|
83
|
-
|
|
84
|
-
def rdf(self, value: str) -> Identifier:
|
|
85
|
-
return URIRef("mailto:%s" % value.lower())
|
followthemoney/types/entity.py
CHANGED
|
@@ -2,9 +2,9 @@ import re
|
|
|
2
2
|
from typing import Any, Optional, TYPE_CHECKING
|
|
3
3
|
|
|
4
4
|
from followthemoney.types.common import PropertyType
|
|
5
|
-
from followthemoney.
|
|
5
|
+
from followthemoney.value import Value
|
|
6
6
|
from followthemoney.util import ENTITY_ID_LEN, get_entity_id, sanitize_text
|
|
7
|
-
from followthemoney.util import gettext, defer as _
|
|
7
|
+
from followthemoney.util import const, gettext, defer as _
|
|
8
8
|
from followthemoney.exc import InvalidData
|
|
9
9
|
|
|
10
10
|
if TYPE_CHECKING:
|
|
@@ -22,8 +22,8 @@ class EntityType(PropertyType):
|
|
|
22
22
|
|
|
23
23
|
REGEX_RAW = r"^[0-9a-zA-Z]([0-9a-zA-Z\.\-]*[0-9a-zA-Z])?$"
|
|
24
24
|
REGEX = re.compile(REGEX_RAW)
|
|
25
|
-
name = "entity"
|
|
26
|
-
group = "entities"
|
|
25
|
+
name = const("entity")
|
|
26
|
+
group = const("entities")
|
|
27
27
|
label = _("Entity")
|
|
28
28
|
plural = _("Entities")
|
|
29
29
|
matchable = True
|
|
@@ -31,7 +31,7 @@ class EntityType(PropertyType):
|
|
|
31
31
|
max_length = ENTITY_ID_LEN
|
|
32
32
|
|
|
33
33
|
def validate(
|
|
34
|
-
self, value:
|
|
34
|
+
self, value: Value, fuzzy: bool = False, format: Optional[str] = None
|
|
35
35
|
) -> bool:
|
|
36
36
|
text = sanitize_text(value)
|
|
37
37
|
if text is None:
|
|
@@ -66,9 +66,3 @@ class EntityType(PropertyType):
|
|
|
66
66
|
if self.REGEX.match(text) is not None:
|
|
67
67
|
return text
|
|
68
68
|
return None
|
|
69
|
-
|
|
70
|
-
def rdf(self, value: str) -> Identifier:
|
|
71
|
-
return URIRef(f"entity:{value}")
|
|
72
|
-
|
|
73
|
-
def caption(self, value: str) -> None:
|
|
74
|
-
return None
|
followthemoney/types/gender.py
CHANGED
|
@@ -2,8 +2,7 @@ from typing import Optional, TYPE_CHECKING
|
|
|
2
2
|
from babel.core import Locale
|
|
3
3
|
|
|
4
4
|
from followthemoney.types.common import EnumType, EnumValues
|
|
5
|
-
from followthemoney.
|
|
6
|
-
from followthemoney.util import gettext, defer as _
|
|
5
|
+
from followthemoney.util import const, gettext, defer as _
|
|
7
6
|
|
|
8
7
|
if TYPE_CHECKING:
|
|
9
8
|
from followthemoney.proxy import EntityProxy
|
|
@@ -15,9 +14,9 @@ class GenderType(EnumType):
|
|
|
15
14
|
government databases and represent it in a way that can be used by
|
|
16
15
|
structured tools. I'm not sure this justifies the simplification."""
|
|
17
16
|
|
|
18
|
-
MALE = "male"
|
|
19
|
-
FEMALE = "female"
|
|
20
|
-
OTHER = "other"
|
|
17
|
+
MALE = const("male")
|
|
18
|
+
FEMALE = const("female")
|
|
19
|
+
OTHER = const("other")
|
|
21
20
|
|
|
22
21
|
LOOKUP = {
|
|
23
22
|
"m": MALE,
|
|
@@ -35,8 +34,8 @@ class GenderType(EnumType):
|
|
|
35
34
|
"divers": OTHER,
|
|
36
35
|
}
|
|
37
36
|
|
|
38
|
-
name = "gender"
|
|
39
|
-
group = "genders"
|
|
37
|
+
name = const("gender")
|
|
38
|
+
group = const("genders")
|
|
40
39
|
label = _("Gender")
|
|
41
40
|
plural = _("Genders")
|
|
42
41
|
matchable = False
|
|
@@ -61,6 +60,3 @@ class GenderType(EnumType):
|
|
|
61
60
|
if code not in self.codes:
|
|
62
61
|
return None
|
|
63
62
|
return code
|
|
64
|
-
|
|
65
|
-
def rdf(self, value: str) -> Identifier:
|
|
66
|
-
return URIRef(f"gender:{value}")
|
|
@@ -4,7 +4,7 @@ from rigour.ids import get_identifier_format_names, get_identifier_format
|
|
|
4
4
|
|
|
5
5
|
from followthemoney.types.common import PropertyType
|
|
6
6
|
from followthemoney.util import dampen, shortest, longest
|
|
7
|
-
from followthemoney.util import defer as _
|
|
7
|
+
from followthemoney.util import const, defer as _
|
|
8
8
|
|
|
9
9
|
if TYPE_CHECKING:
|
|
10
10
|
from followthemoney.proxy import EntityProxy
|
|
@@ -20,8 +20,8 @@ class IdentifierType(PropertyType):
|
|
|
20
20
|
Four- or five-digit industry classifiers create more noise than value."""
|
|
21
21
|
|
|
22
22
|
COMPARE_CLEAN = re.compile(r"[\W_]+")
|
|
23
|
-
name = "identifier"
|
|
24
|
-
group = "identifiers"
|
|
23
|
+
name = const("identifier")
|
|
24
|
+
group = const("identifiers")
|
|
25
25
|
label = _("Identifier")
|
|
26
26
|
plural = _("Identifiers")
|
|
27
27
|
matchable = True
|
|
@@ -59,3 +59,9 @@ class IdentifierType(PropertyType):
|
|
|
59
59
|
|
|
60
60
|
def node_id(self, value: str) -> str:
|
|
61
61
|
return f"id:{value}"
|
|
62
|
+
|
|
63
|
+
def caption(self, value: str, format: Optional[str] = None) -> str:
|
|
64
|
+
if format in get_identifier_format_names():
|
|
65
|
+
format_ = get_identifier_format(format)
|
|
66
|
+
return format_.format(value)
|
|
67
|
+
return value
|
followthemoney/types/ip.py
CHANGED
|
@@ -2,8 +2,7 @@ from typing import Optional, TYPE_CHECKING
|
|
|
2
2
|
from ipaddress import ip_address
|
|
3
3
|
|
|
4
4
|
from followthemoney.types.common import PropertyType
|
|
5
|
-
from followthemoney.
|
|
6
|
-
from followthemoney.util import defer as _
|
|
5
|
+
from followthemoney.util import const, defer as _
|
|
7
6
|
|
|
8
7
|
if TYPE_CHECKING:
|
|
9
8
|
from followthemoney.proxy import EntityProxy
|
|
@@ -14,10 +13,10 @@ class IpType(PropertyType):
|
|
|
14
13
|
by the protocol versions 4 (e.g. `192.168.1.143`) and 6
|
|
15
14
|
(e.g. `0:0:0:0:0:ffff:c0a8:18f`)."""
|
|
16
15
|
|
|
17
|
-
name = "ip"
|
|
18
|
-
group = "ips"
|
|
19
|
-
label = _("IP
|
|
20
|
-
plural = _("IP
|
|
16
|
+
name = const("ip")
|
|
17
|
+
group = const("ips")
|
|
18
|
+
label = _("IP Address")
|
|
19
|
+
plural = _("IP Addresses")
|
|
21
20
|
matchable = True
|
|
22
21
|
pivot = True
|
|
23
22
|
max_length = 64
|
|
@@ -45,6 +44,3 @@ class IpType(PropertyType):
|
|
|
45
44
|
return str(ip_address(text))
|
|
46
45
|
except ValueError:
|
|
47
46
|
return None
|
|
48
|
-
|
|
49
|
-
def rdf(self, value: str) -> Identifier:
|
|
50
|
-
return URIRef(f"ip:{value}")
|
followthemoney/types/json.py
CHANGED
|
@@ -3,7 +3,7 @@ from typing import Any, Optional, Sequence, TYPE_CHECKING
|
|
|
3
3
|
from banal import ensure_list
|
|
4
4
|
|
|
5
5
|
from followthemoney.types.common import PropertyType
|
|
6
|
-
from followthemoney.util import sanitize_text, defer as _
|
|
6
|
+
from followthemoney.util import const, sanitize_text, defer as _
|
|
7
7
|
|
|
8
8
|
if TYPE_CHECKING:
|
|
9
9
|
from followthemoney.proxy import EntityProxy
|
|
@@ -14,7 +14,7 @@ class JsonType(PropertyType):
|
|
|
14
14
|
and some other edge cases. It's a really bad idea and we should try to get rid
|
|
15
15
|
of JSON properties."""
|
|
16
16
|
|
|
17
|
-
name = "json"
|
|
17
|
+
name = const("json")
|
|
18
18
|
group = None
|
|
19
19
|
label = _("Nested data")
|
|
20
20
|
plural = _("Nested data")
|
followthemoney/types/language.py
CHANGED
|
@@ -3,9 +3,8 @@ from babel.core import Locale
|
|
|
3
3
|
from rigour.langs import iso_639_alpha3
|
|
4
4
|
|
|
5
5
|
from followthemoney.types.common import EnumType, EnumValues
|
|
6
|
-
from followthemoney.rdf import URIRef, Identifier
|
|
7
6
|
from followthemoney.util import defer as _, gettext
|
|
8
|
-
from followthemoney.util import get_env_list
|
|
7
|
+
from followthemoney.util import const, get_env_list
|
|
9
8
|
|
|
10
9
|
if TYPE_CHECKING:
|
|
11
10
|
from followthemoney.proxy import EntityProxy
|
|
@@ -17,8 +16,8 @@ class LanguageType(EnumType):
|
|
|
17
16
|
for additional languages once there is a specific need for them to be
|
|
18
17
|
supported."""
|
|
19
18
|
|
|
20
|
-
name = "language"
|
|
21
|
-
group = "languages"
|
|
19
|
+
name = const("language")
|
|
20
|
+
group = const("languages")
|
|
22
21
|
label = _("Language")
|
|
23
22
|
plural = _("Languages")
|
|
24
23
|
matchable = False
|
|
@@ -120,6 +119,3 @@ class LanguageType(EnumType):
|
|
|
120
119
|
if code not in self.LANGUAGES:
|
|
121
120
|
return None
|
|
122
121
|
return code
|
|
123
|
-
|
|
124
|
-
def rdf(self, value: str) -> Identifier:
|
|
125
|
-
return URIRef(f"iso-639:{value}")
|
followthemoney/types/mimetype.py
CHANGED
|
@@ -3,8 +3,7 @@ from rigour.mime import normalize_mimetype, parse_mimetype
|
|
|
3
3
|
from rigour.mime import DEFAULT
|
|
4
4
|
|
|
5
5
|
from followthemoney.types.common import PropertyType
|
|
6
|
-
from followthemoney.
|
|
7
|
-
from followthemoney.util import defer as _
|
|
6
|
+
from followthemoney.util import const, defer as _
|
|
8
7
|
|
|
9
8
|
if TYPE_CHECKING:
|
|
10
9
|
from followthemoney.proxy import EntityProxy
|
|
@@ -19,8 +18,8 @@ class MimeType(PropertyType):
|
|
|
19
18
|
MIME type properties do not contain parameters as used in HTTP headers,
|
|
20
19
|
like `charset=UTF-8`."""
|
|
21
20
|
|
|
22
|
-
name = "mimetype"
|
|
23
|
-
group = "mimetypes"
|
|
21
|
+
name = const("mimetype")
|
|
22
|
+
group = const("mimetypes")
|
|
24
23
|
label = _("MIME-Type")
|
|
25
24
|
plural = _("MIME-Types")
|
|
26
25
|
matchable = False
|
|
@@ -37,8 +36,5 @@ class MimeType(PropertyType):
|
|
|
37
36
|
return text
|
|
38
37
|
return None
|
|
39
38
|
|
|
40
|
-
def
|
|
41
|
-
return URIRef(f"urn:mimetype:{value}")
|
|
42
|
-
|
|
43
|
-
def caption(self, value: str) -> str:
|
|
39
|
+
def caption(self, value: str, format: Optional[str] = None) -> str:
|
|
44
40
|
return parse_mimetype(value).label or value
|
followthemoney/types/name.py
CHANGED
|
@@ -2,13 +2,12 @@ from typing import TYPE_CHECKING, Optional, Sequence
|
|
|
2
2
|
from normality import slugify
|
|
3
3
|
from normality.cleaning import collapse_spaces, strip_quotes
|
|
4
4
|
from rigour.env import MAX_NAME_LENGTH
|
|
5
|
-
from rigour.names import pick_name
|
|
5
|
+
from rigour.names import pick_name, tokenize_name
|
|
6
6
|
from rigour.text.distance import levenshtein_similarity
|
|
7
|
-
from fingerprints.cleanup import clean_name_light
|
|
8
7
|
|
|
9
8
|
from followthemoney.types.common import PropertyType
|
|
10
9
|
from followthemoney.util import dampen
|
|
11
|
-
from followthemoney.util import defer as _
|
|
10
|
+
from followthemoney.util import const, defer as _
|
|
12
11
|
|
|
13
12
|
if TYPE_CHECKING:
|
|
14
13
|
from followthemoney.proxy import EntityProxy
|
|
@@ -22,8 +21,8 @@ class NameType(PropertyType):
|
|
|
22
21
|
No validation rules apply, and things having multiple names must be considered
|
|
23
22
|
a perfectly ordinary case."""
|
|
24
23
|
|
|
25
|
-
name = "name"
|
|
26
|
-
group = "names"
|
|
24
|
+
name = const("name")
|
|
25
|
+
group = const("names")
|
|
27
26
|
label = _("Name")
|
|
28
27
|
plural = _("Names")
|
|
29
28
|
matchable = True
|
|
@@ -51,9 +50,9 @@ class NameType(PropertyType):
|
|
|
51
50
|
|
|
52
51
|
def compare(self, left: str, right: str) -> float:
|
|
53
52
|
"""Compare two names for similarity."""
|
|
54
|
-
left_clean =
|
|
55
|
-
right_clean =
|
|
56
|
-
if left_clean
|
|
53
|
+
left_clean = " ".join(tokenize_name(left.lower()))
|
|
54
|
+
right_clean = " ".join(tokenize_name(right.lower()))
|
|
55
|
+
if not len(left_clean) or not len(right_clean):
|
|
57
56
|
return 0.0
|
|
58
57
|
return levenshtein_similarity(
|
|
59
58
|
left_clean,
|
followthemoney/types/number.py
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import re
|
|
2
|
-
from typing import Optional
|
|
2
|
+
from typing import Optional, Tuple
|
|
3
3
|
|
|
4
4
|
from followthemoney.types.common import PropertyType
|
|
5
|
-
from followthemoney.util import defer as _
|
|
5
|
+
from followthemoney.util import const, defer as _
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
class NumberType(PropertyType):
|
|
@@ -13,8 +13,18 @@ class NumberType(PropertyType):
|
|
|
13
13
|
In the future we might want to enable annotations for format, units, or
|
|
14
14
|
even to introduce a separate property type for monetary values."""
|
|
15
15
|
|
|
16
|
-
|
|
17
|
-
|
|
16
|
+
DECIMAL = "."
|
|
17
|
+
SEPARATOR = ","
|
|
18
|
+
PRECISION = 2
|
|
19
|
+
|
|
20
|
+
_NUM_UNIT_RE = (
|
|
21
|
+
f"(\\s?\\-?\\s?\\d+(?:{re.escape(DECIMAL)}\\d+)?)\\s*([^\\s\\d][^\\s]*)?"
|
|
22
|
+
)
|
|
23
|
+
NUM_UNIT_RE = re.compile(_NUM_UNIT_RE, re.UNICODE)
|
|
24
|
+
_FLOAT_FMT = "{:" + SEPARATOR + "." + str(PRECISION) + "f}"
|
|
25
|
+
_INT_FMT = "{:" + SEPARATOR + "d}"
|
|
26
|
+
|
|
27
|
+
name = const("number")
|
|
18
28
|
label = _("Number")
|
|
19
29
|
plural = _("Numbers")
|
|
20
30
|
matchable = False
|
|
@@ -22,9 +32,81 @@ class NumberType(PropertyType):
|
|
|
22
32
|
def node_id(self, value: str) -> None:
|
|
23
33
|
return None
|
|
24
34
|
|
|
35
|
+
def parse(
|
|
36
|
+
self, value: str, decimal: str = DECIMAL, separator: str = SEPARATOR
|
|
37
|
+
) -> Tuple[Optional[str], Optional[str]]:
|
|
38
|
+
"""Parse a number into a numeric value and a unit. The numeric value is
|
|
39
|
+
aligned with the decimal and separator settings. The unit is stripped of
|
|
40
|
+
whitespace and returned as a string. If no unit is found, None is
|
|
41
|
+
returned. If no number is found, None is returned for both values.
|
|
42
|
+
|
|
43
|
+
Args:
|
|
44
|
+
value (str): The string to parse.
|
|
45
|
+
decimal (str): The character used as the decimal separator.
|
|
46
|
+
separator (str): The character used to separate thousands, lakhs, or crores.
|
|
47
|
+
|
|
48
|
+
Returns:
|
|
49
|
+
A tuple of (number, unit), where number is a string and unit is a string or None.
|
|
50
|
+
"""
|
|
51
|
+
value = value.replace(separator, "")
|
|
52
|
+
if decimal != self.DECIMAL:
|
|
53
|
+
value = value.replace(decimal, self.DECIMAL)
|
|
54
|
+
match = self.NUM_UNIT_RE.match(value)
|
|
55
|
+
if not match:
|
|
56
|
+
return None, None
|
|
57
|
+
number, unit = match.groups()
|
|
58
|
+
if unit is not None:
|
|
59
|
+
unit = unit.strip()
|
|
60
|
+
if len(unit) == 0:
|
|
61
|
+
unit = None
|
|
62
|
+
# TODO: We could have a lookup table for common units, e.g. kg, m, etc. to
|
|
63
|
+
# convert them to a standard form.
|
|
64
|
+
number = number.replace(" ", "")
|
|
65
|
+
if number == "":
|
|
66
|
+
number = None
|
|
67
|
+
return number, unit
|
|
68
|
+
|
|
25
69
|
def to_number(self, value: str) -> Optional[float]:
|
|
70
|
+
"""Convert a number string to a float. The string is parsed and the unit is
|
|
71
|
+
discarded if present.
|
|
72
|
+
|
|
73
|
+
Args:
|
|
74
|
+
value (str): The string to convert.
|
|
75
|
+
|
|
76
|
+
Returns:
|
|
77
|
+
Optional[float]: The parsed float value, or None if parsing fails.
|
|
78
|
+
"""
|
|
26
79
|
try:
|
|
27
|
-
|
|
28
|
-
|
|
80
|
+
number, _ = self.parse(value)
|
|
81
|
+
if number is None:
|
|
82
|
+
return None
|
|
83
|
+
return float(number)
|
|
29
84
|
except Exception:
|
|
30
85
|
return None
|
|
86
|
+
|
|
87
|
+
def caption(self, value: str, format: Optional[str] = None) -> str:
|
|
88
|
+
"""Return a caption for the number. This is used for display purposes.
|
|
89
|
+
|
|
90
|
+
Args:
|
|
91
|
+
value (str): The string to format.
|
|
92
|
+
format (Optional[str]): An optional format string to use for formatting the number.
|
|
93
|
+
|
|
94
|
+
Returns:
|
|
95
|
+
str: The formatted number string, possibly with a unit.
|
|
96
|
+
"""
|
|
97
|
+
number, unit = self.parse(value)
|
|
98
|
+
if number is None:
|
|
99
|
+
return value
|
|
100
|
+
try:
|
|
101
|
+
fnumber = float(number)
|
|
102
|
+
except ValueError:
|
|
103
|
+
return value
|
|
104
|
+
if format is not None:
|
|
105
|
+
number = format.format(fnumber)
|
|
106
|
+
elif fnumber.is_integer():
|
|
107
|
+
number = self._INT_FMT.format(int(fnumber))
|
|
108
|
+
else:
|
|
109
|
+
number = self._FLOAT_FMT.format(fnumber)
|
|
110
|
+
if unit is not None:
|
|
111
|
+
return f"{number} {unit}"
|
|
112
|
+
return number
|
followthemoney/types/phone.py
CHANGED
|
@@ -5,9 +5,8 @@ from phonenumbers import PhoneNumber, PhoneNumberFormat
|
|
|
5
5
|
from phonenumbers.phonenumberutil import region_code_for_number, NumberParseException
|
|
6
6
|
|
|
7
7
|
from followthemoney.types.common import PropertyType
|
|
8
|
-
from followthemoney.rdf import URIRef, Identifier
|
|
9
8
|
from followthemoney.util import defer as _
|
|
10
|
-
from followthemoney.util import dampen
|
|
9
|
+
from followthemoney.util import const, dampen
|
|
11
10
|
|
|
12
11
|
if TYPE_CHECKING:
|
|
13
12
|
from followthemoney.proxy import EntityProxy
|
|
@@ -30,8 +29,8 @@ class PhoneType(PropertyType):
|
|
|
30
29
|
validation outcome from doing the two operations the other way around. Always
|
|
31
30
|
define the country first."""
|
|
32
31
|
|
|
33
|
-
name = "phone"
|
|
34
|
-
group = "phones"
|
|
32
|
+
name = const("phone")
|
|
33
|
+
group = const("phones")
|
|
35
34
|
label = _("Phone number")
|
|
36
35
|
plural = _("Phone numbers")
|
|
37
36
|
matchable = True
|
|
@@ -97,16 +96,10 @@ class PhoneType(PropertyType):
|
|
|
97
96
|
# TODO: insert artificial intelligence here.
|
|
98
97
|
return dampen(7, 11, value)
|
|
99
98
|
|
|
100
|
-
def rdf(self, value: str) -> Identifier:
|
|
101
|
-
node_id = self.node_id(value)
|
|
102
|
-
if node_id is not None:
|
|
103
|
-
return URIRef(node_id)
|
|
104
|
-
raise ValueError("Invalid phone number for serialisation: %s" % value)
|
|
105
|
-
|
|
106
99
|
def node_id(self, value: str) -> Optional[str]:
|
|
107
100
|
return f"tel:{value}"
|
|
108
101
|
|
|
109
|
-
def caption(self, value: str) -> str:
|
|
102
|
+
def caption(self, value: str, format: Optional[str] = None) -> str:
|
|
110
103
|
try:
|
|
111
104
|
number = parse_number(value)
|
|
112
105
|
formatted = format_number(number, PhoneNumberFormat.INTERNATIONAL)
|
followthemoney/types/string.py
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
from followthemoney.types.common import PropertyType
|
|
2
|
-
from followthemoney.util import defer as _
|
|
2
|
+
from followthemoney.util import const, defer as _
|
|
3
3
|
from followthemoney.util import MEGABYTE
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
class StringType(PropertyType):
|
|
7
7
|
"""A simple string property with no additional semantics."""
|
|
8
8
|
|
|
9
|
-
name = "string"
|
|
9
|
+
name = const("string")
|
|
10
10
|
label = _("Label")
|
|
11
11
|
plural = _("Labels")
|
|
12
12
|
matchable = False
|
|
@@ -21,7 +21,7 @@ class TextType(StringType):
|
|
|
21
21
|
string properties, it might make sense to treat properties of this type as
|
|
22
22
|
full-text search material."""
|
|
23
23
|
|
|
24
|
-
name = "text"
|
|
24
|
+
name = const("text")
|
|
25
25
|
label = _("Text")
|
|
26
26
|
plural = _("Texts")
|
|
27
27
|
total_size = 30 * MEGABYTE
|
|
@@ -36,7 +36,7 @@ class HTMLType(StringType):
|
|
|
36
36
|
to perform server-side sanitisation, or to not render this property at all.
|
|
37
37
|
"""
|
|
38
38
|
|
|
39
|
-
name = "html"
|
|
39
|
+
name = const("html")
|
|
40
40
|
label = _("HTML")
|
|
41
41
|
plural = _("HTMLs")
|
|
42
42
|
total_size = 30 * MEGABYTE
|
followthemoney/types/topic.py
CHANGED
|
@@ -1,8 +1,7 @@
|
|
|
1
1
|
from babel.core import Locale
|
|
2
2
|
|
|
3
3
|
from followthemoney.types.common import EnumType, EnumValues
|
|
4
|
-
from followthemoney.
|
|
5
|
-
from followthemoney.util import gettext, defer as _
|
|
4
|
+
from followthemoney.util import const, gettext, defer as _
|
|
6
5
|
|
|
7
6
|
|
|
8
7
|
class TopicType(EnumType):
|
|
@@ -16,8 +15,8 @@ class TopicType(EnumType):
|
|
|
16
15
|
enable queries such as _find all paths between a government procurement
|
|
17
16
|
award and a politician_."""
|
|
18
17
|
|
|
19
|
-
name = "topic"
|
|
20
|
-
group = "topics"
|
|
18
|
+
name = const("topic")
|
|
19
|
+
group = const("topics")
|
|
21
20
|
label = _("Topic")
|
|
22
21
|
plural = _("Topics")
|
|
23
22
|
matchable = False
|
|
@@ -90,6 +89,3 @@ class TopicType(EnumType):
|
|
|
90
89
|
|
|
91
90
|
def _locale_names(self, locale: Locale) -> EnumValues:
|
|
92
91
|
return {k: gettext(v) for (k, v) in self._TOPICS.items()}
|
|
93
|
-
|
|
94
|
-
def rdf(self, value: str) -> Identifier:
|
|
95
|
-
return URIRef(f"ftm:topic:{value}")
|