followthemoney 3.8.5__py3-none-any.whl → 4.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- followthemoney/__init__.py +30 -10
- followthemoney/cli/cli.py +1 -1
- followthemoney/cli/exports.py +6 -2
- followthemoney/cli/statement.py +62 -0
- followthemoney/cli/util.py +2 -3
- followthemoney/compare.py +26 -16
- followthemoney/dataset/__init__.py +17 -0
- followthemoney/dataset/catalog.py +77 -0
- followthemoney/dataset/coverage.py +29 -0
- followthemoney/dataset/dataset.py +146 -0
- followthemoney/dataset/publisher.py +25 -0
- followthemoney/dataset/resource.py +30 -0
- followthemoney/dataset/util.py +55 -0
- followthemoney/entity.py +73 -0
- followthemoney/exc.py +6 -0
- followthemoney/export/rdf.py +57 -5
- followthemoney/graph.py +1 -2
- followthemoney/model.py +38 -11
- followthemoney/names.py +33 -0
- followthemoney/ontology.py +18 -16
- followthemoney/property.py +12 -15
- followthemoney/proxy.py +43 -64
- followthemoney/schema/Analyzable.yaml +2 -3
- followthemoney/schema/BankAccount.yaml +2 -3
- followthemoney/schema/Company.yaml +0 -6
- followthemoney/schema/Contract.yaml +0 -1
- followthemoney/schema/CryptoWallet.yaml +1 -1
- followthemoney/schema/Document.yaml +0 -6
- followthemoney/schema/Interval.yaml +7 -0
- followthemoney/schema/LegalEntity.yaml +6 -0
- followthemoney/schema/License.yaml +2 -0
- followthemoney/schema/Page.yaml +0 -1
- followthemoney/schema/Person.yaml +0 -5
- followthemoney/schema/Sanction.yaml +1 -0
- followthemoney/schema/Thing.yaml +0 -2
- followthemoney/schema/UserAccount.yaml +6 -3
- followthemoney/schema.py +30 -42
- followthemoney/statement/__init__.py +19 -0
- followthemoney/statement/entity.py +438 -0
- followthemoney/statement/serialize.py +251 -0
- followthemoney/statement/statement.py +256 -0
- followthemoney/statement/util.py +31 -0
- followthemoney/types/__init__.py +66 -23
- followthemoney/types/address.py +3 -3
- followthemoney/types/checksum.py +3 -7
- followthemoney/types/common.py +9 -14
- followthemoney/types/country.py +3 -7
- followthemoney/types/date.py +21 -11
- followthemoney/types/email.py +0 -4
- followthemoney/types/entity.py +5 -11
- followthemoney/types/gender.py +6 -10
- followthemoney/types/identifier.py +9 -3
- followthemoney/types/ip.py +5 -9
- followthemoney/types/json.py +2 -2
- followthemoney/types/language.py +3 -7
- followthemoney/types/mimetype.py +4 -8
- followthemoney/types/name.py +7 -8
- followthemoney/types/number.py +88 -6
- followthemoney/types/phone.py +4 -11
- followthemoney/types/string.py +4 -4
- followthemoney/types/topic.py +3 -7
- followthemoney/types/url.py +5 -10
- followthemoney/util.py +12 -13
- followthemoney/value.py +67 -0
- {followthemoney-3.8.5.dist-info → followthemoney-4.0.1.dist-info}/METADATA +23 -8
- {followthemoney-3.8.5.dist-info → followthemoney-4.0.1.dist-info}/RECORD +69 -59
- {followthemoney-3.8.5.dist-info → followthemoney-4.0.1.dist-info}/entry_points.txt +1 -0
- followthemoney/offshore.py +0 -48
- followthemoney/rdf.py +0 -9
- followthemoney/schema/Assessment.yaml +0 -32
- followthemoney/schema/Post.yaml +0 -42
- followthemoney/types/iban.py +0 -58
- followthemoney/types/registry.py +0 -52
- {followthemoney-3.8.5.dist-info → followthemoney-4.0.1.dist-info}/WHEEL +0 -0
- {followthemoney-3.8.5.dist-info → followthemoney-4.0.1.dist-info}/licenses/LICENSE +0 -0
followthemoney/types/gender.py
CHANGED
|
@@ -2,8 +2,7 @@ from typing import Optional, TYPE_CHECKING
|
|
|
2
2
|
from babel.core import Locale
|
|
3
3
|
|
|
4
4
|
from followthemoney.types.common import EnumType, EnumValues
|
|
5
|
-
from followthemoney.rdf import URIRef, Identifier
|
|
6
|
-
from followthemoney.util import gettext, defer as _
|
|
5
|
+
from followthemoney.util import const, gettext, defer as _
|
|
7
6
|
|
|
8
7
|
if TYPE_CHECKING:
|
|
9
8
|
from followthemoney.proxy import EntityProxy
|
|
@@ -15,9 +14,9 @@ class GenderType(EnumType):
|
|
|
15
14
|
government databases and represent it in a way that can be used by
|
|
16
15
|
structured tools. I'm not sure this justifies the simplification."""
|
|
17
16
|
|
|
18
|
-
MALE = "male"
|
|
19
|
-
FEMALE = "female"
|
|
20
|
-
OTHER = "other"
|
|
17
|
+
MALE = const("male")
|
|
18
|
+
FEMALE = const("female")
|
|
19
|
+
OTHER = const("other")
|
|
21
20
|
|
|
22
21
|
LOOKUP = {
|
|
23
22
|
"m": MALE,
|
|
@@ -35,8 +34,8 @@ class GenderType(EnumType):
|
|
|
35
34
|
"divers": OTHER,
|
|
36
35
|
}
|
|
37
36
|
|
|
38
|
-
name = "gender"
|
|
39
|
-
group = "genders"
|
|
37
|
+
name = const("gender")
|
|
38
|
+
group = const("genders")
|
|
40
39
|
label = _("Gender")
|
|
41
40
|
plural = _("Genders")
|
|
42
41
|
matchable = False
|
|
@@ -61,6 +60,3 @@ class GenderType(EnumType):
|
|
|
61
60
|
if code not in self.codes:
|
|
62
61
|
return None
|
|
63
62
|
return code
|
|
64
|
-
|
|
65
|
-
def rdf(self, value: str) -> Identifier:
|
|
66
|
-
return URIRef(f"gender:{value}")
|
|
followthemoney/types/identifier.py
CHANGED
|
@@ -4,7 +4,7 @@ from rigour.ids import get_identifier_format_names, get_identifier_format
|
|
|
4
4
|
|
|
5
5
|
from followthemoney.types.common import PropertyType
|
|
6
6
|
from followthemoney.util import dampen, shortest, longest
|
|
7
|
-
from followthemoney.util import defer as _
|
|
7
|
+
from followthemoney.util import const, defer as _
|
|
8
8
|
|
|
9
9
|
if TYPE_CHECKING:
|
|
10
10
|
from followthemoney.proxy import EntityProxy
|
|
@@ -20,8 +20,8 @@ class IdentifierType(PropertyType):
|
|
|
20
20
|
Four- or five-digit industry classifiers create more noise than value."""
|
|
21
21
|
|
|
22
22
|
COMPARE_CLEAN = re.compile(r"[\W_]+")
|
|
23
|
-
name = "identifier"
|
|
24
|
-
group = "identifiers"
|
|
23
|
+
name = const("identifier")
|
|
24
|
+
group = const("identifiers")
|
|
25
25
|
label = _("Identifier")
|
|
26
26
|
plural = _("Identifiers")
|
|
27
27
|
matchable = True
|
|
@@ -59,3 +59,9 @@ class IdentifierType(PropertyType):
|
|
|
59
59
|
|
|
60
60
|
def node_id(self, value: str) -> str:
|
|
61
61
|
return f"id:{value}"
|
|
62
|
+
|
|
63
|
+
def caption(self, value: str, format: Optional[str] = None) -> str:
|
|
64
|
+
if format in get_identifier_format_names():
|
|
65
|
+
format_ = get_identifier_format(format)
|
|
66
|
+
return format_.format(value)
|
|
67
|
+
return value
|
followthemoney/types/ip.py
CHANGED
|
@@ -2,8 +2,7 @@ from typing import Optional, TYPE_CHECKING
|
|
|
2
2
|
from ipaddress import ip_address
|
|
3
3
|
|
|
4
4
|
from followthemoney.types.common import PropertyType
|
|
5
|
-
from followthemoney.rdf import URIRef, Identifier
|
|
6
|
-
from followthemoney.util import defer as _
|
|
5
|
+
from followthemoney.util import const, defer as _
|
|
7
6
|
|
|
8
7
|
if TYPE_CHECKING:
|
|
9
8
|
from followthemoney.proxy import EntityProxy
|
|
@@ -14,10 +13,10 @@ class IpType(PropertyType):
|
|
|
14
13
|
by the protocol versions 4 (e.g. `192.168.1.143`) and 6
|
|
15
14
|
(e.g. `0:0:0:0:0:ffff:c0a8:18f`)."""
|
|
16
15
|
|
|
17
|
-
name = "ip"
|
|
18
|
-
group = "ips"
|
|
19
|
-
label = _("IP
|
|
20
|
-
plural = _("IP
|
|
16
|
+
name = const("ip")
|
|
17
|
+
group = const("ips")
|
|
18
|
+
label = _("IP Address")
|
|
19
|
+
plural = _("IP Addresses")
|
|
21
20
|
matchable = True
|
|
22
21
|
pivot = True
|
|
23
22
|
max_length = 64
|
|
@@ -45,6 +44,3 @@ class IpType(PropertyType):
|
|
|
45
44
|
return str(ip_address(text))
|
|
46
45
|
except ValueError:
|
|
47
46
|
return None
|
|
48
|
-
|
|
49
|
-
def rdf(self, value: str) -> Identifier:
|
|
50
|
-
return URIRef(f"ip:{value}")
|
followthemoney/types/json.py
CHANGED
|
@@ -3,7 +3,7 @@ from typing import Any, Optional, Sequence, TYPE_CHECKING
|
|
|
3
3
|
from banal import ensure_list
|
|
4
4
|
|
|
5
5
|
from followthemoney.types.common import PropertyType
|
|
6
|
-
from followthemoney.util import sanitize_text, defer as _
|
|
6
|
+
from followthemoney.util import const, sanitize_text, defer as _
|
|
7
7
|
|
|
8
8
|
if TYPE_CHECKING:
|
|
9
9
|
from followthemoney.proxy import EntityProxy
|
|
@@ -14,7 +14,7 @@ class JsonType(PropertyType):
|
|
|
14
14
|
and some other edge cases. It's a really bad idea and we should try to get rid
|
|
15
15
|
of JSON properties."""
|
|
16
16
|
|
|
17
|
-
name = "json"
|
|
17
|
+
name = const("json")
|
|
18
18
|
group = None
|
|
19
19
|
label = _("Nested data")
|
|
20
20
|
plural = _("Nested data")
|
followthemoney/types/language.py
CHANGED
|
@@ -3,9 +3,8 @@ from babel.core import Locale
|
|
|
3
3
|
from rigour.langs import iso_639_alpha3
|
|
4
4
|
|
|
5
5
|
from followthemoney.types.common import EnumType, EnumValues
|
|
6
|
-
from followthemoney.rdf import URIRef, Identifier
|
|
7
6
|
from followthemoney.util import defer as _, gettext
|
|
8
|
-
from followthemoney.util import get_env_list
|
|
7
|
+
from followthemoney.util import const, get_env_list
|
|
9
8
|
|
|
10
9
|
if TYPE_CHECKING:
|
|
11
10
|
from followthemoney.proxy import EntityProxy
|
|
@@ -17,8 +16,8 @@ class LanguageType(EnumType):
|
|
|
17
16
|
for additional languages once there is a specific need for them to be
|
|
18
17
|
supported."""
|
|
19
18
|
|
|
20
|
-
name = "language"
|
|
21
|
-
group = "languages"
|
|
19
|
+
name = const("language")
|
|
20
|
+
group = const("languages")
|
|
22
21
|
label = _("Language")
|
|
23
22
|
plural = _("Languages")
|
|
24
23
|
matchable = False
|
|
@@ -120,6 +119,3 @@ class LanguageType(EnumType):
|
|
|
120
119
|
if code not in self.LANGUAGES:
|
|
121
120
|
return None
|
|
122
121
|
return code
|
|
123
|
-
|
|
124
|
-
def rdf(self, value: str) -> Identifier:
|
|
125
|
-
return URIRef(f"iso-639:{value}")
|
followthemoney/types/mimetype.py
CHANGED
|
@@ -3,8 +3,7 @@ from rigour.mime import normalize_mimetype, parse_mimetype
|
|
|
3
3
|
from rigour.mime import DEFAULT
|
|
4
4
|
|
|
5
5
|
from followthemoney.types.common import PropertyType
|
|
6
|
-
from followthemoney.rdf import URIRef, Identifier
|
|
7
|
-
from followthemoney.util import defer as _
|
|
6
|
+
from followthemoney.util import const, defer as _
|
|
8
7
|
|
|
9
8
|
if TYPE_CHECKING:
|
|
10
9
|
from followthemoney.proxy import EntityProxy
|
|
@@ -19,8 +18,8 @@ class MimeType(PropertyType):
|
|
|
19
18
|
MIME type properties do not contain parameters as used in HTTP headers,
|
|
20
19
|
like `charset=UTF-8`."""
|
|
21
20
|
|
|
22
|
-
name = "mimetype"
|
|
23
|
-
group = "mimetypes"
|
|
21
|
+
name = const("mimetype")
|
|
22
|
+
group = const("mimetypes")
|
|
24
23
|
label = _("MIME-Type")
|
|
25
24
|
plural = _("MIME-Types")
|
|
26
25
|
matchable = False
|
|
@@ -37,8 +36,5 @@ class MimeType(PropertyType):
|
|
|
37
36
|
return text
|
|
38
37
|
return None
|
|
39
38
|
|
|
40
|
-
def rdf(self, value: str) -> Identifier:
|
|
41
|
-
return URIRef(f"urn:mimetype:{value}")
|
|
42
|
-
|
|
43
|
-
def caption(self, value: str) -> str:
|
|
39
|
+
def caption(self, value: str, format: Optional[str] = None) -> str:
|
|
44
40
|
return parse_mimetype(value).label or value
|
followthemoney/types/name.py
CHANGED
|
@@ -2,13 +2,12 @@ from typing import TYPE_CHECKING, Optional, Sequence
|
|
|
2
2
|
from normality import slugify
|
|
3
3
|
from normality.cleaning import collapse_spaces, strip_quotes
|
|
4
4
|
from rigour.env import MAX_NAME_LENGTH
|
|
5
|
-
from rigour.names import pick_name
|
|
5
|
+
from rigour.names import pick_name, tokenize_name
|
|
6
6
|
from rigour.text.distance import levenshtein_similarity
|
|
7
|
-
from fingerprints.cleanup import clean_name_light
|
|
8
7
|
|
|
9
8
|
from followthemoney.types.common import PropertyType
|
|
10
9
|
from followthemoney.util import dampen
|
|
11
|
-
from followthemoney.util import defer as _
|
|
10
|
+
from followthemoney.util import const, defer as _
|
|
12
11
|
|
|
13
12
|
if TYPE_CHECKING:
|
|
14
13
|
from followthemoney.proxy import EntityProxy
|
|
@@ -22,8 +21,8 @@ class NameType(PropertyType):
|
|
|
22
21
|
No validation rules apply, and things having multiple names must be considered
|
|
23
22
|
a perfectly ordinary case."""
|
|
24
23
|
|
|
25
|
-
name = "name"
|
|
26
|
-
group = "names"
|
|
24
|
+
name = const("name")
|
|
25
|
+
group = const("names")
|
|
27
26
|
label = _("Name")
|
|
28
27
|
plural = _("Names")
|
|
29
28
|
matchable = True
|
|
@@ -51,9 +50,9 @@ class NameType(PropertyType):
|
|
|
51
50
|
|
|
52
51
|
def compare(self, left: str, right: str) -> float:
|
|
53
52
|
"""Compare two names for similarity."""
|
|
54
|
-
left_clean =
|
|
55
|
-
right_clean =
|
|
56
|
-
if left_clean
|
|
53
|
+
left_clean = " ".join(tokenize_name(left.lower()))
|
|
54
|
+
right_clean = " ".join(tokenize_name(right.lower()))
|
|
55
|
+
if not len(left_clean) or not len(right_clean):
|
|
57
56
|
return 0.0
|
|
58
57
|
return levenshtein_similarity(
|
|
59
58
|
left_clean,
|
followthemoney/types/number.py
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import re
|
|
2
|
-
from typing import Optional
|
|
2
|
+
from typing import Optional, Tuple
|
|
3
3
|
|
|
4
4
|
from followthemoney.types.common import PropertyType
|
|
5
|
-
from followthemoney.util import defer as _
|
|
5
|
+
from followthemoney.util import const, defer as _
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
class NumberType(PropertyType):
|
|
@@ -13,8 +13,18 @@ class NumberType(PropertyType):
|
|
|
13
13
|
In the future we might want to enable annotations for format, units, or
|
|
14
14
|
even to introduce a separate property type for monetary values."""
|
|
15
15
|
|
|
16
|
-
|
|
17
|
-
|
|
16
|
+
DECIMAL = "."
|
|
17
|
+
SEPARATOR = ","
|
|
18
|
+
PRECISION = 2
|
|
19
|
+
|
|
20
|
+
_NUM_UNIT_RE = (
|
|
21
|
+
f"(\\s?\\-?\\s?\\d+(?:{re.escape(DECIMAL)}\\d+)?)\\s*([^\\s\\d][^\\s]*)?"
|
|
22
|
+
)
|
|
23
|
+
NUM_UNIT_RE = re.compile(_NUM_UNIT_RE, re.UNICODE)
|
|
24
|
+
_FLOAT_FMT = "{:" + SEPARATOR + "." + str(PRECISION) + "f}"
|
|
25
|
+
_INT_FMT = "{:" + SEPARATOR + "d}"
|
|
26
|
+
|
|
27
|
+
name = const("number")
|
|
18
28
|
label = _("Number")
|
|
19
29
|
plural = _("Numbers")
|
|
20
30
|
matchable = False
|
|
@@ -22,9 +32,81 @@ class NumberType(PropertyType):
|
|
|
22
32
|
def node_id(self, value: str) -> None:
|
|
23
33
|
return None
|
|
24
34
|
|
|
35
|
+
def parse(
|
|
36
|
+
self, value: str, decimal: str = DECIMAL, separator: str = SEPARATOR
|
|
37
|
+
) -> Tuple[Optional[str], Optional[str]]:
|
|
38
|
+
"""Parse a number into a numeric value and a unit. The numeric value is
|
|
39
|
+
aligned with the decimal and separator settings. The unit is stripped of
|
|
40
|
+
whitespace and returned as a string. If no unit is found, None is
|
|
41
|
+
returned. If no number is found, None is returned for both values.
|
|
42
|
+
|
|
43
|
+
Args:
|
|
44
|
+
value (str): The string to parse.
|
|
45
|
+
decimal (str): The character used as the decimal separator.
|
|
46
|
+
separator (str): The character used to separate thousands, lakhs, or crores.
|
|
47
|
+
|
|
48
|
+
Returns:
|
|
49
|
+
A tuple of (number, unit), where number is a string and unit is a string or None.
|
|
50
|
+
"""
|
|
51
|
+
value = value.replace(separator, "")
|
|
52
|
+
if decimal != self.DECIMAL:
|
|
53
|
+
value = value.replace(decimal, self.DECIMAL)
|
|
54
|
+
match = self.NUM_UNIT_RE.match(value)
|
|
55
|
+
if not match:
|
|
56
|
+
return None, None
|
|
57
|
+
number, unit = match.groups()
|
|
58
|
+
if unit is not None:
|
|
59
|
+
unit = unit.strip()
|
|
60
|
+
if len(unit) == 0:
|
|
61
|
+
unit = None
|
|
62
|
+
# TODO: We could have a lookup table for common units, e.g. kg, m, etc. to
|
|
63
|
+
# convert them to a standard form.
|
|
64
|
+
number = number.replace(" ", "")
|
|
65
|
+
if number == "":
|
|
66
|
+
number = None
|
|
67
|
+
return number, unit
|
|
68
|
+
|
|
25
69
|
def to_number(self, value: str) -> Optional[float]:
|
|
70
|
+
"""Convert a number string to a float. The string is parsed and the unit is
|
|
71
|
+
discarded if present.
|
|
72
|
+
|
|
73
|
+
Args:
|
|
74
|
+
value (str): The string to convert.
|
|
75
|
+
|
|
76
|
+
Returns:
|
|
77
|
+
Optional[float]: The parsed float value, or None if parsing fails.
|
|
78
|
+
"""
|
|
26
79
|
try:
|
|
27
|
-
|
|
28
|
-
|
|
80
|
+
number, _ = self.parse(value)
|
|
81
|
+
if number is None:
|
|
82
|
+
return None
|
|
83
|
+
return float(number)
|
|
29
84
|
except Exception:
|
|
30
85
|
return None
|
|
86
|
+
|
|
87
|
+
def caption(self, value: str, format: Optional[str] = None) -> str:
|
|
88
|
+
"""Return a caption for the number. This is used for display purposes.
|
|
89
|
+
|
|
90
|
+
Args:
|
|
91
|
+
value (str): The string to format.
|
|
92
|
+
format (Optional[str]): An optional format string to use for formatting the number.
|
|
93
|
+
|
|
94
|
+
Returns:
|
|
95
|
+
str: The formatted number string, possibly with a unit.
|
|
96
|
+
"""
|
|
97
|
+
number, unit = self.parse(value)
|
|
98
|
+
if number is None:
|
|
99
|
+
return value
|
|
100
|
+
try:
|
|
101
|
+
fnumber = float(number)
|
|
102
|
+
except ValueError:
|
|
103
|
+
return value
|
|
104
|
+
if format is not None:
|
|
105
|
+
number = format.format(fnumber)
|
|
106
|
+
elif fnumber.is_integer():
|
|
107
|
+
number = self._INT_FMT.format(int(fnumber))
|
|
108
|
+
else:
|
|
109
|
+
number = self._FLOAT_FMT.format(fnumber)
|
|
110
|
+
if unit is not None:
|
|
111
|
+
return f"{number} {unit}"
|
|
112
|
+
return number
|
followthemoney/types/phone.py
CHANGED
|
@@ -5,9 +5,8 @@ from phonenumbers import PhoneNumber, PhoneNumberFormat
|
|
|
5
5
|
from phonenumbers.phonenumberutil import region_code_for_number, NumberParseException
|
|
6
6
|
|
|
7
7
|
from followthemoney.types.common import PropertyType
|
|
8
|
-
from followthemoney.rdf import URIRef, Identifier
|
|
9
8
|
from followthemoney.util import defer as _
|
|
10
|
-
from followthemoney.util import dampen
|
|
9
|
+
from followthemoney.util import const, dampen
|
|
11
10
|
|
|
12
11
|
if TYPE_CHECKING:
|
|
13
12
|
from followthemoney.proxy import EntityProxy
|
|
@@ -30,8 +29,8 @@ class PhoneType(PropertyType):
|
|
|
30
29
|
validation outcome from doing the two operations the other way around. Always
|
|
31
30
|
define the country first."""
|
|
32
31
|
|
|
33
|
-
name = "phone"
|
|
34
|
-
group = "phones"
|
|
32
|
+
name = const("phone")
|
|
33
|
+
group = const("phones")
|
|
35
34
|
label = _("Phone number")
|
|
36
35
|
plural = _("Phone numbers")
|
|
37
36
|
matchable = True
|
|
@@ -97,16 +96,10 @@ class PhoneType(PropertyType):
|
|
|
97
96
|
# TODO: insert artificial intelligence here.
|
|
98
97
|
return dampen(7, 11, value)
|
|
99
98
|
|
|
100
|
-
def rdf(self, value: str) -> Identifier:
|
|
101
|
-
node_id = self.node_id(value)
|
|
102
|
-
if node_id is not None:
|
|
103
|
-
return URIRef(node_id)
|
|
104
|
-
raise ValueError("Invalid phone number for serialisation: %s" % value)
|
|
105
|
-
|
|
106
99
|
def node_id(self, value: str) -> Optional[str]:
|
|
107
100
|
return f"tel:{value}"
|
|
108
101
|
|
|
109
|
-
def caption(self, value: str) -> str:
|
|
102
|
+
def caption(self, value: str, format: Optional[str] = None) -> str:
|
|
110
103
|
try:
|
|
111
104
|
number = parse_number(value)
|
|
112
105
|
formatted = format_number(number, PhoneNumberFormat.INTERNATIONAL)
|
followthemoney/types/string.py
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
from followthemoney.types.common import PropertyType
|
|
2
|
-
from followthemoney.util import defer as _
|
|
2
|
+
from followthemoney.util import const, defer as _
|
|
3
3
|
from followthemoney.util import MEGABYTE
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
class StringType(PropertyType):
|
|
7
7
|
"""A simple string property with no additional semantics."""
|
|
8
8
|
|
|
9
|
-
name = "string"
|
|
9
|
+
name = const("string")
|
|
10
10
|
label = _("Label")
|
|
11
11
|
plural = _("Labels")
|
|
12
12
|
matchable = False
|
|
@@ -21,7 +21,7 @@ class TextType(StringType):
|
|
|
21
21
|
string properties, it might make sense to treat properties of this type as
|
|
22
22
|
full-text search material."""
|
|
23
23
|
|
|
24
|
-
name = "text"
|
|
24
|
+
name = const("text")
|
|
25
25
|
label = _("Text")
|
|
26
26
|
plural = _("Texts")
|
|
27
27
|
total_size = 30 * MEGABYTE
|
|
@@ -36,7 +36,7 @@ class HTMLType(StringType):
|
|
|
36
36
|
to perform server-side sanitisation, or to not render this property at all.
|
|
37
37
|
"""
|
|
38
38
|
|
|
39
|
-
name = "html"
|
|
39
|
+
name = const("html")
|
|
40
40
|
label = _("HTML")
|
|
41
41
|
plural = _("HTMLs")
|
|
42
42
|
total_size = 30 * MEGABYTE
|
followthemoney/types/topic.py
CHANGED
|
@@ -1,8 +1,7 @@
|
|
|
1
1
|
from babel.core import Locale
|
|
2
2
|
|
|
3
3
|
from followthemoney.types.common import EnumType, EnumValues
|
|
4
|
-
from followthemoney.rdf import URIRef, Identifier
|
|
5
|
-
from followthemoney.util import gettext, defer as _
|
|
4
|
+
from followthemoney.util import const, gettext, defer as _
|
|
6
5
|
|
|
7
6
|
|
|
8
7
|
class TopicType(EnumType):
|
|
@@ -16,8 +15,8 @@ class TopicType(EnumType):
|
|
|
16
15
|
enable queries such as _find all paths between a government procurement
|
|
17
16
|
award and a politician_."""
|
|
18
17
|
|
|
19
|
-
name = "topic"
|
|
20
|
-
group = "topics"
|
|
18
|
+
name = const("topic")
|
|
19
|
+
group = const("topics")
|
|
21
20
|
label = _("Topic")
|
|
22
21
|
plural = _("Topics")
|
|
23
22
|
matchable = False
|
|
@@ -90,6 +89,3 @@ class TopicType(EnumType):
|
|
|
90
89
|
|
|
91
90
|
def _locale_names(self, locale: Locale) -> EnumValues:
|
|
92
91
|
return {k: gettext(v) for (k, v) in self._TOPICS.items()}
|
|
93
|
-
|
|
94
|
-
def rdf(self, value: str) -> Identifier:
|
|
95
|
-
return URIRef(f"ftm:topic:{value}")
|
followthemoney/types/url.py
CHANGED
|
@@ -2,8 +2,7 @@ from typing import Optional, TYPE_CHECKING
|
|
|
2
2
|
from rigour.urls import clean_url, compare_urls
|
|
3
3
|
|
|
4
4
|
from followthemoney.types.common import PropertyType
|
|
5
|
-
from followthemoney.rdf import URIRef, Identifier
|
|
6
|
-
from followthemoney.util import dampen, defer as _
|
|
5
|
+
from followthemoney.util import const, dampen, defer as _
|
|
7
6
|
|
|
8
7
|
if TYPE_CHECKING:
|
|
9
8
|
from followthemoney.proxy import EntityProxy
|
|
@@ -17,8 +16,8 @@ class UrlType(PropertyType):
|
|
|
17
16
|
SCHEMES = ("http", "https", "ftp", "mailto")
|
|
18
17
|
DEFAULT_SCHEME = "http"
|
|
19
18
|
|
|
20
|
-
name = "url"
|
|
21
|
-
group = "urls"
|
|
19
|
+
name = const("url")
|
|
20
|
+
group = const("urls")
|
|
22
21
|
label = _("URL")
|
|
23
22
|
plural = _("URLs")
|
|
24
23
|
matchable = True
|
|
@@ -37,13 +36,9 @@ class UrlType(PropertyType):
|
|
|
37
36
|
return clean_url(text)
|
|
38
37
|
|
|
39
38
|
def compare(self, left: str, right: str) -> float:
|
|
39
|
+
"""Compare two URLs and return a float indicating how similar they are. This ignores
|
|
40
|
+
fragments and performs hard URL normalisation."""
|
|
40
41
|
return compare_urls(left, right)
|
|
41
42
|
|
|
42
43
|
def _specificity(self, value: str) -> float:
|
|
43
44
|
return dampen(10, 120, value)
|
|
44
|
-
|
|
45
|
-
def rdf(self, value: str) -> Identifier:
|
|
46
|
-
return URIRef(value)
|
|
47
|
-
|
|
48
|
-
def node_id(self, value: str) -> Optional[str]:
|
|
49
|
-
return f"url:{value}"
|
followthemoney/util.py
CHANGED
|
@@ -1,11 +1,12 @@
|
|
|
1
1
|
import os
|
|
2
|
+
import sys
|
|
2
3
|
import logging
|
|
3
4
|
from hashlib import sha1
|
|
4
5
|
from babel import Locale
|
|
5
6
|
from gettext import translation
|
|
6
7
|
|
|
7
8
|
from threading import local
|
|
8
|
-
from typing import cast, Dict, Any, List, Optional, TypeVar, Union, Sequence
|
|
9
|
+
from typing import cast, Dict, Any, List, Optional, TypeVar, Union
|
|
9
10
|
from normality import stringify
|
|
10
11
|
from normality.cleaning import compose_nfc
|
|
11
12
|
from normality.cleaning import remove_unsafe_chars
|
|
@@ -36,6 +37,11 @@ def defer(text: str) -> str:
|
|
|
36
37
|
return text
|
|
37
38
|
|
|
38
39
|
|
|
40
|
+
def const(text: str) -> str:
|
|
41
|
+
"""Convert the given text to a runtime constant."""
|
|
42
|
+
return sys.intern(text.strip())
|
|
43
|
+
|
|
44
|
+
|
|
39
45
|
def set_model_locale(locale: Locale) -> None:
|
|
40
46
|
state.locale = locale
|
|
41
47
|
state.translation = translation(
|
|
@@ -58,12 +64,14 @@ def get_env_list(name: str, default: List[str] = []) -> List[str]:
|
|
|
58
64
|
return default
|
|
59
65
|
|
|
60
66
|
|
|
61
|
-
def sanitize_text(text: Any, encoding: str = DEFAULT_ENCODING) -> Optional[str]:
|
|
62
|
-
text = stringify(
|
|
67
|
+
def sanitize_text(value: Any, encoding: str = DEFAULT_ENCODING) -> Optional[str]:
|
|
68
|
+
text = stringify(value, encoding_default=encoding)
|
|
63
69
|
if text is None:
|
|
64
70
|
return None
|
|
65
71
|
try:
|
|
66
72
|
text = compose_nfc(text)
|
|
73
|
+
if text is None:
|
|
74
|
+
return None
|
|
67
75
|
except (SystemError, Exception) as ex:
|
|
68
76
|
log.warning("Cannot NFC text: %s", ex)
|
|
69
77
|
return None
|
|
@@ -71,16 +79,7 @@ def sanitize_text(text: Any, encoding: str = DEFAULT_ENCODING) -> Optional[str]:
|
|
|
71
79
|
if text is None:
|
|
72
80
|
return None
|
|
73
81
|
byte_text = text.encode(DEFAULT_ENCODING, "replace")
|
|
74
|
-
return
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
def value_list(value: Union[T, Sequence[T]]) -> List[T]:
|
|
78
|
-
if not isinstance(value, (str, bytes)):
|
|
79
|
-
try:
|
|
80
|
-
return [v for v in cast(Sequence[T], value)]
|
|
81
|
-
except TypeError:
|
|
82
|
-
pass
|
|
83
|
-
return [cast(T, value)]
|
|
82
|
+
return byte_text.decode(DEFAULT_ENCODING, "replace")
|
|
84
83
|
|
|
85
84
|
|
|
86
85
|
def key_bytes(key: Any) -> bytes:
|
followthemoney/value.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
from typing import Any, Iterable, List, Mapping, Union
|
|
2
|
+
from datetime import datetime, date, timezone
|
|
3
|
+
import typing
|
|
4
|
+
from prefixdate import DatePrefix
|
|
5
|
+
|
|
6
|
+
from followthemoney.util import sanitize_text
|
|
7
|
+
|
|
8
|
+
if typing.TYPE_CHECKING:
|
|
9
|
+
from followthemoney.proxy import EntityProxy
|
|
10
|
+
|
|
11
|
+
Value = Union[str, int, float, bool, date, datetime, DatePrefix, None, "EntityProxy"]
|
|
12
|
+
Values = Union[Value, Iterable[Value]]
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def string_list(value: Any, sanitize: bool = False) -> List[str]:
|
|
16
|
+
"""Convert a value - which may be a list or set - to a list of strings."""
|
|
17
|
+
# This function is called in the inner loop of placing values into entities,
|
|
18
|
+
# so it's unrolled to avoid the overhead of comparatively heavy ops like
|
|
19
|
+
# `isinstance`.
|
|
20
|
+
if value is None:
|
|
21
|
+
return []
|
|
22
|
+
type_ = type(value)
|
|
23
|
+
if type_ is str:
|
|
24
|
+
if sanitize:
|
|
25
|
+
value = sanitize_text(value)
|
|
26
|
+
if value is None:
|
|
27
|
+
return []
|
|
28
|
+
return [value] if len(value) > 0 else []
|
|
29
|
+
if type_ is int:
|
|
30
|
+
return [str(value)]
|
|
31
|
+
if type_ is float:
|
|
32
|
+
return [f"{value:.2f}"]
|
|
33
|
+
if type_ is bool:
|
|
34
|
+
return ["true" if value else "false"]
|
|
35
|
+
if type_ is date:
|
|
36
|
+
return [value.isoformat()]
|
|
37
|
+
if type_ is datetime:
|
|
38
|
+
if value.tzinfo is not None:
|
|
39
|
+
value = value.astimezone(tz=timezone.utc)
|
|
40
|
+
return [value.isoformat()]
|
|
41
|
+
if type_ is set or type_ is list or type_ is tuple:
|
|
42
|
+
texts: List[str] = []
|
|
43
|
+
for inner in value:
|
|
44
|
+
texts.extend(string_list(inner, sanitize=sanitize))
|
|
45
|
+
return texts
|
|
46
|
+
if isinstance(value, DatePrefix):
|
|
47
|
+
return [value.text] if value.text else []
|
|
48
|
+
# EntityProxy
|
|
49
|
+
try:
|
|
50
|
+
return string_list(value.id, sanitize=sanitize)
|
|
51
|
+
except AttributeError:
|
|
52
|
+
pass
|
|
53
|
+
# Entity dict
|
|
54
|
+
if isinstance(value, Mapping):
|
|
55
|
+
return string_list(value.get("id"), sanitize=sanitize)
|
|
56
|
+
if isinstance(value, (str, bytes)):
|
|
57
|
+
# Handle sub-classes of str, bytes - always sanitize
|
|
58
|
+
text = sanitize_text(value)
|
|
59
|
+
if text is None:
|
|
60
|
+
return []
|
|
61
|
+
return [text]
|
|
62
|
+
if isinstance(value, Iterable):
|
|
63
|
+
stexts: List[str] = []
|
|
64
|
+
for inner in value:
|
|
65
|
+
stexts.extend(string_list(inner, sanitize=sanitize))
|
|
66
|
+
return stexts
|
|
67
|
+
raise TypeError("Cannot convert %r to string list" % value)
|