followthemoney 3.8.5__py3-none-any.whl → 4.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. followthemoney/__init__.py +30 -10
  2. followthemoney/cli/cli.py +1 -1
  3. followthemoney/cli/exports.py +6 -2
  4. followthemoney/cli/statement.py +62 -0
  5. followthemoney/cli/util.py +2 -3
  6. followthemoney/compare.py +26 -16
  7. followthemoney/dataset/__init__.py +17 -0
  8. followthemoney/dataset/catalog.py +77 -0
  9. followthemoney/dataset/coverage.py +29 -0
  10. followthemoney/dataset/dataset.py +146 -0
  11. followthemoney/dataset/publisher.py +25 -0
  12. followthemoney/dataset/resource.py +30 -0
  13. followthemoney/dataset/util.py +55 -0
  14. followthemoney/entity.py +73 -0
  15. followthemoney/exc.py +6 -0
  16. followthemoney/export/rdf.py +57 -5
  17. followthemoney/graph.py +1 -2
  18. followthemoney/model.py +38 -11
  19. followthemoney/names.py +33 -0
  20. followthemoney/ontology.py +18 -16
  21. followthemoney/property.py +12 -15
  22. followthemoney/proxy.py +43 -64
  23. followthemoney/schema/Analyzable.yaml +2 -3
  24. followthemoney/schema/BankAccount.yaml +2 -3
  25. followthemoney/schema/Company.yaml +0 -6
  26. followthemoney/schema/Contract.yaml +0 -1
  27. followthemoney/schema/CryptoWallet.yaml +1 -1
  28. followthemoney/schema/Document.yaml +0 -6
  29. followthemoney/schema/Interval.yaml +7 -0
  30. followthemoney/schema/LegalEntity.yaml +6 -0
  31. followthemoney/schema/License.yaml +2 -0
  32. followthemoney/schema/Page.yaml +0 -1
  33. followthemoney/schema/Person.yaml +0 -5
  34. followthemoney/schema/Sanction.yaml +1 -0
  35. followthemoney/schema/Thing.yaml +0 -2
  36. followthemoney/schema/UserAccount.yaml +6 -3
  37. followthemoney/schema.py +30 -42
  38. followthemoney/statement/__init__.py +19 -0
  39. followthemoney/statement/entity.py +438 -0
  40. followthemoney/statement/serialize.py +251 -0
  41. followthemoney/statement/statement.py +256 -0
  42. followthemoney/statement/util.py +31 -0
  43. followthemoney/types/__init__.py +66 -23
  44. followthemoney/types/address.py +3 -3
  45. followthemoney/types/checksum.py +3 -7
  46. followthemoney/types/common.py +9 -14
  47. followthemoney/types/country.py +3 -7
  48. followthemoney/types/date.py +21 -11
  49. followthemoney/types/email.py +0 -4
  50. followthemoney/types/entity.py +5 -11
  51. followthemoney/types/gender.py +6 -10
  52. followthemoney/types/identifier.py +9 -3
  53. followthemoney/types/ip.py +5 -9
  54. followthemoney/types/json.py +2 -2
  55. followthemoney/types/language.py +3 -7
  56. followthemoney/types/mimetype.py +4 -8
  57. followthemoney/types/name.py +7 -8
  58. followthemoney/types/number.py +88 -6
  59. followthemoney/types/phone.py +4 -11
  60. followthemoney/types/string.py +4 -4
  61. followthemoney/types/topic.py +3 -7
  62. followthemoney/types/url.py +5 -10
  63. followthemoney/util.py +12 -13
  64. followthemoney/value.py +67 -0
  65. {followthemoney-3.8.5.dist-info → followthemoney-4.0.1.dist-info}/METADATA +23 -8
  66. {followthemoney-3.8.5.dist-info → followthemoney-4.0.1.dist-info}/RECORD +69 -59
  67. {followthemoney-3.8.5.dist-info → followthemoney-4.0.1.dist-info}/entry_points.txt +1 -0
  68. followthemoney/offshore.py +0 -48
  69. followthemoney/rdf.py +0 -9
  70. followthemoney/schema/Assessment.yaml +0 -32
  71. followthemoney/schema/Post.yaml +0 -42
  72. followthemoney/types/iban.py +0 -58
  73. followthemoney/types/registry.py +0 -52
  74. {followthemoney-3.8.5.dist-info → followthemoney-4.0.1.dist-info}/WHEEL +0 -0
  75. {followthemoney-3.8.5.dist-info → followthemoney-4.0.1.dist-info}/licenses/LICENSE +0 -0
@@ -2,8 +2,7 @@ from typing import Optional, TYPE_CHECKING
2
2
  from babel.core import Locale
3
3
 
4
4
  from followthemoney.types.common import EnumType, EnumValues
5
- from followthemoney.rdf import URIRef, Identifier
6
- from followthemoney.util import gettext, defer as _
5
+ from followthemoney.util import const, gettext, defer as _
7
6
 
8
7
  if TYPE_CHECKING:
9
8
  from followthemoney.proxy import EntityProxy
@@ -15,9 +14,9 @@ class GenderType(EnumType):
15
14
  government databases and represent it in a way that can be used by
16
15
  structured tools. I'm not sure this justifies the simplification."""
17
16
 
18
- MALE = "male"
19
- FEMALE = "female"
20
- OTHER = "other"
17
+ MALE = const("male")
18
+ FEMALE = const("female")
19
+ OTHER = const("other")
21
20
 
22
21
  LOOKUP = {
23
22
  "m": MALE,
@@ -35,8 +34,8 @@ class GenderType(EnumType):
35
34
  "divers": OTHER,
36
35
  }
37
36
 
38
- name = "gender"
39
- group = "genders"
37
+ name = const("gender")
38
+ group = const("genders")
40
39
  label = _("Gender")
41
40
  plural = _("Genders")
42
41
  matchable = False
@@ -61,6 +60,3 @@ class GenderType(EnumType):
61
60
  if code not in self.codes:
62
61
  return None
63
62
  return code
64
-
65
- def rdf(self, value: str) -> Identifier:
66
- return URIRef(f"gender:{value}")
@@ -4,7 +4,7 @@ from rigour.ids import get_identifier_format_names, get_identifier_format
4
4
 
5
5
  from followthemoney.types.common import PropertyType
6
6
  from followthemoney.util import dampen, shortest, longest
7
- from followthemoney.util import defer as _
7
+ from followthemoney.util import const, defer as _
8
8
 
9
9
  if TYPE_CHECKING:
10
10
  from followthemoney.proxy import EntityProxy
@@ -20,8 +20,8 @@ class IdentifierType(PropertyType):
20
20
  Four- or five-digit industry classifiers create more noise than value."""
21
21
 
22
22
  COMPARE_CLEAN = re.compile(r"[\W_]+")
23
- name = "identifier"
24
- group = "identifiers"
23
+ name = const("identifier")
24
+ group = const("identifiers")
25
25
  label = _("Identifier")
26
26
  plural = _("Identifiers")
27
27
  matchable = True
@@ -59,3 +59,9 @@ class IdentifierType(PropertyType):
59
59
 
60
60
  def node_id(self, value: str) -> str:
61
61
  return f"id:{value}"
62
+
63
+ def caption(self, value: str, format: Optional[str] = None) -> str:
64
+ if format in get_identifier_format_names():
65
+ format_ = get_identifier_format(format)
66
+ return format_.format(value)
67
+ return value
@@ -2,8 +2,7 @@ from typing import Optional, TYPE_CHECKING
2
2
  from ipaddress import ip_address
3
3
 
4
4
  from followthemoney.types.common import PropertyType
5
- from followthemoney.rdf import URIRef, Identifier
6
- from followthemoney.util import defer as _
5
+ from followthemoney.util import const, defer as _
7
6
 
8
7
  if TYPE_CHECKING:
9
8
  from followthemoney.proxy import EntityProxy
@@ -14,10 +13,10 @@ class IpType(PropertyType):
14
13
  by the protocol versions 4 (e.g. `192.168.1.143`) and 6
15
14
  (e.g. `0:0:0:0:0:ffff:c0a8:18f`)."""
16
15
 
17
- name = "ip"
18
- group = "ips"
19
- label = _("IP-Address")
20
- plural = _("IP-Addresses")
16
+ name = const("ip")
17
+ group = const("ips")
18
+ label = _("IP Address")
19
+ plural = _("IP Addresses")
21
20
  matchable = True
22
21
  pivot = True
23
22
  max_length = 64
@@ -45,6 +44,3 @@ class IpType(PropertyType):
45
44
  return str(ip_address(text))
46
45
  except ValueError:
47
46
  return None
48
-
49
- def rdf(self, value: str) -> Identifier:
50
- return URIRef(f"ip:{value}")
@@ -3,7 +3,7 @@ from typing import Any, Optional, Sequence, TYPE_CHECKING
3
3
  from banal import ensure_list
4
4
 
5
5
  from followthemoney.types.common import PropertyType
6
- from followthemoney.util import sanitize_text, defer as _
6
+ from followthemoney.util import const, sanitize_text, defer as _
7
7
 
8
8
  if TYPE_CHECKING:
9
9
  from followthemoney.proxy import EntityProxy
@@ -14,7 +14,7 @@ class JsonType(PropertyType):
14
14
  and some other edge cases. It's a really bad idea and we should try to get rid
15
15
  of JSON properties."""
16
16
 
17
- name = "json"
17
+ name = const("json")
18
18
  group = None
19
19
  label = _("Nested data")
20
20
  plural = _("Nested data")
@@ -3,9 +3,8 @@ from babel.core import Locale
3
3
  from rigour.langs import iso_639_alpha3
4
4
 
5
5
  from followthemoney.types.common import EnumType, EnumValues
6
- from followthemoney.rdf import URIRef, Identifier
7
6
  from followthemoney.util import defer as _, gettext
8
- from followthemoney.util import get_env_list
7
+ from followthemoney.util import const, get_env_list
9
8
 
10
9
  if TYPE_CHECKING:
11
10
  from followthemoney.proxy import EntityProxy
@@ -17,8 +16,8 @@ class LanguageType(EnumType):
17
16
  for additional languages once there is a specific need for them to be
18
17
  supported."""
19
18
 
20
- name = "language"
21
- group = "languages"
19
+ name = const("language")
20
+ group = const("languages")
22
21
  label = _("Language")
23
22
  plural = _("Languages")
24
23
  matchable = False
@@ -120,6 +119,3 @@ class LanguageType(EnumType):
120
119
  if code not in self.LANGUAGES:
121
120
  return None
122
121
  return code
123
-
124
- def rdf(self, value: str) -> Identifier:
125
- return URIRef(f"iso-639:{value}")
@@ -3,8 +3,7 @@ from rigour.mime import normalize_mimetype, parse_mimetype
3
3
  from rigour.mime import DEFAULT
4
4
 
5
5
  from followthemoney.types.common import PropertyType
6
- from followthemoney.rdf import URIRef, Identifier
7
- from followthemoney.util import defer as _
6
+ from followthemoney.util import const, defer as _
8
7
 
9
8
  if TYPE_CHECKING:
10
9
  from followthemoney.proxy import EntityProxy
@@ -19,8 +18,8 @@ class MimeType(PropertyType):
19
18
  MIME type properties do not contain parameters as used in HTTP headers,
20
19
  like `charset=UTF-8`."""
21
20
 
22
- name = "mimetype"
23
- group = "mimetypes"
21
+ name = const("mimetype")
22
+ group = const("mimetypes")
24
23
  label = _("MIME-Type")
25
24
  plural = _("MIME-Types")
26
25
  matchable = False
@@ -37,8 +36,5 @@ class MimeType(PropertyType):
37
36
  return text
38
37
  return None
39
38
 
40
- def rdf(self, value: str) -> Identifier:
41
- return URIRef(f"urn:mimetype:{value}")
42
-
43
- def caption(self, value: str) -> str:
39
+ def caption(self, value: str, format: Optional[str] = None) -> str:
44
40
  return parse_mimetype(value).label or value
@@ -2,13 +2,12 @@ from typing import TYPE_CHECKING, Optional, Sequence
2
2
  from normality import slugify
3
3
  from normality.cleaning import collapse_spaces, strip_quotes
4
4
  from rigour.env import MAX_NAME_LENGTH
5
- from rigour.names import pick_name
5
+ from rigour.names import pick_name, tokenize_name
6
6
  from rigour.text.distance import levenshtein_similarity
7
- from fingerprints.cleanup import clean_name_light
8
7
 
9
8
  from followthemoney.types.common import PropertyType
10
9
  from followthemoney.util import dampen
11
- from followthemoney.util import defer as _
10
+ from followthemoney.util import const, defer as _
12
11
 
13
12
  if TYPE_CHECKING:
14
13
  from followthemoney.proxy import EntityProxy
@@ -22,8 +21,8 @@ class NameType(PropertyType):
22
21
  No validation rules apply, and things having multiple names must be considered
23
22
  a perfectly ordinary case."""
24
23
 
25
- name = "name"
26
- group = "names"
24
+ name = const("name")
25
+ group = const("names")
27
26
  label = _("Name")
28
27
  plural = _("Names")
29
28
  matchable = True
@@ -51,9 +50,9 @@ class NameType(PropertyType):
51
50
 
52
51
  def compare(self, left: str, right: str) -> float:
53
52
  """Compare two names for similarity."""
54
- left_clean = clean_name_light(left)
55
- right_clean = clean_name_light(right)
56
- if left_clean is None or right_clean is None:
53
+ left_clean = " ".join(tokenize_name(left.lower()))
54
+ right_clean = " ".join(tokenize_name(right.lower()))
55
+ if not len(left_clean) or not len(right_clean):
57
56
  return 0.0
58
57
  return levenshtein_similarity(
59
58
  left_clean,
@@ -1,8 +1,8 @@
1
1
  import re
2
- from typing import Optional
2
+ from typing import Optional, Tuple
3
3
 
4
4
  from followthemoney.types.common import PropertyType
5
- from followthemoney.util import defer as _
5
+ from followthemoney.util import const, defer as _
6
6
 
7
7
 
8
8
  class NumberType(PropertyType):
@@ -13,8 +13,18 @@ class NumberType(PropertyType):
13
13
  In the future we might want to enable annotations for format, units, or
14
14
  even to introduce a separate property type for monetary values."""
15
15
 
16
- CAST_RE = re.compile(r"[^0-9\-\.]")
17
- name = "number"
16
+ DECIMAL = "."
17
+ SEPARATOR = ","
18
+ PRECISION = 2
19
+
20
+ _NUM_UNIT_RE = (
21
+ f"(\\s?\\-?\\s?\\d+(?:{re.escape(DECIMAL)}\\d+)?)\\s*([^\\s\\d][^\\s]*)?"
22
+ )
23
+ NUM_UNIT_RE = re.compile(_NUM_UNIT_RE, re.UNICODE)
24
+ _FLOAT_FMT = "{:" + SEPARATOR + "." + str(PRECISION) + "f}"
25
+ _INT_FMT = "{:" + SEPARATOR + "d}"
26
+
27
+ name = const("number")
18
28
  label = _("Number")
19
29
  plural = _("Numbers")
20
30
  matchable = False
@@ -22,9 +32,81 @@ class NumberType(PropertyType):
22
32
  def node_id(self, value: str) -> None:
23
33
  return None
24
34
 
35
+ def parse(
36
+ self, value: str, decimal: str = DECIMAL, separator: str = SEPARATOR
37
+ ) -> Tuple[Optional[str], Optional[str]]:
38
+ """Parse a number into a numeric value and a unit. The numeric value is
39
+ aligned with the decimal and separator settings. The unit is stripped of
40
+ whitespace and returned as a string. If no unit is found, None is
41
+ returned. If no number is found, None is returned for both values.
42
+
43
+ Args:
44
+ value (str): The string to parse.
45
+ decimal (str): The character used as the decimal separator.
46
+ separator (str): The character used to separate thousands, lakhs, or crores.
47
+
48
+ Returns:
49
+ A tuple of (number, unit), where number is a string and unit is a string or None.
50
+ """
51
+ value = value.replace(separator, "")
52
+ if decimal != self.DECIMAL:
53
+ value = value.replace(decimal, self.DECIMAL)
54
+ match = self.NUM_UNIT_RE.match(value)
55
+ if not match:
56
+ return None, None
57
+ number, unit = match.groups()
58
+ if unit is not None:
59
+ unit = unit.strip()
60
+ if len(unit) == 0:
61
+ unit = None
62
+ # TODO: We could have a lookup table for common units, e.g. kg, m, etc. to
63
+ # convert them to a standard form.
64
+ number = number.replace(" ", "")
65
+ if number == "":
66
+ number = None
67
+ return number, unit
68
+
25
69
  def to_number(self, value: str) -> Optional[float]:
70
+ """Convert a number string to a float. The string is parsed and the unit is
71
+ discarded if present.
72
+
73
+ Args:
74
+ value (str): The string to convert.
75
+
76
+ Returns:
77
+ Optional[float]: The parsed float value, or None if parsing fails.
78
+ """
26
79
  try:
27
- value = self.CAST_RE.sub("", value)
28
- return float(value)
80
+ number, _ = self.parse(value)
81
+ if number is None:
82
+ return None
83
+ return float(number)
29
84
  except Exception:
30
85
  return None
86
+
87
+ def caption(self, value: str, format: Optional[str] = None) -> str:
88
+ """Return a caption for the number. This is used for display purposes.
89
+
90
+ Args:
91
+ value (str): The string to format.
92
+ format (Optional[str]): An optional format string to use for formatting the number.
93
+
94
+ Returns:
95
+ str: The formatted number string, possibly with a unit.
96
+ """
97
+ number, unit = self.parse(value)
98
+ if number is None:
99
+ return value
100
+ try:
101
+ fnumber = float(number)
102
+ except ValueError:
103
+ return value
104
+ if format is not None:
105
+ number = format.format(fnumber)
106
+ elif fnumber.is_integer():
107
+ number = self._INT_FMT.format(int(fnumber))
108
+ else:
109
+ number = self._FLOAT_FMT.format(fnumber)
110
+ if unit is not None:
111
+ return f"{number} {unit}"
112
+ return number
@@ -5,9 +5,8 @@ from phonenumbers import PhoneNumber, PhoneNumberFormat
5
5
  from phonenumbers.phonenumberutil import region_code_for_number, NumberParseException
6
6
 
7
7
  from followthemoney.types.common import PropertyType
8
- from followthemoney.rdf import URIRef, Identifier
9
8
  from followthemoney.util import defer as _
10
- from followthemoney.util import dampen
9
+ from followthemoney.util import const, dampen
11
10
 
12
11
  if TYPE_CHECKING:
13
12
  from followthemoney.proxy import EntityProxy
@@ -30,8 +29,8 @@ class PhoneType(PropertyType):
30
29
  validation outcome from doing the two operations the other way around. Always
31
30
  define the country first."""
32
31
 
33
- name = "phone"
34
- group = "phones"
32
+ name = const("phone")
33
+ group = const("phones")
35
34
  label = _("Phone number")
36
35
  plural = _("Phone numbers")
37
36
  matchable = True
@@ -97,16 +96,10 @@ class PhoneType(PropertyType):
97
96
  # TODO: insert artificial intelligence here.
98
97
  return dampen(7, 11, value)
99
98
 
100
- def rdf(self, value: str) -> Identifier:
101
- node_id = self.node_id(value)
102
- if node_id is not None:
103
- return URIRef(node_id)
104
- raise ValueError("Invalid phone number for serialisation: %s" % value)
105
-
106
99
  def node_id(self, value: str) -> Optional[str]:
107
100
  return f"tel:{value}"
108
101
 
109
- def caption(self, value: str) -> str:
102
+ def caption(self, value: str, format: Optional[str] = None) -> str:
110
103
  try:
111
104
  number = parse_number(value)
112
105
  formatted = format_number(number, PhoneNumberFormat.INTERNATIONAL)
@@ -1,12 +1,12 @@
1
1
  from followthemoney.types.common import PropertyType
2
- from followthemoney.util import defer as _
2
+ from followthemoney.util import const, defer as _
3
3
  from followthemoney.util import MEGABYTE
4
4
 
5
5
 
6
6
  class StringType(PropertyType):
7
7
  """A simple string property with no additional semantics."""
8
8
 
9
- name = "string"
9
+ name = const("string")
10
10
  label = _("Label")
11
11
  plural = _("Labels")
12
12
  matchable = False
@@ -21,7 +21,7 @@ class TextType(StringType):
21
21
  string properties, it might make sense to treat properties of this type as
22
22
  full-text search material."""
23
23
 
24
- name = "text"
24
+ name = const("text")
25
25
  label = _("Text")
26
26
  plural = _("Texts")
27
27
  total_size = 30 * MEGABYTE
@@ -36,7 +36,7 @@ class HTMLType(StringType):
36
36
  to perform server-side sanitisation, or to not render this property at all.
37
37
  """
38
38
 
39
- name = "html"
39
+ name = const("html")
40
40
  label = _("HTML")
41
41
  plural = _("HTMLs")
42
42
  total_size = 30 * MEGABYTE
@@ -1,8 +1,7 @@
1
1
  from babel.core import Locale
2
2
 
3
3
  from followthemoney.types.common import EnumType, EnumValues
4
- from followthemoney.rdf import URIRef, Identifier
5
- from followthemoney.util import gettext, defer as _
4
+ from followthemoney.util import const, gettext, defer as _
6
5
 
7
6
 
8
7
  class TopicType(EnumType):
@@ -16,8 +15,8 @@ class TopicType(EnumType):
16
15
  enable queries such as _find all paths between a government procurement
17
16
  award and a politician_."""
18
17
 
19
- name = "topic"
20
- group = "topics"
18
+ name = const("topic")
19
+ group = const("topics")
21
20
  label = _("Topic")
22
21
  plural = _("Topics")
23
22
  matchable = False
@@ -90,6 +89,3 @@ class TopicType(EnumType):
90
89
 
91
90
  def _locale_names(self, locale: Locale) -> EnumValues:
92
91
  return {k: gettext(v) for (k, v) in self._TOPICS.items()}
93
-
94
- def rdf(self, value: str) -> Identifier:
95
- return URIRef(f"ftm:topic:{value}")
@@ -2,8 +2,7 @@ from typing import Optional, TYPE_CHECKING
2
2
  from rigour.urls import clean_url, compare_urls
3
3
 
4
4
  from followthemoney.types.common import PropertyType
5
- from followthemoney.rdf import URIRef, Identifier
6
- from followthemoney.util import dampen, defer as _
5
+ from followthemoney.util import const, dampen, defer as _
7
6
 
8
7
  if TYPE_CHECKING:
9
8
  from followthemoney.proxy import EntityProxy
@@ -17,8 +16,8 @@ class UrlType(PropertyType):
17
16
  SCHEMES = ("http", "https", "ftp", "mailto")
18
17
  DEFAULT_SCHEME = "http"
19
18
 
20
- name = "url"
21
- group = "urls"
19
+ name = const("url")
20
+ group = const("urls")
22
21
  label = _("URL")
23
22
  plural = _("URLs")
24
23
  matchable = True
@@ -37,13 +36,9 @@ class UrlType(PropertyType):
37
36
  return clean_url(text)
38
37
 
39
38
  def compare(self, left: str, right: str) -> float:
39
+ """Compare two URLs and return a float indicating how similar they are. This ignores
40
+ fragments and performs hard URL normalisation."""
40
41
  return compare_urls(left, right)
41
42
 
42
43
  def _specificity(self, value: str) -> float:
43
44
  return dampen(10, 120, value)
44
-
45
- def rdf(self, value: str) -> Identifier:
46
- return URIRef(value)
47
-
48
- def node_id(self, value: str) -> Optional[str]:
49
- return f"url:{value}"
followthemoney/util.py CHANGED
@@ -1,11 +1,12 @@
1
1
  import os
2
+ import sys
2
3
  import logging
3
4
  from hashlib import sha1
4
5
  from babel import Locale
5
6
  from gettext import translation
6
7
 
7
8
  from threading import local
8
- from typing import cast, Dict, Any, List, Optional, TypeVar, Union, Sequence
9
+ from typing import cast, Dict, Any, List, Optional, TypeVar, Union
9
10
  from normality import stringify
10
11
  from normality.cleaning import compose_nfc
11
12
  from normality.cleaning import remove_unsafe_chars
@@ -36,6 +37,11 @@ def defer(text: str) -> str:
36
37
  return text
37
38
 
38
39
 
40
+ def const(text: str) -> str:
41
+ """Convert the given text to a runtime constant."""
42
+ return sys.intern(text.strip())
43
+
44
+
39
45
  def set_model_locale(locale: Locale) -> None:
40
46
  state.locale = locale
41
47
  state.translation = translation(
@@ -58,12 +64,14 @@ def get_env_list(name: str, default: List[str] = []) -> List[str]:
58
64
  return default
59
65
 
60
66
 
61
- def sanitize_text(text: Any, encoding: str = DEFAULT_ENCODING) -> Optional[str]:
62
- text = stringify(text, encoding_default=encoding)
67
+ def sanitize_text(value: Any, encoding: str = DEFAULT_ENCODING) -> Optional[str]:
68
+ text = stringify(value, encoding_default=encoding)
63
69
  if text is None:
64
70
  return None
65
71
  try:
66
72
  text = compose_nfc(text)
73
+ if text is None:
74
+ return None
67
75
  except (SystemError, Exception) as ex:
68
76
  log.warning("Cannot NFC text: %s", ex)
69
77
  return None
@@ -71,16 +79,7 @@ def sanitize_text(text: Any, encoding: str = DEFAULT_ENCODING) -> Optional[str]:
71
79
  if text is None:
72
80
  return None
73
81
  byte_text = text.encode(DEFAULT_ENCODING, "replace")
74
- return cast(str, byte_text.decode(DEFAULT_ENCODING, "replace"))
75
-
76
-
77
- def value_list(value: Union[T, Sequence[T]]) -> List[T]:
78
- if not isinstance(value, (str, bytes)):
79
- try:
80
- return [v for v in cast(Sequence[T], value)]
81
- except TypeError:
82
- pass
83
- return [cast(T, value)]
82
+ return byte_text.decode(DEFAULT_ENCODING, "replace")
84
83
 
85
84
 
86
85
  def key_bytes(key: Any) -> bytes:
@@ -0,0 +1,67 @@
1
+ from typing import Any, Iterable, List, Mapping, Union
2
+ from datetime import datetime, date, timezone
3
+ import typing
4
+ from prefixdate import DatePrefix
5
+
6
+ from followthemoney.util import sanitize_text
7
+
8
+ if typing.TYPE_CHECKING:
9
+ from followthemoney.proxy import EntityProxy
10
+
11
+ Value = Union[str, int, float, bool, date, datetime, DatePrefix, None, "EntityProxy"]
12
+ Values = Union[Value, Iterable[Value]]
13
+
14
+
15
+ def string_list(value: Any, sanitize: bool = False) -> List[str]:
16
+ """Convert a value - which may be a list or set - to a list of strings."""
17
+ # This function is called in the inner loop of placing values into entities,
18
+ # so it's unrolled to avoid the overhead of comparatively heavy ops like
19
+ # `isinstance`.
20
+ if value is None:
21
+ return []
22
+ type_ = type(value)
23
+ if type_ is str:
24
+ if sanitize:
25
+ value = sanitize_text(value)
26
+ if value is None:
27
+ return []
28
+ return [value] if len(value) > 0 else []
29
+ if type_ is int:
30
+ return [str(value)]
31
+ if type_ is float:
32
+ return [f"{value:.2f}"]
33
+ if type_ is bool:
34
+ return ["true" if value else "false"]
35
+ if type_ is date:
36
+ return [value.isoformat()]
37
+ if type_ is datetime:
38
+ if value.tzinfo is not None:
39
+ value = value.astimezone(tz=timezone.utc)
40
+ return [value.isoformat()]
41
+ if type_ is set or type_ is list or type_ is tuple:
42
+ texts: List[str] = []
43
+ for inner in value:
44
+ texts.extend(string_list(inner, sanitize=sanitize))
45
+ return texts
46
+ if isinstance(value, DatePrefix):
47
+ return [value.text] if value.text else []
48
+ # EntityProxy
49
+ try:
50
+ return string_list(value.id, sanitize=sanitize)
51
+ except AttributeError:
52
+ pass
53
+ # Entity dict
54
+ if isinstance(value, Mapping):
55
+ return string_list(value.get("id"), sanitize=sanitize)
56
+ if isinstance(value, (str, bytes)):
57
+ # Handle sub-classes of str, bytes - always sanitize
58
+ text = sanitize_text(value)
59
+ if text is None:
60
+ return []
61
+ return [text]
62
+ if isinstance(value, Iterable):
63
+ stexts: List[str] = []
64
+ for inner in value:
65
+ stexts.extend(string_list(inner, sanitize=sanitize))
66
+ return stexts
67
+ raise TypeError("Cannot convert %r to string list" % value)