followthemoney 3.8.4__py3-none-any.whl → 4.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. followthemoney/__init__.py +30 -10
  2. followthemoney/cli/__init__.py +3 -12
  3. followthemoney/cli/aggregate.py +1 -1
  4. followthemoney/cli/cli.py +1 -1
  5. followthemoney/cli/exports.py +6 -2
  6. followthemoney/cli/mapping.py +6 -4
  7. followthemoney/cli/sieve.py +1 -1
  8. followthemoney/cli/statement.py +62 -0
  9. followthemoney/cli/util.py +2 -3
  10. followthemoney/compare.py +26 -16
  11. followthemoney/dataset/__init__.py +17 -0
  12. followthemoney/dataset/catalog.py +77 -0
  13. followthemoney/dataset/coverage.py +29 -0
  14. followthemoney/dataset/dataset.py +137 -0
  15. followthemoney/dataset/publisher.py +25 -0
  16. followthemoney/dataset/resource.py +30 -0
  17. followthemoney/dataset/util.py +58 -0
  18. followthemoney/entity.py +73 -0
  19. followthemoney/exc.py +6 -0
  20. followthemoney/export/common.py +3 -3
  21. followthemoney/export/csv.py +10 -12
  22. followthemoney/export/neo4j.py +1 -1
  23. followthemoney/export/rdf.py +57 -5
  24. followthemoney/graph.py +6 -4
  25. followthemoney/mapping/csv.py +6 -18
  26. followthemoney/mapping/sql.py +3 -4
  27. followthemoney/model.py +36 -9
  28. followthemoney/namespace.py +3 -1
  29. followthemoney/ontology.py +18 -16
  30. followthemoney/property.py +12 -15
  31. followthemoney/proxy.py +44 -65
  32. followthemoney/schema/Analyzable.yaml +2 -3
  33. followthemoney/schema/BankAccount.yaml +2 -3
  34. followthemoney/schema/Company.yaml +0 -6
  35. followthemoney/schema/Contract.yaml +0 -1
  36. followthemoney/schema/CryptoWallet.yaml +1 -1
  37. followthemoney/schema/Document.yaml +0 -6
  38. followthemoney/schema/Interval.yaml +7 -0
  39. followthemoney/schema/LegalEntity.yaml +6 -0
  40. followthemoney/schema/License.yaml +2 -0
  41. followthemoney/schema/Page.yaml +0 -1
  42. followthemoney/schema/Person.yaml +0 -5
  43. followthemoney/schema/Sanction.yaml +1 -0
  44. followthemoney/schema/Thing.yaml +0 -2
  45. followthemoney/schema/UserAccount.yaml +6 -3
  46. followthemoney/schema.py +27 -39
  47. followthemoney/statement/__init__.py +19 -0
  48. followthemoney/statement/entity.py +437 -0
  49. followthemoney/statement/serialize.py +245 -0
  50. followthemoney/statement/statement.py +256 -0
  51. followthemoney/statement/util.py +31 -0
  52. followthemoney/types/__init__.py +66 -23
  53. followthemoney/types/address.py +3 -3
  54. followthemoney/types/checksum.py +3 -7
  55. followthemoney/types/common.py +9 -14
  56. followthemoney/types/country.py +3 -7
  57. followthemoney/types/date.py +21 -11
  58. followthemoney/types/email.py +0 -4
  59. followthemoney/types/entity.py +5 -11
  60. followthemoney/types/gender.py +6 -10
  61. followthemoney/types/identifier.py +9 -3
  62. followthemoney/types/ip.py +5 -9
  63. followthemoney/types/json.py +2 -2
  64. followthemoney/types/language.py +3 -7
  65. followthemoney/types/mimetype.py +4 -8
  66. followthemoney/types/name.py +7 -8
  67. followthemoney/types/number.py +88 -6
  68. followthemoney/types/phone.py +4 -11
  69. followthemoney/types/string.py +4 -4
  70. followthemoney/types/topic.py +3 -7
  71. followthemoney/types/url.py +5 -10
  72. followthemoney/util.py +12 -13
  73. followthemoney/value.py +67 -0
  74. {followthemoney-3.8.4.dist-info → followthemoney-4.0.0.dist-info}/METADATA +38 -34
  75. {followthemoney-3.8.4.dist-info → followthemoney-4.0.0.dist-info}/RECORD +78 -69
  76. {followthemoney-3.8.4.dist-info → followthemoney-4.0.0.dist-info}/entry_points.txt +1 -0
  77. {followthemoney-3.8.4.dist-info → followthemoney-4.0.0.dist-info}/licenses/LICENSE +1 -0
  78. followthemoney/offshore.py +0 -48
  79. followthemoney/rdf.py +0 -9
  80. followthemoney/schema/Assessment.yaml +0 -32
  81. followthemoney/schema/Post.yaml +0 -42
  82. followthemoney/types/iban.py +0 -58
  83. followthemoney/types/registry.py +0 -52
  84. {followthemoney-3.8.4.dist-info → followthemoney-4.0.0.dist-info}/WHEEL +0 -0
@@ -5,8 +5,8 @@ from banal import ensure_list
5
5
  from normality import stringify
6
6
  from typing import Any, Dict, Optional, Sequence, Callable, TYPE_CHECKING, TypedDict
7
7
 
8
- from followthemoney.rdf import Literal, Identifier
9
- from followthemoney.util import get_locale
8
+ from followthemoney.value import Value
9
+ from followthemoney.util import get_locale, const
10
10
  from followthemoney.util import gettext, sanitize_text
11
11
 
12
12
  if TYPE_CHECKING:
@@ -29,7 +29,7 @@ class PropertyTypeToDict(TypedDict, total=False):
29
29
  class PropertyType(object):
30
30
  """Base class for all property types."""
31
31
 
32
- name: str = "any"
32
+ name: str = const("any")
33
33
  """A machine-facing, variable safe name for the given type."""
34
34
 
35
35
  group: Optional[str] = None
@@ -87,7 +87,7 @@ class PropertyType(object):
87
87
 
88
88
  def clean(
89
89
  self,
90
- raw: Any,
90
+ raw: Value,
91
91
  fuzzy: bool = False,
92
92
  format: Optional[str] = None,
93
93
  proxy: Optional["EntityProxy"] = None,
@@ -165,11 +165,6 @@ class PropertyType(object):
165
165
  be related to (e.g. using a country prefix on a phone number or IBAN)."""
166
166
  return None
167
167
 
168
- def rdf(self, value: str) -> Identifier:
169
- """Return an RDF term to represent the given value - either a string
170
- literal, or a URI reference."""
171
- return Literal(value)
172
-
173
168
  def pick(self, values: Sequence[str]) -> Optional[str]:
174
169
  """Pick the best value to show to the user."""
175
170
  raise NotImplementedError
@@ -178,7 +173,7 @@ class PropertyType(object):
178
173
  """Return an ID suitable to identify this entity as a typed node in a
179
174
  graph representation of some FtM data. It's usually the same as the
180
175
  RDF form."""
181
- return str(self.rdf(value))
176
+ return f"{self.name}:{value}"
182
177
 
183
178
  def node_id_safe(self, value: Optional[str]) -> Optional[str]:
184
179
  """Wrapper for node_id to handle None values."""
@@ -186,7 +181,7 @@ class PropertyType(object):
186
181
  return None
187
182
  return self.node_id(value)
188
183
 
189
- def caption(self, value: str) -> Optional[str]:
184
+ def caption(self, value: str, format: Optional[str] = None) -> str:
190
185
  """Return a label for the given property value. This is often the same as the
191
186
  value, but for types like countries or languages, it would return the label,
192
187
  while other values like phone numbers can be formatted to be nicer to read."""
@@ -253,19 +248,19 @@ class EnumType(PropertyType):
253
248
 
254
249
  def clean_text(
255
250
  self,
256
- code: str,
251
+ text: str,
257
252
  fuzzy: bool = False,
258
253
  format: Optional[str] = None,
259
254
  proxy: Optional["EntityProxy"] = None,
260
255
  ) -> Optional[str]:
261
256
  """All code values are cleaned to be lowercase and trailing whitespace is
262
257
  removed."""
263
- code = code.lower().strip()
258
+ code = text.lower().strip()
264
259
  if code not in self.codes:
265
260
  return None
266
261
  return code
267
262
 
268
- def caption(self, value: str) -> str:
263
+ def caption(self, value: str, format: Optional[str] = None) -> str:
269
264
  """Given a code value, return the label that should be shown to a user."""
270
265
  return self.names.get(value, value)
271
266
 
@@ -3,9 +3,8 @@ from typing import Optional, TYPE_CHECKING
3
3
  from babel.core import Locale
4
4
  from rigour.territories import get_territory, get_ftm_countries
5
5
 
6
- from followthemoney.rdf import URIRef, Identifier
7
6
  from followthemoney.types.common import EnumType, EnumValues
8
- from followthemoney.util import defer as _
7
+ from followthemoney.util import const, defer as _
9
8
 
10
9
  if TYPE_CHECKING:
11
10
  from followthemoney.proxy import EntityProxy
@@ -17,8 +16,8 @@ class CountryType(EnumType):
17
16
  a number of unusual and controversial designations (e.g. the Soviet Union,
18
17
  Transnistria, Somaliland, Kosovo)."""
19
18
 
20
- name = "country"
21
- group = "countries"
19
+ name = const("country")
20
+ group = const("countries")
22
21
  label = _("Country")
23
22
  plural = _("Countries")
24
23
  matchable = True
@@ -52,6 +51,3 @@ class CountryType(EnumType):
52
51
 
53
52
  def country_hint(self, value: str) -> str:
54
53
  return value
55
-
56
- def rdf(self, value: str) -> Identifier:
57
- return URIRef(f"iso-3166:{value}")
@@ -4,9 +4,8 @@ from typing import Optional, TYPE_CHECKING
4
4
  from prefixdate import parse, parse_format, Precision
5
5
 
6
6
  from followthemoney.types.common import PropertyType
7
- from followthemoney.rdf import XSD, Literal, Identifier
8
7
  from followthemoney.util import defer as _
9
- from followthemoney.util import dampen
8
+ from followthemoney.util import dampen, const
10
9
 
11
10
  if TYPE_CHECKING:
12
11
  from followthemoney.proxy import EntityProxy
@@ -21,8 +20,8 @@ class DateType(PropertyType):
21
20
  The timezone is always expected to be UTC and cannot be specified otherwise. There is
22
21
  no support for calendar weeks (`2021-W7`) and date ranges (`2021-2024`)."""
23
22
 
24
- name = "date"
25
- group = "dates"
23
+ name = const("date")
24
+ group = const("dates")
26
25
  label = _("Date")
27
26
  plural = _("Dates")
28
27
  matchable = True
@@ -57,18 +56,29 @@ class DateType(PropertyType):
57
56
  prefix = os.path.commonprefix([left, right])
58
57
  return dampen(4, 10, prefix)
59
58
 
60
- def rdf(self, value: str) -> Identifier:
61
- if len(value) < Precision.HOUR.value:
62
- return Literal(value, datatype=XSD.date)
63
- return Literal(value, datatype=XSD.dateTime)
59
+ def to_datetime(self, value: str) -> Optional[datetime]:
60
+ """Convert a date string to a datetime object in UTC for handling in Python. This
61
+ will convert the unset fields beyond the prefix to the first possible value, e.g.
62
+ `2021-02` will become `2021-02-01T00:00:00Z`.
64
63
 
65
- def node_id(self, value: str) -> str:
66
- return f"date:{value}"
64
+ Args:
65
+ value (str): The date string to convert.
67
66
 
68
- def to_datetime(self, value: str) -> Optional[datetime]:
67
+ Returns:
68
+ Optional[datetime]: The parsed datetime object in UTC, or None if parsing fails.
69
+ """
69
70
  return parse(value).dt
70
71
 
71
72
  def to_number(self, value: str) -> Optional[float]:
73
+ """Convert a date string to a number, which is the number of seconds since the epoch
74
+ (1970-01-01T00:00:00Z).
75
+
76
+ Args:
77
+ value (str): The date string to convert.
78
+
79
+ Returns:
80
+ Optional[float]: The timestamp as a float, or None if parsing fails.
81
+ """
72
82
  date = self.to_datetime(value)
73
83
  if date is None:
74
84
  return None
@@ -4,7 +4,6 @@ from typing import Optional, TYPE_CHECKING
4
4
  from urllib.parse import urlparse
5
5
  from normality.cleaning import strip_quotes
6
6
 
7
- from followthemoney.rdf import URIRef, Identifier
8
7
  from followthemoney.types.common import PropertyType
9
8
  from followthemoney.util import sanitize_text, defer as _
10
9
 
@@ -80,6 +79,3 @@ class EmailType(PropertyType):
80
79
 
81
80
  # def country_hint(self, value)
82
81
  # TODO: do we want to use TLDs as country evidence?
83
-
84
- def rdf(self, value: str) -> Identifier:
85
- return URIRef("mailto:%s" % value.lower())
@@ -2,9 +2,9 @@ import re
2
2
  from typing import Any, Optional, TYPE_CHECKING
3
3
 
4
4
  from followthemoney.types.common import PropertyType
5
- from followthemoney.rdf import URIRef, Identifier
5
+ from followthemoney.value import Value
6
6
  from followthemoney.util import ENTITY_ID_LEN, get_entity_id, sanitize_text
7
- from followthemoney.util import gettext, defer as _
7
+ from followthemoney.util import const, gettext, defer as _
8
8
  from followthemoney.exc import InvalidData
9
9
 
10
10
  if TYPE_CHECKING:
@@ -22,8 +22,8 @@ class EntityType(PropertyType):
22
22
 
23
23
  REGEX_RAW = r"^[0-9a-zA-Z]([0-9a-zA-Z\.\-]*[0-9a-zA-Z])?$"
24
24
  REGEX = re.compile(REGEX_RAW)
25
- name = "entity"
26
- group = "entities"
25
+ name = const("entity")
26
+ group = const("entities")
27
27
  label = _("Entity")
28
28
  plural = _("Entities")
29
29
  matchable = True
@@ -31,7 +31,7 @@ class EntityType(PropertyType):
31
31
  max_length = ENTITY_ID_LEN
32
32
 
33
33
  def validate(
34
- self, value: str, fuzzy: bool = False, format: Optional[str] = None
34
+ self, value: Value, fuzzy: bool = False, format: Optional[str] = None
35
35
  ) -> bool:
36
36
  text = sanitize_text(value)
37
37
  if text is None:
@@ -66,9 +66,3 @@ class EntityType(PropertyType):
66
66
  if self.REGEX.match(text) is not None:
67
67
  return text
68
68
  return None
69
-
70
- def rdf(self, value: str) -> Identifier:
71
- return URIRef(f"entity:{value}")
72
-
73
- def caption(self, value: str) -> None:
74
- return None
@@ -2,8 +2,7 @@ from typing import Optional, TYPE_CHECKING
2
2
  from babel.core import Locale
3
3
 
4
4
  from followthemoney.types.common import EnumType, EnumValues
5
- from followthemoney.rdf import URIRef, Identifier
6
- from followthemoney.util import gettext, defer as _
5
+ from followthemoney.util import const, gettext, defer as _
7
6
 
8
7
  if TYPE_CHECKING:
9
8
  from followthemoney.proxy import EntityProxy
@@ -15,9 +14,9 @@ class GenderType(EnumType):
15
14
  government databases and represent it in a way that can be used by
16
15
  structured tools. I'm not sure this justifies the simplification."""
17
16
 
18
- MALE = "male"
19
- FEMALE = "female"
20
- OTHER = "other"
17
+ MALE = const("male")
18
+ FEMALE = const("female")
19
+ OTHER = const("other")
21
20
 
22
21
  LOOKUP = {
23
22
  "m": MALE,
@@ -35,8 +34,8 @@ class GenderType(EnumType):
35
34
  "divers": OTHER,
36
35
  }
37
36
 
38
- name = "gender"
39
- group = "genders"
37
+ name = const("gender")
38
+ group = const("genders")
40
39
  label = _("Gender")
41
40
  plural = _("Genders")
42
41
  matchable = False
@@ -61,6 +60,3 @@ class GenderType(EnumType):
61
60
  if code not in self.codes:
62
61
  return None
63
62
  return code
64
-
65
- def rdf(self, value: str) -> Identifier:
66
- return URIRef(f"gender:{value}")
@@ -4,7 +4,7 @@ from rigour.ids import get_identifier_format_names, get_identifier_format
4
4
 
5
5
  from followthemoney.types.common import PropertyType
6
6
  from followthemoney.util import dampen, shortest, longest
7
- from followthemoney.util import defer as _
7
+ from followthemoney.util import const, defer as _
8
8
 
9
9
  if TYPE_CHECKING:
10
10
  from followthemoney.proxy import EntityProxy
@@ -20,8 +20,8 @@ class IdentifierType(PropertyType):
20
20
  Four- or five-digit industry classifiers create more noise than value."""
21
21
 
22
22
  COMPARE_CLEAN = re.compile(r"[\W_]+")
23
- name = "identifier"
24
- group = "identifiers"
23
+ name = const("identifier")
24
+ group = const("identifiers")
25
25
  label = _("Identifier")
26
26
  plural = _("Identifiers")
27
27
  matchable = True
@@ -59,3 +59,9 @@ class IdentifierType(PropertyType):
59
59
 
60
60
  def node_id(self, value: str) -> str:
61
61
  return f"id:{value}"
62
+
63
+ def caption(self, value: str, format: Optional[str] = None) -> str:
64
+ if format in get_identifier_format_names():
65
+ format_ = get_identifier_format(format)
66
+ return format_.format(value)
67
+ return value
@@ -2,8 +2,7 @@ from typing import Optional, TYPE_CHECKING
2
2
  from ipaddress import ip_address
3
3
 
4
4
  from followthemoney.types.common import PropertyType
5
- from followthemoney.rdf import URIRef, Identifier
6
- from followthemoney.util import defer as _
5
+ from followthemoney.util import const, defer as _
7
6
 
8
7
  if TYPE_CHECKING:
9
8
  from followthemoney.proxy import EntityProxy
@@ -14,10 +13,10 @@ class IpType(PropertyType):
14
13
  by the protocol versions 4 (e.g. `192.168.1.143`) and 6
15
14
  (e.g. `0:0:0:0:0:ffff:c0a8:18f`)."""
16
15
 
17
- name = "ip"
18
- group = "ips"
19
- label = _("IP-Address")
20
- plural = _("IP-Addresses")
16
+ name = const("ip")
17
+ group = const("ips")
18
+ label = _("IP Address")
19
+ plural = _("IP Addresses")
21
20
  matchable = True
22
21
  pivot = True
23
22
  max_length = 64
@@ -45,6 +44,3 @@ class IpType(PropertyType):
45
44
  return str(ip_address(text))
46
45
  except ValueError:
47
46
  return None
48
-
49
- def rdf(self, value: str) -> Identifier:
50
- return URIRef(f"ip:{value}")
@@ -3,7 +3,7 @@ from typing import Any, Optional, Sequence, TYPE_CHECKING
3
3
  from banal import ensure_list
4
4
 
5
5
  from followthemoney.types.common import PropertyType
6
- from followthemoney.util import sanitize_text, defer as _
6
+ from followthemoney.util import const, sanitize_text, defer as _
7
7
 
8
8
  if TYPE_CHECKING:
9
9
  from followthemoney.proxy import EntityProxy
@@ -14,7 +14,7 @@ class JsonType(PropertyType):
14
14
  and some other edge cases. It's a really bad idea and we should try to get rid
15
15
  of JSON properties."""
16
16
 
17
- name = "json"
17
+ name = const("json")
18
18
  group = None
19
19
  label = _("Nested data")
20
20
  plural = _("Nested data")
@@ -3,9 +3,8 @@ from babel.core import Locale
3
3
  from rigour.langs import iso_639_alpha3
4
4
 
5
5
  from followthemoney.types.common import EnumType, EnumValues
6
- from followthemoney.rdf import URIRef, Identifier
7
6
  from followthemoney.util import defer as _, gettext
8
- from followthemoney.util import get_env_list
7
+ from followthemoney.util import const, get_env_list
9
8
 
10
9
  if TYPE_CHECKING:
11
10
  from followthemoney.proxy import EntityProxy
@@ -17,8 +16,8 @@ class LanguageType(EnumType):
17
16
  for additional languages once there is a specific need for them to be
18
17
  supported."""
19
18
 
20
- name = "language"
21
- group = "languages"
19
+ name = const("language")
20
+ group = const("languages")
22
21
  label = _("Language")
23
22
  plural = _("Languages")
24
23
  matchable = False
@@ -120,6 +119,3 @@ class LanguageType(EnumType):
120
119
  if code not in self.LANGUAGES:
121
120
  return None
122
121
  return code
123
-
124
- def rdf(self, value: str) -> Identifier:
125
- return URIRef(f"iso-639:{value}")
@@ -3,8 +3,7 @@ from rigour.mime import normalize_mimetype, parse_mimetype
3
3
  from rigour.mime import DEFAULT
4
4
 
5
5
  from followthemoney.types.common import PropertyType
6
- from followthemoney.rdf import URIRef, Identifier
7
- from followthemoney.util import defer as _
6
+ from followthemoney.util import const, defer as _
8
7
 
9
8
  if TYPE_CHECKING:
10
9
  from followthemoney.proxy import EntityProxy
@@ -19,8 +18,8 @@ class MimeType(PropertyType):
19
18
  MIME type properties do not contain parameters as used in HTTP headers,
20
19
  like `charset=UTF-8`."""
21
20
 
22
- name = "mimetype"
23
- group = "mimetypes"
21
+ name = const("mimetype")
22
+ group = const("mimetypes")
24
23
  label = _("MIME-Type")
25
24
  plural = _("MIME-Types")
26
25
  matchable = False
@@ -37,8 +36,5 @@ class MimeType(PropertyType):
37
36
  return text
38
37
  return None
39
38
 
40
- def rdf(self, value: str) -> Identifier:
41
- return URIRef(f"urn:mimetype:{value}")
42
-
43
- def caption(self, value: str) -> str:
39
+ def caption(self, value: str, format: Optional[str] = None) -> str:
44
40
  return parse_mimetype(value).label or value
@@ -2,13 +2,12 @@ from typing import TYPE_CHECKING, Optional, Sequence
2
2
  from normality import slugify
3
3
  from normality.cleaning import collapse_spaces, strip_quotes
4
4
  from rigour.env import MAX_NAME_LENGTH
5
- from rigour.names import pick_name
5
+ from rigour.names import pick_name, tokenize_name
6
6
  from rigour.text.distance import levenshtein_similarity
7
- from fingerprints.cleanup import clean_name_light
8
7
 
9
8
  from followthemoney.types.common import PropertyType
10
9
  from followthemoney.util import dampen
11
- from followthemoney.util import defer as _
10
+ from followthemoney.util import const, defer as _
12
11
 
13
12
  if TYPE_CHECKING:
14
13
  from followthemoney.proxy import EntityProxy
@@ -22,8 +21,8 @@ class NameType(PropertyType):
22
21
  No validation rules apply, and things having multiple names must be considered
23
22
  a perfectly ordinary case."""
24
23
 
25
- name = "name"
26
- group = "names"
24
+ name = const("name")
25
+ group = const("names")
27
26
  label = _("Name")
28
27
  plural = _("Names")
29
28
  matchable = True
@@ -51,9 +50,9 @@ class NameType(PropertyType):
51
50
 
52
51
  def compare(self, left: str, right: str) -> float:
53
52
  """Compare two names for similarity."""
54
- left_clean = clean_name_light(left)
55
- right_clean = clean_name_light(right)
56
- if left_clean is None or right_clean is None:
53
+ left_clean = " ".join(tokenize_name(left.lower()))
54
+ right_clean = " ".join(tokenize_name(right.lower()))
55
+ if not len(left_clean) or not len(right_clean):
57
56
  return 0.0
58
57
  return levenshtein_similarity(
59
58
  left_clean,
@@ -1,8 +1,8 @@
1
1
  import re
2
- from typing import Optional
2
+ from typing import Optional, Tuple
3
3
 
4
4
  from followthemoney.types.common import PropertyType
5
- from followthemoney.util import defer as _
5
+ from followthemoney.util import const, defer as _
6
6
 
7
7
 
8
8
  class NumberType(PropertyType):
@@ -13,8 +13,18 @@ class NumberType(PropertyType):
13
13
  In the future we might want to enable annotations for format, units, or
14
14
  even to introduce a separate property type for monetary values."""
15
15
 
16
- CAST_RE = re.compile(r"[^0-9\-\.]")
17
- name = "number"
16
+ DECIMAL = "."
17
+ SEPARATOR = ","
18
+ PRECISION = 2
19
+
20
+ _NUM_UNIT_RE = (
21
+ f"(\\s?\\-?\\s?\\d+(?:{re.escape(DECIMAL)}\\d+)?)\\s*([^\\s\\d][^\\s]*)?"
22
+ )
23
+ NUM_UNIT_RE = re.compile(_NUM_UNIT_RE, re.UNICODE)
24
+ _FLOAT_FMT = "{:" + SEPARATOR + "." + str(PRECISION) + "f}"
25
+ _INT_FMT = "{:" + SEPARATOR + "d}"
26
+
27
+ name = const("number")
18
28
  label = _("Number")
19
29
  plural = _("Numbers")
20
30
  matchable = False
@@ -22,9 +32,81 @@ class NumberType(PropertyType):
22
32
  def node_id(self, value: str) -> None:
23
33
  return None
24
34
 
35
+ def parse(
36
+ self, value: str, decimal: str = DECIMAL, separator: str = SEPARATOR
37
+ ) -> Tuple[Optional[str], Optional[str]]:
38
+ """Parse a number into a numeric value and a unit. The numeric value is
39
+ aligned with the decimal and separator settings. The unit is stripped of
40
+ whitespace and returned as a string. If no unit is found, None is
41
+ returned. If no number is found, None is returned for both values.
42
+
43
+ Args:
44
+ value (str): The string to parse.
45
+ decimal (str): The character used as the decimal separator.
46
+ separator (str): The character used to separate thousands, lakhs, or crores.
47
+
48
+ Returns:
49
+ A tuple of (number, unit), where number is a string and unit is a string or None.
50
+ """
51
+ value = value.replace(separator, "")
52
+ if decimal != self.DECIMAL:
53
+ value = value.replace(decimal, self.DECIMAL)
54
+ match = self.NUM_UNIT_RE.match(value)
55
+ if not match:
56
+ return None, None
57
+ number, unit = match.groups()
58
+ if unit is not None:
59
+ unit = unit.strip()
60
+ if len(unit) == 0:
61
+ unit = None
62
+ # TODO: We could have a lookup table for common units, e.g. kg, m, etc. to
63
+ # convert them to a standard form.
64
+ number = number.replace(" ", "")
65
+ if number == "":
66
+ number = None
67
+ return number, unit
68
+
25
69
  def to_number(self, value: str) -> Optional[float]:
70
+ """Convert a number string to a float. The string is parsed and the unit is
71
+ discarded if present.
72
+
73
+ Args:
74
+ value (str): The string to convert.
75
+
76
+ Returns:
77
+ Optional[float]: The parsed float value, or None if parsing fails.
78
+ """
26
79
  try:
27
- value = self.CAST_RE.sub("", value)
28
- return float(value)
80
+ number, _ = self.parse(value)
81
+ if number is None:
82
+ return None
83
+ return float(number)
29
84
  except Exception:
30
85
  return None
86
+
87
+ def caption(self, value: str, format: Optional[str] = None) -> str:
88
+ """Return a caption for the number. This is used for display purposes.
89
+
90
+ Args:
91
+ value (str): The string to format.
92
+ format (Optional[str]): An optional format string to use for formatting the number.
93
+
94
+ Returns:
95
+ str: The formatted number string, possibly with a unit.
96
+ """
97
+ number, unit = self.parse(value)
98
+ if number is None:
99
+ return value
100
+ try:
101
+ fnumber = float(number)
102
+ except ValueError:
103
+ return value
104
+ if format is not None:
105
+ number = format.format(fnumber)
106
+ elif fnumber.is_integer():
107
+ number = self._INT_FMT.format(int(fnumber))
108
+ else:
109
+ number = self._FLOAT_FMT.format(fnumber)
110
+ if unit is not None:
111
+ return f"{number} {unit}"
112
+ return number
@@ -5,9 +5,8 @@ from phonenumbers import PhoneNumber, PhoneNumberFormat
5
5
  from phonenumbers.phonenumberutil import region_code_for_number, NumberParseException
6
6
 
7
7
  from followthemoney.types.common import PropertyType
8
- from followthemoney.rdf import URIRef, Identifier
9
8
  from followthemoney.util import defer as _
10
- from followthemoney.util import dampen
9
+ from followthemoney.util import const, dampen
11
10
 
12
11
  if TYPE_CHECKING:
13
12
  from followthemoney.proxy import EntityProxy
@@ -30,8 +29,8 @@ class PhoneType(PropertyType):
30
29
  validation outcome from doing the two operations the other way around. Always
31
30
  define the country first."""
32
31
 
33
- name = "phone"
34
- group = "phones"
32
+ name = const("phone")
33
+ group = const("phones")
35
34
  label = _("Phone number")
36
35
  plural = _("Phone numbers")
37
36
  matchable = True
@@ -97,16 +96,10 @@ class PhoneType(PropertyType):
97
96
  # TODO: insert artificial intelligence here.
98
97
  return dampen(7, 11, value)
99
98
 
100
- def rdf(self, value: str) -> Identifier:
101
- node_id = self.node_id(value)
102
- if node_id is not None:
103
- return URIRef(node_id)
104
- raise ValueError("Invalid phone number for serialisation: %s" % value)
105
-
106
99
  def node_id(self, value: str) -> Optional[str]:
107
100
  return f"tel:{value}"
108
101
 
109
- def caption(self, value: str) -> str:
102
+ def caption(self, value: str, format: Optional[str] = None) -> str:
110
103
  try:
111
104
  number = parse_number(value)
112
105
  formatted = format_number(number, PhoneNumberFormat.INTERNATIONAL)
@@ -1,12 +1,12 @@
1
1
  from followthemoney.types.common import PropertyType
2
- from followthemoney.util import defer as _
2
+ from followthemoney.util import const, defer as _
3
3
  from followthemoney.util import MEGABYTE
4
4
 
5
5
 
6
6
  class StringType(PropertyType):
7
7
  """A simple string property with no additional semantics."""
8
8
 
9
- name = "string"
9
+ name = const("string")
10
10
  label = _("Label")
11
11
  plural = _("Labels")
12
12
  matchable = False
@@ -21,7 +21,7 @@ class TextType(StringType):
21
21
  string properties, it might make sense to treat properties of this type as
22
22
  full-text search material."""
23
23
 
24
- name = "text"
24
+ name = const("text")
25
25
  label = _("Text")
26
26
  plural = _("Texts")
27
27
  total_size = 30 * MEGABYTE
@@ -36,7 +36,7 @@ class HTMLType(StringType):
36
36
  to perform server-side sanitisation, or to not render this property at all.
37
37
  """
38
38
 
39
- name = "html"
39
+ name = const("html")
40
40
  label = _("HTML")
41
41
  plural = _("HTMLs")
42
42
  total_size = 30 * MEGABYTE
@@ -1,8 +1,7 @@
1
1
  from babel.core import Locale
2
2
 
3
3
  from followthemoney.types.common import EnumType, EnumValues
4
- from followthemoney.rdf import URIRef, Identifier
5
- from followthemoney.util import gettext, defer as _
4
+ from followthemoney.util import const, gettext, defer as _
6
5
 
7
6
 
8
7
  class TopicType(EnumType):
@@ -16,8 +15,8 @@ class TopicType(EnumType):
16
15
  enable queries such as _find all paths between a government procurement
17
16
  award and a politician_."""
18
17
 
19
- name = "topic"
20
- group = "topics"
18
+ name = const("topic")
19
+ group = const("topics")
21
20
  label = _("Topic")
22
21
  plural = _("Topics")
23
22
  matchable = False
@@ -90,6 +89,3 @@ class TopicType(EnumType):
90
89
 
91
90
  def _locale_names(self, locale: Locale) -> EnumValues:
92
91
  return {k: gettext(v) for (k, v) in self._TOPICS.items()}
93
-
94
- def rdf(self, value: str) -> Identifier:
95
- return URIRef(f"ftm:topic:{value}")