followthemoney 3.8.4__py3-none-any.whl → 4.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. followthemoney/__init__.py +30 -10
  2. followthemoney/cli/__init__.py +3 -12
  3. followthemoney/cli/aggregate.py +1 -1
  4. followthemoney/cli/cli.py +1 -1
  5. followthemoney/cli/exports.py +6 -2
  6. followthemoney/cli/mapping.py +6 -4
  7. followthemoney/cli/sieve.py +1 -1
  8. followthemoney/cli/statement.py +62 -0
  9. followthemoney/cli/util.py +2 -3
  10. followthemoney/compare.py +26 -16
  11. followthemoney/dataset/__init__.py +17 -0
  12. followthemoney/dataset/catalog.py +77 -0
  13. followthemoney/dataset/coverage.py +29 -0
  14. followthemoney/dataset/dataset.py +137 -0
  15. followthemoney/dataset/publisher.py +25 -0
  16. followthemoney/dataset/resource.py +30 -0
  17. followthemoney/dataset/util.py +58 -0
  18. followthemoney/entity.py +73 -0
  19. followthemoney/exc.py +6 -0
  20. followthemoney/export/common.py +3 -3
  21. followthemoney/export/csv.py +10 -12
  22. followthemoney/export/neo4j.py +1 -1
  23. followthemoney/export/rdf.py +57 -5
  24. followthemoney/graph.py +6 -4
  25. followthemoney/mapping/csv.py +6 -18
  26. followthemoney/mapping/sql.py +3 -4
  27. followthemoney/model.py +36 -9
  28. followthemoney/namespace.py +3 -1
  29. followthemoney/ontology.py +18 -16
  30. followthemoney/property.py +12 -15
  31. followthemoney/proxy.py +44 -65
  32. followthemoney/schema/Analyzable.yaml +2 -3
  33. followthemoney/schema/BankAccount.yaml +2 -3
  34. followthemoney/schema/Company.yaml +0 -6
  35. followthemoney/schema/Contract.yaml +0 -1
  36. followthemoney/schema/CryptoWallet.yaml +1 -1
  37. followthemoney/schema/Document.yaml +0 -6
  38. followthemoney/schema/Interval.yaml +7 -0
  39. followthemoney/schema/LegalEntity.yaml +6 -0
  40. followthemoney/schema/License.yaml +2 -0
  41. followthemoney/schema/Page.yaml +0 -1
  42. followthemoney/schema/Person.yaml +0 -5
  43. followthemoney/schema/Sanction.yaml +1 -0
  44. followthemoney/schema/Thing.yaml +0 -2
  45. followthemoney/schema/UserAccount.yaml +6 -3
  46. followthemoney/schema.py +27 -39
  47. followthemoney/statement/__init__.py +19 -0
  48. followthemoney/statement/entity.py +437 -0
  49. followthemoney/statement/serialize.py +245 -0
  50. followthemoney/statement/statement.py +256 -0
  51. followthemoney/statement/util.py +31 -0
  52. followthemoney/types/__init__.py +66 -23
  53. followthemoney/types/address.py +3 -3
  54. followthemoney/types/checksum.py +3 -7
  55. followthemoney/types/common.py +9 -14
  56. followthemoney/types/country.py +3 -7
  57. followthemoney/types/date.py +21 -11
  58. followthemoney/types/email.py +0 -4
  59. followthemoney/types/entity.py +5 -11
  60. followthemoney/types/gender.py +6 -10
  61. followthemoney/types/identifier.py +9 -3
  62. followthemoney/types/ip.py +5 -9
  63. followthemoney/types/json.py +2 -2
  64. followthemoney/types/language.py +3 -7
  65. followthemoney/types/mimetype.py +4 -8
  66. followthemoney/types/name.py +7 -8
  67. followthemoney/types/number.py +88 -6
  68. followthemoney/types/phone.py +4 -11
  69. followthemoney/types/string.py +4 -4
  70. followthemoney/types/topic.py +3 -7
  71. followthemoney/types/url.py +5 -10
  72. followthemoney/util.py +12 -13
  73. followthemoney/value.py +67 -0
  74. {followthemoney-3.8.4.dist-info → followthemoney-4.0.0.dist-info}/METADATA +38 -34
  75. {followthemoney-3.8.4.dist-info → followthemoney-4.0.0.dist-info}/RECORD +78 -69
  76. {followthemoney-3.8.4.dist-info → followthemoney-4.0.0.dist-info}/entry_points.txt +1 -0
  77. {followthemoney-3.8.4.dist-info → followthemoney-4.0.0.dist-info}/licenses/LICENSE +1 -0
  78. followthemoney/offshore.py +0 -48
  79. followthemoney/rdf.py +0 -9
  80. followthemoney/schema/Assessment.yaml +0 -32
  81. followthemoney/schema/Post.yaml +0 -42
  82. followthemoney/types/iban.py +0 -58
  83. followthemoney/types/registry.py +0 -52
  84. {followthemoney-3.8.4.dist-info → followthemoney-4.0.0.dist-info}/WHEEL +0 -0
@@ -0,0 +1,245 @@
1
+ import csv
2
+ import click
3
+ import orjson
4
+ from io import TextIOWrapper
5
+ from pathlib import Path
6
+ from types import TracebackType
7
+ from typing import cast
8
+ from typing import BinaryIO, Generator, Iterable, List, Optional, TextIO, Type
9
+ from rigour.boolean import text_bool
10
+
11
+ from followthemoney.statement.statement import Statement, StatementDict
12
+ from followthemoney.statement.util import unpack_prop
13
+
14
+
15
# Identifiers of the supported statement serialisation formats.
JSON = "json"
CSV = "csv"
PACK = "pack"
FORMATS = [JSON, CSV, PACK]

# Number of rows the CSV-based writers buffer before flushing them.
CSV_BATCH = 5000

# Column order of the full CSV format (header row and every data row).
CSV_COLUMNS = [
    "canonical_id",
    "entity_id",
    "prop",
    "prop_type",
    "schema",
    "value",
    "dataset",
    "origin",
    "lang",
    "original_value",
    "external",
    "first_seen",
    "last_seen",
    "id",
]

# Column order assumed for pack files that do not start with a header row.
LEGACY_PACK_COLUMNS = [
    "entity_id",
    "prop",
    "value",
    "dataset",
    "lang",
    "original_value",
    "target",
    "external",
    "first_seen",
    "last_seen",
]
49
+
50
+
51
def read_json_statements(
    fh: BinaryIO,
    max_line: int = 40 * 1024 * 1024,
) -> Generator[Statement, None, None]:
    """Yield statements from a line-delimited JSON byte stream.

    Each read is capped at ``max_line`` bytes via ``fh.readline(max_line)``;
    iteration ends at the first empty read.
    """
    while True:
        line = fh.readline(max_line)
        if not line:
            break
        yield Statement.from_dict(orjson.loads(line))
58
+
59
+
60
def read_csv_statements(fh: BinaryIO) -> Generator[Statement, None, None]:
    """Yield statements from a UTF-8 encoded CSV stream with a header row.

    The ``external`` column is converted with ``text_bool`` (falling back to
    ``False``), and empty ``lang`` / ``original_value`` cells become ``None``.
    """
    text_stream = TextIOWrapper(fh, encoding="utf-8")
    reader = csv.DictReader(text_stream, dialect=csv.unix_dialect)
    for record in reader:
        # The row dict is re-typed and patched in place rather than copied.
        data = cast(StatementDict, record)
        external = text_bool(record.get("external"))
        data["external"] = external or False
        if record.get("lang") == "":
            data["lang"] = None
        if record.get("original_value") == "":
            data["original_value"] = None
        yield Statement.from_dict(data)
70
+
71
+
72
def read_pack_statements(fh: BinaryIO) -> Generator[Statement, None, None]:
    """Decode a binary pack stream as UTF-8 and yield the statements in it."""
    text_stream = TextIOWrapper(fh, encoding="utf-8")
    for stmt in read_pack_statements_decoded(text_stream):
        yield stmt
75
+
76
+
77
def read_pack_statements_decoded(fh: TextIO) -> Generator[Statement, None, None]:
    """Yield statements from an already-decoded pack (compact CSV) stream.

    The first non-empty row decides the column layout: if it contains both
    ``entity_id`` and ``prop`` it is treated as a header row and skipped;
    otherwise the file is assumed to be a legacy, header-less pack file laid
    out as ``LEGACY_PACK_COLUMNS`` and that first row is parsed as data.

    Empty cells in optional columns are returned as ``None``, mirroring how
    ``None`` values are written out as empty cells.
    """
    headers: Optional[List[str]] = None
    for row in csv.reader(fh, dialect=csv.unix_dialect):
        if not row:
            # Blank line: nothing to parse and no columns to look up.
            continue
        if headers is None:
            if "entity_id" in row and "prop" in row:
                headers = row
                continue
            # This is a legacy pack file, with no headers. The current row is
            # already data, so it must fall through instead of being skipped.
            headers = LEGACY_PACK_COLUMNS
        data = dict(zip(headers, row))
        schema, _, prop = unpack_prop(data["prop"])
        yield Statement(
            entity_id=data["entity_id"],
            prop=prop,
            schema=schema,
            value=data["value"],
            dataset=data["dataset"],
            lang=data.get("lang") or None,
            original_value=data.get("original_value") or None,
            origin=data.get("origin") or None,
            first_seen=data["first_seen"] or None,
            external=data["external"] == "t",
            canonical_id=data["entity_id"],
            last_seen=data["last_seen"] or None,
            # An empty id cell means "no id": let Statement derive the key.
            id=data.get("id") or None,
        )
104
+
105
+
106
def read_statements(fh: BinaryIO, format: str) -> Generator[Statement, None, None]:
    """Yield statements from ``fh`` using the reader for ``format``.

    ``CSV`` and ``PACK`` select their dedicated readers; any other value is
    read as line-delimited JSON.
    """
    reader = read_json_statements
    if format == CSV:
        reader = read_csv_statements
    elif format == PACK:
        reader = read_pack_statements
    yield from reader(fh)
113
+
114
+
115
def read_path_statements(path: Path, format: str) -> Generator[Statement, None, None]:
    """Yield statements from the file at ``path``, or from stdin for ``-``."""
    if str(path) != "-":
        with open(path, "rb") as fh:
            yield from read_statements(fh, format=format)
        return
    # A path of "-" means: read from the process' binary standard input.
    stdin = click.get_binary_stream("stdin")
    yield from read_statements(stdin, format=format)
122
+
123
+
124
def get_statement_writer(fh: BinaryIO, format: str) -> "StatementWriter":
    """Build the statement writer matching ``format`` on top of ``fh``.

    The CSV and pack writers receive a UTF-8 text wrapper around ``fh``; the
    JSON writer uses the binary stream directly.

    Raises:
        RuntimeError: if ``format`` is not one of the known formats.
    """
    if format == JSON:
        return JSONStatementWriter(fh)
    if format in (CSV, PACK):
        text_stream = TextIOWrapper(fh, encoding="utf-8")
        if format == CSV:
            return CSVStatementWriter(text_stream)
        return PackStatementWriter(text_stream)
    raise RuntimeError("Unknown statement format: %s" % format)
134
+
135
+
136
def write_statements(
    fh: BinaryIO, format: str, statements: Iterable[Statement]
) -> None:
    """Write all ``statements`` to ``fh`` in the given ``format``.

    The writer is used as a context manager so that it is closed even when
    iterating ``statements`` or writing one of them raises.

    Raises:
        RuntimeError: if ``format`` is not a known statement format.
    """
    with get_statement_writer(fh, format) as writer:
        for stmt in statements:
            writer.write(stmt)
143
+
144
+
145
class StatementWriter(object):
    """Base class for statement writers.

    Subclasses implement ``write`` and ``close``. Instances can be used as
    context managers; leaving the ``with`` block calls ``close``.
    """

    def __enter__(self) -> "StatementWriter":
        return self

    def __exit__(
        self,
        type: Optional[Type[BaseException]],
        value: Optional[BaseException],
        traceback: Optional[TracebackType],
    ) -> None:
        # Close unconditionally; returning None lets any exception propagate.
        self.close()

    def write(self, stmt: Statement) -> None:
        """Serialise a single statement (must be provided by subclasses)."""
        raise NotImplementedError()

    def close(self) -> None:
        """Finish writing (must be provided by subclasses)."""
        raise NotImplementedError()
162
+
163
+
164
class JSONStatementWriter(StatementWriter):
    """Writes each statement as one JSON object per line to a binary stream."""

    def __init__(self, fh: BinaryIO) -> None:
        self.fh = fh

    def write(self, stmt: Statement) -> None:
        """Encode ``stmt.to_dict()`` with orjson and append it as a line."""
        encoded = orjson.dumps(stmt.to_dict(), option=orjson.OPT_APPEND_NEWLINE)
        self.fh.write(encoded)

    def close(self) -> None:
        """Close the underlying stream."""
        self.fh.close()
175
+
176
+
177
class CSVStatementWriter(StatementWriter):
    """Writes statements as CSV rows in ``CSV_COLUMNS`` order.

    The header row is written on construction. Rows are buffered and written
    in batches of ``CSV_BATCH``; ``close`` writes whatever is still buffered.
    """

    def __init__(self, fh: TextIO) -> None:
        self.fh = fh
        self.writer = csv.writer(self.fh, dialect=csv.unix_dialect)
        self.writer.writerow(CSV_COLUMNS)
        self._batch: List[List[Optional[str]]] = []

    def _flush(self) -> None:
        """Write all buffered rows and empty the buffer."""
        if self._batch:
            self.writer.writerows(self._batch)
            self._batch.clear()

    def write(self, stmt: Statement) -> None:
        """Buffer one statement, flushing once the batch is full."""
        row = stmt.to_csv_row()
        self._batch.append([row[c] for c in CSV_COLUMNS])
        if len(self._batch) >= CSV_BATCH:
            self._flush()

    def close(self) -> None:
        """Flush buffered rows and close the stream.

        The buffer is emptied after flushing, so closing twice (for example an
        explicit ``close()`` inside a ``with`` block) does not try to write
        the same rows again to the already-closed stream.
        """
        self._flush()
        self.fh.close()
195
+
196
+
197
class PackStatementWriter(StatementWriter):
    """Writes statements in the compact "pack" CSV layout.

    Compared to the full CSV format, schema and property are combined into a
    single ``schema:prop`` column and ``external`` is written as ``"t"`` or an
    empty cell. The header row is written on construction; rows are buffered
    and written in batches of ``CSV_BATCH``.
    """

    def __init__(self, fh: TextIO) -> None:
        self.fh = fh
        self.writer = csv.writer(
            self.fh,
            dialect=csv.unix_dialect,
            quoting=csv.QUOTE_MINIMAL,
        )
        columns = [
            "entity_id",
            "prop",
            "value",
            "dataset",
            "lang",
            "original_value",
            "origin",
            "external",
            "first_seen",
            "last_seen",
            "id",
        ]
        self.writer.writerow(columns)
        self._batch: List[List[Optional[str]]] = []

    def _flush(self) -> None:
        """Write all buffered rows and empty the buffer."""
        if self._batch:
            self.writer.writerows(self._batch)
            self._batch.clear()

    def write(self, stmt: Statement) -> None:
        """Buffer one statement, flushing once the batch is full."""
        # HACK: This is very similar to the CSV writer, but at the very inner
        # loop of the application, so we're duplicating code here.
        row = [
            stmt.entity_id,
            f"{stmt.schema}:{stmt.prop}",
            stmt.value,
            stmt.dataset,
            stmt.lang,
            stmt.original_value,
            stmt.origin,
            "t" if stmt.external else None,
            stmt.first_seen,
            stmt.last_seen,
            stmt.id,
        ]
        self._batch.append(row)
        if len(self._batch) >= CSV_BATCH:
            self._flush()

    def close(self) -> None:
        """Flush buffered rows and close the stream.

        The buffer is emptied after flushing, so a repeated ``close`` does not
        attempt to write the same rows to the already-closed stream.
        """
        self._flush()
        self.fh.close()
@@ -0,0 +1,256 @@
1
+ import hashlib
2
+ import warnings
3
+ from sqlalchemy.engine import Row
4
+ from typing import cast
5
+ from typing import Any, Dict, Generator, Optional
6
+ from typing_extensions import TypedDict, Self
7
+ from rigour.time import datetime_iso, iso_datetime
8
+ from rigour.boolean import bool_text
9
+
10
+ from followthemoney.proxy import EntityProxy
11
+ from followthemoney.statement.util import get_prop_type, BASE_ID
12
+
13
+
14
class StatementDict(TypedDict):
    """Dictionary form of a statement, as produced by ``Statement.to_dict``."""

    # Statement key; ``Statement`` derives one when none is supplied.
    id: Optional[str]
    # ID of the entity the statement was made about.
    entity_id: str
    # ``Statement`` falls back to ``entity_id`` when this is not provided.
    canonical_id: str
    # Property name and the schema it is interpreted against.
    prop: str
    schema: str
    value: str
    dataset: str
    lang: Optional[str]
    original_value: Optional[str]
    external: bool
    # Observation timestamps, carried as strings.
    first_seen: Optional[str]
    last_seen: Optional[str]
    origin: Optional[str]
28
+
29
+
30
class Statement(object):
    """A single statement about a property relevant to an entity.

    For example, this could be used to say: "In dataset A, entity X has the
    property `name` set to 'John Smith'. I first observed this at K, and last
    saw it at L."

    Null property values are not supported. This might need to change if we
    want to support making property-less entities.

    Statements hash, compare equal and sort by their ``id``.
    """

    BASE = BASE_ID

    __slots__ = [
        "id",
        "entity_id",
        "canonical_id",
        "prop",
        "schema",
        "value",
        "dataset",
        "lang",
        "original_value",
        "external",
        "first_seen",
        "last_seen",
        "origin",
    ]

    def __init__(
        self,
        entity_id: str,
        prop: str,
        schema: str,
        value: str,
        dataset: str,
        lang: Optional[str] = None,
        original_value: Optional[str] = None,
        first_seen: Optional[str] = None,
        external: bool = False,
        id: Optional[str] = None,
        canonical_id: Optional[str] = None,
        last_seen: Optional[str] = None,
        origin: Optional[str] = None,
    ):
        """Create a statement.

        ``canonical_id`` defaults to ``entity_id``, ``last_seen`` defaults to
        ``first_seen``, and a missing ``id`` is derived via ``generate_key``.
        """
        self.entity_id = entity_id
        self.canonical_id = canonical_id or entity_id
        self.prop = prop
        self.schema = schema
        self.value = value
        self.dataset = dataset
        self.lang = lang
        self.original_value = original_value
        self.first_seen = first_seen
        self.last_seen = last_seen or first_seen
        self.external = external
        self.origin = origin
        # The key depends on the fields assigned above, so compute it last.
        if id is None:
            id = self.generate_key()
        self.id = id

    @property
    def prop_type(self) -> str:
        """The type of the property, e.g. 'string', 'number', 'url'."""
        return get_prop_type(self.schema, self.prop)

    def to_dict(self) -> StatementDict:
        """Return all fields of the statement as a plain dictionary."""
        return {
            "canonical_id": self.canonical_id,
            "entity_id": self.entity_id,
            "prop": self.prop,
            "schema": self.schema,
            "value": self.value,
            "dataset": self.dataset,
            "lang": self.lang,
            "original_value": self.original_value,
            "first_seen": self.first_seen,
            "last_seen": self.last_seen,
            "external": self.external,
            "origin": self.origin,
            "id": self.id,
        }

    def to_csv_row(self) -> Dict[str, Optional[str]]:
        """Return the statement as a CSV row mapping.

        ``external`` is rendered as text via ``bool_text`` and a ``prop_type``
        column is added.
        """
        data = cast(Dict[str, Optional[str]], self.to_dict())
        data["external"] = bool_text(self.external)
        data["prop_type"] = get_prop_type(self.schema, self.prop)
        return data

    def to_db_row(self) -> Dict[str, Any]:
        """Return the statement as a database row mapping.

        ``first_seen`` and ``last_seen`` are passed through ``iso_datetime``
        and a ``prop_type`` column is added.
        """
        data = cast(Dict[str, Any], self.to_dict())
        data["first_seen"] = iso_datetime(self.first_seen)
        data["last_seen"] = iso_datetime(self.last_seen)
        data["prop_type"] = get_prop_type(self.schema, self.prop)
        return data

    def __hash__(self) -> int:
        if self.id is None:
            warnings.warn(
                "Hashing a statement without an ID results in undefined behaviour",
                RuntimeWarning,
            )
        return hash(self.id)

    def __repr__(self) -> str:
        return "<Statement(%r, %r, %r)>" % (self.entity_id, self.prop, self.value)

    def __eq__(self, other: Any) -> bool:
        """Two statements are equal when their IDs are equal.

        Objects without an ``id`` attribute (e.g. ``None``) yield
        ``NotImplemented`` so Python falls back to its default comparison
        instead of raising ``AttributeError``.
        """
        try:
            other_id = other.id
        except AttributeError:
            return NotImplemented
        return not self.id != other_id

    def __lt__(self, other: Any) -> bool:
        """Order statements so that ``BASE_ID`` statements sort first, then by ID.

        Objects lacking ``prop`` or ``id`` yield ``NotImplemented``.
        """
        try:
            other_key = (other.prop != BASE_ID, other.id or "")
        except AttributeError:
            return NotImplemented
        self_key = (self.prop != BASE_ID, self.id or "")
        return self_key < other_key

    def clone(self: Self) -> "Statement":
        """Make a deep copy of the given statement."""
        return Statement.from_dict(self.to_dict())

    def generate_key(self) -> Optional[str]:
        """Compute the statement key from this statement's own fields."""
        return self.make_key(
            self.dataset,
            self.entity_id,
            self.prop,
            self.value,
            self.external,
        )

    @classmethod
    def make_key(
        cls,
        dataset: str,
        entity_id: str,
        prop: str,
        value: str,
        external: Optional[bool],
    ) -> Optional[str]:
        """Hash the key properties of a statement record to make a unique ID.

        Returns ``None`` when ``prop`` or ``value`` is ``None``; otherwise the
        SHA-1 hex digest of the dot-joined key fields.
        """
        if prop is None or value is None:
            return None
        key = f"{dataset}.{entity_id}.{prop}.{value}"
        if external:
            # We consider the external flag in key composition to avoid race conditions
            # where a certain entity might be emitted as external while it is already
            # linked in to the graph via another route.
            key = f"{key}.ext"
        return hashlib.sha1(key.encode("utf-8")).hexdigest()

    @classmethod
    def from_dict(cls, data: StatementDict) -> "Statement":
        """Build a statement from its dictionary form.

        ``entity_id``, ``prop``, ``schema``, ``value`` and ``dataset`` are
        required keys; all other keys are optional.
        """
        return cls(
            entity_id=data["entity_id"],
            prop=data["prop"],
            schema=data["schema"],
            value=data["value"],
            dataset=data["dataset"],
            lang=data.get("lang", None),
            original_value=data.get("original_value", None),
            first_seen=data.get("first_seen", None),
            external=data.get("external", False),
            id=data.get("id", None),
            canonical_id=data.get("canonical_id", None),
            last_seen=data.get("last_seen", None),
            origin=data.get("origin", None),
        )

    @classmethod
    def from_db_row(cls, row: Row[Any]) -> "Statement":
        """Build a statement from a database row.

        ``first_seen`` and ``last_seen`` are passed through ``datetime_iso``.
        """
        return cls(
            id=row.id,
            canonical_id=row.canonical_id,
            entity_id=row.entity_id,
            prop=row.prop,
            schema=row.schema,
            value=row.value,
            dataset=row.dataset,
            lang=row.lang,
            original_value=row.original_value,
            first_seen=datetime_iso(row.first_seen),
            external=row.external,
            last_seen=datetime_iso(row.last_seen),
            origin=row.origin,
        )

    @classmethod
    def from_entity(
        cls,
        entity: "EntityProxy",
        dataset: str,
        first_seen: Optional[str] = None,
        last_seen: Optional[str] = None,
        external: bool = False,
        origin: Optional[str] = None,
    ) -> Generator["Statement", None, None]:
        """Yield the statements describing ``entity``.

        A ``StatementEntity`` yields its existing statements unchanged. For
        any other entity, one ``BASE_ID`` statement carrying the entity ID is
        yielded first, followed by one statement per property value.

        Raises:
            ValueError: if the entity has no ID.
        """
        # Imported here rather than at module level — presumably to avoid a
        # circular import with the entity module; verify before moving it.
        from followthemoney.statement.entity import StatementEntity

        if entity.id is None:
            raise ValueError("Cannot create statements for entity without ID!")

        # If the entity is already a StatementEntity, we return its statements directly.
        if isinstance(entity, StatementEntity):
            yield from entity.statements
            return

        yield cls(
            entity_id=entity.id,
            prop=BASE_ID,
            schema=entity.schema.name,
            value=entity.id,
            dataset=dataset,
            external=external,
            first_seen=first_seen,
            last_seen=last_seen,
            origin=origin,
        )
        for prop, value in entity.itervalues():
            yield cls(
                entity_id=entity.id,
                prop=prop.name,
                schema=entity.schema.name,
                value=value,
                dataset=dataset,
                external=external,
                first_seen=first_seen,
                last_seen=last_seen,
                origin=origin,
            )
@@ -0,0 +1,31 @@
1
+ import sys
2
+ from functools import cache
3
+ from typing import Tuple
4
+
5
+ from followthemoney.model import Model
6
+
7
+ BASE_ID = "id"
8
+
9
+
10
def pack_prop(schema: str, prop: str) -> str:
    """Combine a schema name and a property name into ``schema:prop``."""
    return "{}:{}".format(schema, prop)
12
+
13
+
14
@cache
def get_prop_type(schema: str, prop: str) -> str:
    """Return the type name of property ``prop`` on schema ``schema``.

    The ``BASE_ID`` pseudo-property is its own type and needs no model lookup.
    Results are memoised per ``(schema, prop)`` pair.

    Raises:
        TypeError: if the schema or the property is not found in the model.
    """
    if prop == BASE_ID:
        return BASE_ID
    model = Model.instance()
    schema_obj = model.get(schema)
    if schema_obj is None:
        raise TypeError("Schema not found: %s" % schema)
    prop_obj = schema_obj.get(prop)
    if prop_obj is None:
        raise TypeError("Property not found: %s" % prop)
    return prop_obj.type.name
25
+
26
+
27
@cache
def unpack_prop(id: str) -> Tuple[str, str, str]:
    """Split a packed ``schema:prop`` identifier.

    Returns a ``(schema, prop_type, prop)`` tuple in which the schema and
    property names are interned strings. The identifier is split on its first
    colon only; results are memoised per identifier.
    """
    schema_name, prop_name = id.split(":", 1)
    type_name = get_prop_type(schema_name, prop_name)
    return (sys.intern(schema_name), type_name, sys.intern(prop_name))
@@ -1,4 +1,6 @@
1
- from followthemoney.types.registry import Registry
1
+ from banal import ensure_list
2
+ from typing import Dict, Iterable, List, Set, cast
3
+
2
4
  from followthemoney.types.url import UrlType
3
5
  from followthemoney.types.name import NameType
4
6
  from followthemoney.types.email import EmailType
@@ -11,7 +13,6 @@ from followthemoney.types.language import LanguageType
11
13
  from followthemoney.types.mimetype import MimeType
12
14
  from followthemoney.types.checksum import ChecksumType
13
15
  from followthemoney.types.identifier import IdentifierType
14
- from followthemoney.types.iban import IbanType
15
16
  from followthemoney.types.entity import EntityType
16
17
  from followthemoney.types.topic import TopicType
17
18
  from followthemoney.types.gender import GenderType
@@ -22,27 +23,69 @@ from followthemoney.types.string import StringType
22
23
  from followthemoney.types.number import NumberType
23
24
  from followthemoney.types.common import PropertyType
24
25
 
26
+
27
class Registry(object):
    """This registry keeps the processing helpers for all property types in the system. The
    registry can be used to get a type, which can itself then clean, validate or format values
    of that type."""

    # One shared instance per property type; each attribute name must equal
    # the type's own ``name`` (asserted in ``__init__``).
    url = UrlType()
    name = NameType()
    email = EmailType()
    ip = IpType()
    address = AddressType()
    date = DateType()
    phone = PhoneType()
    country = CountryType()
    language = LanguageType()
    mimetype = MimeType()
    checksum = ChecksumType()
    identifier = IdentifierType()
    entity = EntityType()
    topic = TopicType()
    gender = GenderType()
    json = JsonType()
    text = TextType()
    html = HTMLType()
    string = StringType()
    number = NumberType()

    def __init__(self) -> None:
        """Index the class-level type instances by their flags and group."""
        self.matchable: Set[PropertyType] = set()
        self.types: Set[PropertyType] = set()
        self.groups: Dict[str, PropertyType] = {}
        self.pivots: Set[PropertyType] = set()
        for attr in dir(self):
            candidate = getattr(self, attr)
            if isinstance(candidate, PropertyType):
                assert candidate.name == attr
                self.types.add(candidate)
                if candidate.matchable:
                    self.matchable.add(candidate)
                if candidate.pivot:
                    self.pivots.add(candidate)
                if candidate.group is not None:
                    self.groups[candidate.group] = candidate

    def get(self, name: str) -> PropertyType:
        """For a given property type name, get its type object. This can also
        be used via getattr, e.g. ``registry.phone``."""
        # Allow transparent re-checking.
        if not isinstance(name, PropertyType):
            return cast(PropertyType, getattr(self, name))
        return name

    def get_types(self, names: Iterable[str]) -> List[PropertyType]:
        """Get a list of all property type objects linked to a set of names."""
        resolved = [self.get(item) for item in ensure_list(names)]
        return [type_ for type_ in resolved if type_ is not None]

    def __getitem__(self, name: str) -> PropertyType:
        """Subscript access to an attribute, e.g. ``registry["phone"]``."""
        return cast(PropertyType, getattr(self, name))
87
+
88
+
25
89
  registry = Registry()
26
- registry.add(UrlType)
27
- registry.add(NameType)
28
- registry.add(EmailType)
29
- registry.add(IpType)
30
- registry.add(AddressType)
31
- registry.add(DateType)
32
- registry.add(PhoneType)
33
- registry.add(CountryType)
34
- registry.add(LanguageType)
35
- registry.add(MimeType)
36
- registry.add(ChecksumType)
37
- registry.add(IdentifierType)
38
- registry.add(IbanType) # TODO: remove
39
- registry.add(EntityType)
40
- registry.add(TopicType)
41
- registry.add(GenderType)
42
- registry.add(JsonType)
43
- registry.add(TextType)
44
- registry.add(HTMLType)
45
- registry.add(StringType)
46
- registry.add(NumberType)
47
90
 
48
91
  __all__ = ["PropertyType", "registry"]
@@ -7,7 +7,7 @@ from rigour.text.distance import levenshtein_similarity
7
7
 
8
8
  from followthemoney.types.common import PropertyType
9
9
  from followthemoney.util import defer as _
10
- from followthemoney.util import dampen
10
+ from followthemoney.util import dampen, const
11
11
 
12
12
  if TYPE_CHECKING:
13
13
  from followthemoney.proxy import EntityProxy
@@ -21,8 +21,8 @@ class AddressType(PropertyType):
21
21
 
22
22
  LINE_BREAKS = re.compile(r"(\r\n|\n|<BR/>|<BR>|\t|ESQ\.,|ESQ,|;)")
23
23
  COMMATA = re.compile(r"(,\s?[,\.])")
24
- name = "address"
25
- group = "addresses"
24
+ name = const("address")
25
+ group = const("addresses")
26
26
  label = _("Address")
27
27
  plural = _("Addresses")
28
28
  matchable = True
@@ -1,6 +1,5 @@
1
- from followthemoney.rdf import URIRef, Identifier
2
1
  from followthemoney.types.common import PropertyType
3
- from followthemoney.util import defer as _
2
+ from followthemoney.util import const, defer as _
4
3
 
5
4
 
6
5
  class ChecksumType(PropertyType):
@@ -13,13 +12,10 @@ class ChecksumType(PropertyType):
13
12
  of this type are scrubbed when submitted via the normal API. Checksums can only
14
13
  be defined by uploading a document to be ingested."""
15
14
 
16
- name = "checksum"
17
- group = "checksums"
15
+ name = const("checksum")
16
+ group = const("checksums")
18
17
  label = _("Checksum")
19
18
  plural = _("Checksums")
20
19
  matchable = True
21
20
  pivot = True
22
21
  max_length = 40
23
-
24
- def rdf(self, value: str) -> Identifier:
25
- return URIRef(f"hash:{value}")