followthemoney 4.3.4__py3-none-any.whl → 4.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,13 +1,15 @@
1
1
  import csv
2
+ import sys
2
3
  import click
3
4
  import orjson
4
5
  import logging
5
6
  from io import TextIOWrapper
6
7
  from pathlib import Path
7
8
  from types import TracebackType
8
- from typing import cast
9
+ from typing import Dict, Tuple, cast
9
10
  from typing import BinaryIO, Generator, Iterable, List, Optional, TextIO, Type
10
11
  from rigour.boolean import text_bool
12
+ from rigour.env import ENCODING
11
13
 
12
14
  from followthemoney.statement.statement import Statement, StatementDict
13
15
  from followthemoney.statement.util import unpack_prop
@@ -48,6 +50,7 @@ LEGACY_PACK_COLUMNS = [
48
50
  "first_seen",
49
51
  "last_seen",
50
52
  ]
53
+ csv.field_size_limit(sys.maxsize)
51
54
 
52
55
 
53
56
  def read_json_statements(
@@ -60,7 +63,7 @@ def read_json_statements(
60
63
 
61
64
 
62
65
  def read_csv_statements(fh: BinaryIO) -> Generator[Statement, None, None]:
63
- wrapped = TextIOWrapper(fh, encoding="utf-8")
66
+ wrapped = TextIOWrapper(fh, encoding=ENCODING)
64
67
  for row in csv.DictReader(wrapped, dialect=csv.unix_dialect):
65
68
  data = cast(StatementDict, row)
66
69
  data["external"] = text_bool(row.get("external")) or False
@@ -68,11 +71,13 @@ def read_csv_statements(fh: BinaryIO) -> Generator[Statement, None, None]:
68
71
  data["lang"] = None
69
72
  if row.get("original_value") == "":
70
73
  data["original_value"] = None
74
+ if row.get("origin") == "":
75
+ data["origin"] = None
71
76
  yield Statement.from_dict(data)
72
77
 
73
78
 
74
79
  def read_pack_statements(fh: BinaryIO) -> Generator[Statement, None, None]:
75
- wrapped = TextIOWrapper(fh, encoding="utf-8")
80
+ wrapped = TextIOWrapper(fh, encoding=ENCODING)
76
81
  yield from read_pack_statements_decoded(wrapped)
77
82
 
78
83
 
@@ -100,7 +105,7 @@ def read_pack_statements_decoded(fh: TextIO) -> Generator[Statement, None, None]
100
105
  dataset=data["dataset"],
101
106
  lang=data["lang"] or None,
102
107
  original_value=data["original_value"] or None,
103
- origin=data.get("origin"),
108
+ origin=data.get("origin") or None,
104
109
  first_seen=data["first_seen"],
105
110
  external=data["external"] == "t",
106
111
  canonical_id=data["entity_id"],
@@ -129,10 +134,10 @@ def read_path_statements(path: Path, format: str) -> Generator[Statement, None,
129
134
 
130
135
  def get_statement_writer(fh: BinaryIO, format: str) -> "StatementWriter":
131
136
  if format == CSV:
132
- wrapped = TextIOWrapper(fh, encoding="utf-8")
137
+ wrapped = TextIOWrapper(fh, encoding=ENCODING)
133
138
  return CSVStatementWriter(wrapped)
134
139
  elif format == PACK:
135
- wrapped = TextIOWrapper(fh, encoding="utf-8")
140
+ wrapped = TextIOWrapper(fh, encoding=ENCODING)
136
141
  return PackStatementWriter(wrapped)
137
142
  elif format == JSON:
138
143
  return JSONStatementWriter(fh)
@@ -222,12 +227,14 @@ class PackStatementWriter(StatementWriter):
222
227
  "id",
223
228
  ]
224
229
  self.writer.writerow(columns)
225
- self._batch: List[List[Optional[str]]] = []
230
+ self._batch: Dict[str, Tuple[Optional[str], ...]] = {}
226
231
 
227
232
  def write(self, stmt: Statement) -> None:
228
233
  # HACK: This is very similar to the CSV writer, but at the very inner
229
234
  # loop of the application, so we're duplicating code here.
230
- row = [
235
+ if stmt.id is None:
236
+ raise RuntimeError("Cannot write pack statement without ID")
237
+ row = (
231
238
  stmt.entity_id,
232
239
  f"{stmt.schema}:{stmt.prop}",
233
240
  stmt.value,
@@ -239,13 +246,15 @@ class PackStatementWriter(StatementWriter):
239
246
  stmt.first_seen,
240
247
  stmt.last_seen,
241
248
  stmt.id,
242
- ]
243
- self._batch.append(row)
249
+ )
250
+ self._batch[stmt.id] = row
244
251
  if len(self._batch) >= CSV_BATCH:
245
- self.writer.writerows(self._batch)
246
- self._batch.clear()
252
+ self.flush()
253
+
254
+ def flush(self) -> None:
255
+ self.writer.writerows(self._batch.values())
256
+ self._batch.clear()
247
257
 
248
258
  def close(self) -> None:
249
- if len(self._batch) > 0:
250
- self.writer.writerows(self._batch)
259
+ self.flush()
251
260
  self.fh.close()
@@ -1,14 +1,22 @@
1
1
  import hashlib
2
2
  import warnings
3
3
  from sqlalchemy.engine import Row
4
- from typing import cast
5
- from typing import Any, Dict, Generator, Optional
4
+ from typing import Union, cast
5
+ from typing import Any, Dict, Generator, Optional, TypeGuard
6
6
  from typing_extensions import TypedDict, Self
7
7
  from rigour.time import datetime_iso, iso_datetime
8
8
  from rigour.boolean import bool_text
9
9
 
10
10
  from followthemoney.proxy import EntityProxy
11
- from followthemoney.statement.util import get_prop_type, BASE_ID
11
+ from followthemoney.statement.util import get_prop_type, BASE_ID, NON_LANG_TYPE_NAMES
12
+ from followthemoney.util import HASH_ENCODING
13
+
14
+
15
+ UNSET = object()
16
+
17
+
18
+ def is_not_unset(value: str | None | object) -> TypeGuard[str | None]:
19
+ return value is not UNSET
12
20
 
13
21
 
14
22
  class StatementDict(TypedDict):
@@ -42,15 +50,16 @@ class Statement(object):
42
50
 
43
51
  __slots__ = [
44
52
  "id",
45
- "entity_id",
53
+ "_entity_id",
46
54
  "canonical_id",
47
- "prop",
48
- "schema",
49
- "value",
50
- "dataset",
51
- "lang",
55
+ "_prop",
56
+ "_schema",
57
+ "_value",
58
+ "_dataset",
59
+ "_lang",
60
+ "prop_type",
52
61
  "original_value",
53
- "external",
62
+ "_external",
54
63
  "first_seen",
55
64
  "last_seen",
56
65
  "origin",
@@ -72,55 +81,95 @@ class Statement(object):
72
81
  last_seen: Optional[str] = None,
73
82
  origin: Optional[str] = None,
74
83
  ):
75
- self.entity_id = entity_id
84
+ self._entity_id = entity_id
76
85
  self.canonical_id = canonical_id or entity_id
77
- self.prop = prop
78
- self.schema = schema
79
- self.value = value
80
- self.dataset = dataset
81
- self.lang = lang
86
+ self._prop = prop
87
+ self._schema = schema
88
+ self.prop_type = get_prop_type(schema, prop)
89
+ self._value = value
90
+ self._dataset = dataset
91
+
92
+ # Remove lang for non-linguistic property types. The goal here is to avoid
93
+ # duplicate statements because of language tags, but the language metadata
94
+ # may be relevant as context for how the original_value was parsed so it's
95
+ # a bit of information loss.
96
+ if lang is not None:
97
+ if self.prop_type in NON_LANG_TYPE_NAMES:
98
+ lang = None
99
+ self._lang = lang
100
+
82
101
  self.original_value = original_value
83
102
  self.first_seen = first_seen
84
103
  self.last_seen = last_seen or first_seen
85
- self.external = external
104
+ self._external = external
86
105
  self.origin = origin
87
106
  if id is None:
88
107
  id = self.generate_key()
89
108
  self.id = id
90
109
 
91
110
  @property
92
- def prop_type(self) -> str:
93
- """The type of the property, e.g. 'string', 'number', 'url'."""
94
- return get_prop_type(self.schema, self.prop)
111
+ def entity_id(self) -> str:
112
+ """The (original) ID of the entity this statement is about."""
113
+ return self._entity_id
114
+
115
+ @property
116
+ def dataset(self) -> str:
117
+ """The dataset this statement was observed in."""
118
+ return self._dataset
119
+
120
+ @property
121
+ def prop(self) -> str:
122
+ """The property name this statement is about."""
123
+ return self._prop
124
+
125
+ @property
126
+ def schema(self) -> str:
127
+ """The schema of the entity this statement is about."""
128
+ return self._schema
129
+
130
+ @property
131
+ def value(self) -> str:
132
+ """The value of the property captured by this statement."""
133
+ return self._value
134
+
135
+ @property
136
+ def lang(self) -> Optional[str]:
137
+ """The language of the property value, if applicable."""
138
+ return self._lang
139
+
140
+ @property
141
+ def external(self) -> bool:
142
+ """Whether this statement was observed in an external dataset."""
143
+ return self._external
95
144
 
96
145
  def to_dict(self) -> StatementDict:
97
146
  return {
98
147
  "canonical_id": self.canonical_id,
99
- "entity_id": self.entity_id,
100
- "prop": self.prop,
101
- "schema": self.schema,
102
- "value": self.value,
103
- "dataset": self.dataset,
104
- "lang": self.lang,
148
+ "entity_id": self._entity_id,
149
+ "prop": self._prop,
150
+ "schema": self._schema,
151
+ "value": self._value,
152
+ "dataset": self._dataset,
153
+ "lang": self._lang,
105
154
  "original_value": self.original_value,
106
155
  "first_seen": self.first_seen,
107
156
  "last_seen": self.last_seen,
108
- "external": self.external,
157
+ "external": self._external,
109
158
  "origin": self.origin,
110
159
  "id": self.id,
111
160
  }
112
161
 
113
162
  def to_csv_row(self) -> Dict[str, Optional[str]]:
114
163
  data = cast(Dict[str, Optional[str]], self.to_dict())
115
- data["external"] = bool_text(self.external)
116
- data["prop_type"] = get_prop_type(self.schema, self.prop)
164
+ data["external"] = bool_text(self._external)
165
+ data["prop_type"] = self.prop_type
117
166
  return data
118
167
 
119
168
  def to_db_row(self) -> Dict[str, Any]:
120
169
  data = cast(Dict[str, Any], self.to_dict())
121
170
  data["first_seen"] = iso_datetime(self.first_seen)
122
171
  data["last_seen"] = iso_datetime(self.last_seen)
123
- data["prop_type"] = get_prop_type(self.schema, self.prop)
172
+ data["prop_type"] = self.prop_type
124
173
  return data
125
174
 
126
175
  def __hash__(self) -> int:
@@ -132,27 +181,83 @@ class Statement(object):
132
181
  return hash(self.id)
133
182
 
134
183
  def __repr__(self) -> str:
135
- return "<Statement(%r, %r, %r)>" % (self.entity_id, self.prop, self.value)
184
+ return "<Statement(%r, %r, %r)>" % (self._entity_id, self._prop, self._value)
136
185
 
137
186
  def __eq__(self, other: Any) -> bool:
138
187
  return not self.id != other.id
139
188
 
140
189
  def __lt__(self, other: Any) -> bool:
141
- self_key = (self.prop != BASE_ID, self.id or "")
142
- other_key = (other.prop != BASE_ID, other.id or "")
190
+ self_key = (self._prop != BASE_ID, self.id or "")
191
+ other_key = (other._prop != BASE_ID, other.id or "")
143
192
  return self_key < other_key
144
193
 
145
- def clone(self: Self) -> "Statement":
194
+ def clone(
195
+ self: Self,
196
+ *,
197
+ entity_id: Optional[str] = None,
198
+ prop: Optional[str] = None,
199
+ schema: Optional[str] = None,
200
+ value: Optional[str] = None,
201
+ dataset: Optional[str] = None,
202
+ lang: Union[str, None, object] = UNSET,
203
+ original_value: Union[str, None, object] = UNSET,
204
+ first_seen: Union[str, None, object] = UNSET,
205
+ external: Optional[bool] = None,
206
+ canonical_id: Optional[str] = None,
207
+ last_seen: Union[str, None, object] = UNSET,
208
+ origin: Union[str, None, object] = UNSET,
209
+ ) -> "Statement":
146
210
  """Make a deep copy of the given statement."""
147
- return Statement.from_dict(self.to_dict())
211
+ lang = lang if is_not_unset(lang) else self._lang
212
+ ov = original_value if is_not_unset(original_value) else self.original_value
213
+ first_seen = first_seen if is_not_unset(first_seen) else self.first_seen
214
+ last_seen = last_seen if is_not_unset(last_seen) else self.last_seen
215
+ origin = origin if is_not_unset(origin) else self.origin
216
+ if external is None:
217
+ external = self._external
218
+ if canonical_id is None and self._entity_id != self.canonical_id:
219
+ canonical_id = self.canonical_id
220
+
221
+ # Decide if the statement ID can be kept the same:
222
+ stmt_id = self.id
223
+ if entity_id is not None and entity_id != self.entity_id:
224
+ stmt_id = None
225
+ if prop is not None and prop != self._prop:
226
+ stmt_id = None
227
+ if schema is not None and schema != self._schema:
228
+ stmt_id = None
229
+ if value is not None and value != self._value:
230
+ stmt_id = None
231
+ if dataset is not None and dataset != self._dataset:
232
+ stmt_id = None
233
+ if external != self._external:
234
+ stmt_id = None
235
+ if lang != self._lang:
236
+ stmt_id = None
237
+ return Statement(
238
+ id=stmt_id,
239
+ entity_id=entity_id or self._entity_id,
240
+ prop=prop or self._prop,
241
+ schema=schema or self._schema,
242
+ value=value or self._value,
243
+ dataset=dataset or self._dataset,
244
+ lang=lang,
245
+ original_value=ov,
246
+ first_seen=first_seen,
247
+ external=external,
248
+ canonical_id=canonical_id,
249
+ last_seen=last_seen,
250
+ origin=origin,
251
+ )
148
252
 
149
253
  def generate_key(self) -> Optional[str]:
150
254
  return self.make_key(
151
- self.dataset,
152
- self.entity_id,
153
- self.prop,
154
- self.value,
155
- self.external,
255
+ self._dataset,
256
+ self._entity_id,
257
+ self._prop,
258
+ self._value,
259
+ self._external,
260
+ lang=self._lang,
156
261
  )
157
262
 
158
263
  @classmethod
@@ -163,17 +268,21 @@ class Statement(object):
163
268
  prop: str,
164
269
  value: str,
165
270
  external: Optional[bool],
271
+ lang: Optional[str] = None,
166
272
  ) -> Optional[str]:
167
273
  """Hash the key properties of a statement record to make a unique ID."""
168
274
  if prop is None or value is None:
169
275
  return None
170
- key = f"{dataset}.{entity_id}.{prop}.{value}"
276
+ if lang is None:
277
+ key = f"{dataset}.{entity_id}.{prop}.{value}"
278
+ else:
279
+ key = f"{dataset}.{entity_id}.{prop}.{value}@{lang}"
171
280
  if external:
172
281
  # We consider the external flag in key composition to avoid race conditions
173
282
  # where a certain entity might be emitted as external while it is already
174
283
  # linked in to the graph via another route.
175
284
  key = f"{key}.ext"
176
- return hashlib.sha1(key.encode("utf-8")).hexdigest()
285
+ return hashlib.sha1(key.encode(HASH_ENCODING)).hexdigest()
177
286
 
178
287
  @classmethod
179
288
  def from_dict(cls, data: StatementDict) -> "Statement":
@@ -2,10 +2,31 @@ from functools import cache
2
2
  from typing import Tuple
3
3
 
4
4
  from followthemoney.model import Model
5
+ from followthemoney.types import registry
5
6
  from followthemoney.util import const
6
7
 
7
8
  BASE_ID = "id"
8
9
 
10
+ # Some property types should not set the `lang` attribute on statements.
11
+ # These are typically non-linguistic types, although there's an argument
12
+ # that language metadata could be useful for dates and countries, where
13
+ # text parsing is likely to have taken place.
14
+ NON_LANG_TYPE_NAMES = {
15
+ registry.entity.name,
16
+ registry.date.name,
17
+ registry.checksum.name,
18
+ registry.email.name,
19
+ registry.phone.name,
20
+ registry.gender.name,
21
+ registry.mimetype.name,
22
+ registry.topic.name,
23
+ registry.url.name,
24
+ registry.country.name,
25
+ registry.language.name,
26
+ registry.ip.name,
27
+ BASE_ID,
28
+ }
29
+
9
30
 
10
31
  def pack_prop(schema: str, prop: str) -> str:
11
32
  return f"{schema}:{prop}"
@@ -1,6 +1,7 @@
1
- from typing import Optional, TYPE_CHECKING
1
+ from typing import Callable, Optional, TYPE_CHECKING, Sequence
2
2
  from babel.core import Locale
3
3
  from rigour.territories import get_ftm_countries, lookup_territory
4
+ from rigour.territories import territories_intersect
4
5
 
5
6
  from followthemoney.types.common import EnumType, EnumValues
6
7
  from followthemoney.util import defer as _
@@ -25,6 +26,20 @@ class CountryType(EnumType):
25
26
  def _locale_names(self, locale: Locale) -> EnumValues:
26
27
  return {t.code: t.name for t in get_ftm_countries()}
27
28
 
29
+ def compare(self, left: str, right: str) -> float:
30
+ overlap = territories_intersect([left], [right])
31
+ return 1.0 if len(overlap) else 0.0
32
+
33
+ def compare_sets(
34
+ self,
35
+ left: Sequence[str],
36
+ right: Sequence[str],
37
+ func: Callable[[Sequence[float]], float] = max,
38
+ ) -> float:
39
+ """Compare two sets of values and select the highest-scored result."""
40
+ overlap = territories_intersect(left, right)
41
+ return 1.0 if len(overlap) else 0.0
42
+
28
43
  def clean_text(
29
44
  self,
30
45
  text: str,
@@ -27,6 +27,16 @@ class DateType(PropertyType):
27
27
  matchable = True
28
28
  max_length = 32
29
29
 
30
+ HISTORIC = "1001-01-01"
31
+ """A sentinel date value representing a very old date, used to indicate historic (and often imprecise) dates
32
+ that can be assumed to be long in the past."""
33
+
34
+ RELEVANCE_MIN = "1900-01-01"
35
+ """A cutoff date value representing the minimum relevant date for modern fincrime applications."""
36
+
37
+ RELEVANCE_MAX = "2100-12-31"
38
+ """A cutoff date value representing the maximum relevant date for modern fincrime applications."""
39
+
30
40
  def validate(
31
41
  self, value: str, fuzzy: bool = False, format: Optional[str] = None
32
42
  ) -> bool:
@@ -4,7 +4,7 @@ from rigour.langs import iso_639_alpha3
4
4
 
5
5
  from followthemoney.types.common import EnumType, EnumValues
6
6
  from followthemoney.util import defer as _, gettext
7
- from followthemoney.util import get_env_list
7
+ from followthemoney.settings import get_env_list
8
8
 
9
9
  if TYPE_CHECKING:
10
10
  from followthemoney.proxy import EntityProxy
followthemoney/util.py CHANGED
@@ -10,10 +10,11 @@ from threading import local
10
10
  from typing import cast, Dict, Any, List, Optional, TypeVar, Union
11
11
  from normality import stringify
12
12
  from normality.cleaning import remove_unsafe_chars
13
- from normality.encoding import DEFAULT_ENCODING
13
+ from rigour.env import ENCODING
14
14
  from banal import is_mapping, unique_list, ensure_list
15
15
 
16
16
  MEGABYTE = 1024 * 1024
17
+ HASH_ENCODING = "utf-8"
17
18
  DEFAULT_LOCALE = "en"
18
19
  ENTITY_ID_LEN = 200
19
20
 
@@ -55,16 +56,7 @@ def get_locale() -> Locale:
55
56
  return Locale.parse(state.locale)
56
57
 
57
58
 
58
- def get_env_list(name: str, default: List[str] = []) -> List[str]:
59
- value = stringify(os.environ.get(name))
60
- if value is not None:
61
- values = value.split(":")
62
- if len(values):
63
- return values
64
- return default
65
-
66
-
67
- def sanitize_text(value: Any, encoding: str = DEFAULT_ENCODING) -> Optional[str]:
59
+ def sanitize_text(value: Any, encoding: str = ENCODING) -> Optional[str]:
68
60
  text = stringify(value, encoding_default=encoding)
69
61
  if text is None:
70
62
  return None
@@ -74,8 +66,8 @@ def sanitize_text(value: Any, encoding: str = DEFAULT_ENCODING) -> Optional[str]
74
66
  log.warning("Cannot NFC text: %s", ex)
75
67
  return None
76
68
  text = remove_unsafe_chars(text)
77
- byte_text = text.encode(DEFAULT_ENCODING, "replace")
78
- text = byte_text.decode(DEFAULT_ENCODING, "replace")
69
+ byte_text = text.encode("utf-8", "replace")
70
+ text = byte_text.decode("utf-8", "replace")
79
71
  if len(text) == 0:
80
72
  return None
81
73
  return text
@@ -88,7 +80,7 @@ def key_bytes(key: Any) -> bytes:
88
80
  text = stringify(key)
89
81
  if text is None:
90
82
  return b""
91
- return text.encode("utf-8")
83
+ return text.encode(ENCODING)
92
84
 
93
85
 
94
86
  def join_text(*parts: Any, sep: str = " ") -> Optional[str]:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: followthemoney
3
- Version: 4.3.4
3
+ Version: 4.5.1
4
4
  Summary: A data model for anti corruption data modeling and analysis.
5
5
  Project-URL: Documentation, https://followthemoney.tech/
6
6
  Project-URL: Repository, https://github.com/opensanctions/followthemoney.git
@@ -48,9 +48,9 @@ Requires-Dist: prefixdate<1.0.0,>=0.5.0
48
48
  Requires-Dist: pydantic<3.0.0,>=2.11.0
49
49
  Requires-Dist: pytz>=2021.1
50
50
  Requires-Dist: pyyaml<7.0.0,>=5.0.0
51
- Requires-Dist: rdflib<7.5.0,>=6.2.0
51
+ Requires-Dist: rdflib<7.6.0,>=6.2.0
52
52
  Requires-Dist: requests<3.0.0,>=2.21.0
53
- Requires-Dist: rigour<2.0.0,>=1.4.0
53
+ Requires-Dist: rigour<2.0.0,>=1.6.0
54
54
  Requires-Dist: sqlalchemy[mypy]<3.0.0,>=2.0.0
55
55
  Provides-Extra: dev
56
56
  Requires-Dist: build; extra == 'dev'