followthemoney 4.3.0__py3-none-any.whl → 4.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. followthemoney/__init__.py +1 -1
  2. followthemoney/compare.py +6 -0
  3. followthemoney/dataset/dataset.py +18 -0
  4. followthemoney/entity.py +29 -15
  5. followthemoney/mapping/csv.py +3 -1
  6. followthemoney/model.py +6 -5
  7. followthemoney/property.py +23 -4
  8. followthemoney/proxy.py +32 -11
  9. followthemoney/schema/Company.yaml +5 -0
  10. followthemoney/schema/CryptoWallet.yaml +4 -0
  11. followthemoney/schema/Image.yaml +7 -0
  12. followthemoney/schema/LegalEntity.yaml +10 -0
  13. followthemoney/schema/Organization.yaml +5 -0
  14. followthemoney/schema/Person.yaml +4 -0
  15. followthemoney/schema/PublicBody.yaml +4 -0
  16. followthemoney/schema/Thing.yaml +3 -2
  17. followthemoney/schema.py +16 -2
  18. followthemoney/settings.py +19 -0
  19. followthemoney/statement/entity.py +31 -7
  20. followthemoney/statement/serialize.py +18 -13
  21. followthemoney/statement/statement.py +151 -42
  22. followthemoney/statement/util.py +23 -2
  23. followthemoney/types/address.py +3 -3
  24. followthemoney/types/checksum.py +3 -3
  25. followthemoney/types/country.py +19 -4
  26. followthemoney/types/date.py +13 -3
  27. followthemoney/types/entity.py +3 -3
  28. followthemoney/types/gender.py +6 -6
  29. followthemoney/types/identifier.py +8 -8
  30. followthemoney/types/ip.py +3 -3
  31. followthemoney/types/json.py +2 -2
  32. followthemoney/types/language.py +3 -3
  33. followthemoney/types/mimetype.py +3 -3
  34. followthemoney/types/name.py +3 -3
  35. followthemoney/types/number.py +2 -2
  36. followthemoney/types/phone.py +3 -3
  37. followthemoney/types/string.py +2 -2
  38. followthemoney/types/topic.py +6 -3
  39. followthemoney/types/url.py +3 -3
  40. followthemoney/util.py +6 -14
  41. {followthemoney-4.3.0.dist-info → followthemoney-4.5.0.dist-info}/METADATA +3 -3
  42. {followthemoney-4.3.0.dist-info → followthemoney-4.5.0.dist-info}/RECORD +45 -44
  43. {followthemoney-4.3.0.dist-info → followthemoney-4.5.0.dist-info}/WHEEL +1 -1
  44. {followthemoney-4.3.0.dist-info → followthemoney-4.5.0.dist-info}/entry_points.txt +0 -0
  45. {followthemoney-4.3.0.dist-info → followthemoney-4.5.0.dist-info}/licenses/LICENSE +0 -0
@@ -9,7 +9,7 @@ from followthemoney.statement import Statement, StatementEntity, SE
9
9
  from followthemoney.dataset import Dataset, DefaultDataset, DS
10
10
  from followthemoney.util import set_model_locale
11
11
 
12
- __version__ = "4.3.0"
12
+ __version__ = "4.5.0"
13
13
 
14
14
  # Data model singleton
15
15
  model = Model.instance()
followthemoney/compare.py CHANGED
@@ -77,6 +77,12 @@ def compare(
77
77
  weights: Weights = COMPARE_WEIGHTS,
78
78
  ) -> float:
79
79
  """Compare two entities and return a match score."""
80
+ if left.checksum == right.checksum:
81
+ # Check if there is any data at all (ie any basis for making a decision),
82
+ # if so, return a perfect match. This avoids marking two empty entities
83
+ # as matching. Bit ambiguous, but practical.
84
+ if len(left.properties) > 0 and len(right.properties) > 0:
85
+ return 1.0
80
86
  scores = compare_scores(left, right)
81
87
  return _compare(scores, weights)
82
88
 
@@ -1,3 +1,4 @@
1
+ from pathlib import Path
1
2
  import yaml
2
3
  import logging
3
4
  from functools import cached_property
@@ -38,6 +39,8 @@ class DatasetModel(BaseModel):
38
39
  coverage: DataCoverage | None = None
39
40
  resources: List[DataResource] = []
40
41
  children: Set[str] = set()
42
+ deprecation: Optional[str] = None
43
+ deprecated: bool = False
41
44
 
42
45
  @field_validator("name", mode="after")
43
46
  @classmethod
@@ -57,6 +60,18 @@ class DatasetModel(BaseModel):
57
60
  data["children"] = children
58
61
  return data
59
62
 
63
@model_validator(mode="after")
def evaluate_data(self) -> "DatasetModel":
    """Normalise the deprecation notice and derive the ``deprecated`` flag.

    A notice is stripped of surrounding whitespace, and a notice that is
    blank after stripping is reset to ``None``. The dataset is marked as
    deprecated when a notice remains or when the flag was already set.

    Raises:
        ValueError: if the dataset is deprecated but its coverage is missing
            or has no end date.
    """
    notice = self.deprecation
    if notice is not None:
        # An all-whitespace notice counts as "no notice".
        self.deprecation = notice.strip() or None
    self.deprecated = self.deprecated or self.deprecation is not None
    if self.deprecated:
        coverage = self.coverage
        if coverage is None or coverage.end is None:
            raise ValueError("Deprecated dataset coverage must have an end date.")
    return self
74
+
60
75
  def get_resource(self, name: str) -> DataResource:
61
76
  for res in self.resources:
62
77
  if res.name == name:
@@ -121,10 +136,13 @@ class Dataset:
121
136
  ) -> DS:
122
137
  from followthemoney.dataset.catalog import DataCatalog
123
138
 
139
+ path = Path(path)
124
140
  with open(path, "r") as fh:
125
141
  data = yaml.safe_load(fh)
126
142
  if catalog is None:
127
143
  catalog = DataCatalog(cls, {})
144
+ if "name" not in data:
145
+ data["name"] = path.stem
128
146
  return catalog.make_dataset(data)
129
147
 
130
148
  @classmethod
followthemoney/entity.py CHANGED
@@ -5,6 +5,7 @@ from rigour.names import pick_name
5
5
  from followthemoney.proxy import EntityProxy
6
6
  from followthemoney.schema import Schema
7
7
  from followthemoney.statement import BASE_ID, Statement
8
+ from followthemoney.util import HASH_ENCODING
8
9
 
9
10
  VE = TypeVar("VE", bound="ValueEntity")
10
11
 
@@ -42,25 +43,28 @@ class ValueEntity(EntityProxy):
42
43
  key_prefix: Optional[str] = None,
43
44
  cleaned: bool = True,
44
45
  ):
46
+ self._caption: Optional[str] = data.pop("caption", None)
47
+ self.datasets: Set[str] = set(data.pop("datasets", []))
48
+ self.referents: Set[str] = set(data.pop("referents", []))
49
+ self.first_seen: Optional[str] = data.pop("first_seen", None)
50
+ self.last_seen: Optional[str] = data.pop("last_seen", None)
51
+ self.last_change: Optional[str] = data.pop("last_change", None)
45
52
  super().__init__(schema, data, key_prefix=key_prefix, cleaned=cleaned)
46
- self._caption: Optional[str] = data.get("caption")
47
- self.datasets: Set[str] = set(data.get("datasets", []))
48
- self.referents: Set[str] = set(data.get("referents", []))
49
- self.first_seen: Optional[str] = data.get("first_seen")
50
- self.last_seen: Optional[str] = data.get("last_seen")
51
- self.last_change: Optional[str] = data.get("last_change")
52
53
 
53
54
  # add data from statement dict if present.
54
55
  # this updates the dataset and referents set
55
56
  for stmt_data in data.pop("statements", []):
56
57
  stmt = Statement.from_dict(stmt_data)
58
+ prop = schema.get(stmt.prop)
59
+ if prop is None:
60
+ continue
57
61
  self.datasets.add(stmt.dataset)
58
62
  if stmt.schema != self.schema.name:
59
63
  self.schema = schema.model.common_schema(self.schema, stmt.schema)
60
64
  if stmt.entity_id != self.id:
61
65
  self.referents.add(stmt.entity_id)
62
66
  if stmt.prop != BASE_ID:
63
- self.add(stmt.prop, stmt.value)
67
+ self.unsafe_add(prop, stmt.value, cleaned=cleaned)
64
68
 
65
69
  def merge(self: VE, other: EntityProxy) -> VE:
66
70
  merged = super().merge(other)
@@ -78,15 +82,25 @@ class ValueEntity(EntityProxy):
78
82
  merged.last_change = max(changed, default=None)
79
83
  return merged
80
84
 
85
+ @property
86
+ def checksum(self) -> str:
87
+ digest = self._checksum_digest()
88
+ for dataset in sorted(self.datasets):
89
+ digest.update(dataset.encode(HASH_ENCODING))
90
+ digest.update(b"\x1e")
91
+ for referent in sorted(self.referents):
92
+ digest.update(referent.encode(HASH_ENCODING))
93
+ digest.update(b"\x1e")
94
+ if self.last_change is not None:
95
+ digest.update(self.last_change.encode(HASH_ENCODING))
96
+ return digest.hexdigest()
97
+
81
98
  def to_dict(self) -> Dict[str, Any]:
82
- data: Dict[str, Any] = {
83
- "id": self.id,
84
- "caption": self._caption or self.caption,
85
- "schema": self.schema.name,
86
- "properties": self.properties,
87
- "referents": list(self.referents),
88
- "datasets": list(self.datasets),
89
- }
99
+ data = super().to_dict()
100
+ data["referents"] = list(self.referents)
101
+ data["datasets"] = list(self.datasets)
102
+ if self._caption is not None:
103
+ data["caption"] = self._caption
90
104
  if self.first_seen is not None:
91
105
  data["first_seen"] = self.first_seen
92
106
  if self.last_seen is not None:
@@ -9,6 +9,7 @@ from typing import TYPE_CHECKING, cast
9
9
  from typing import Any, Dict, Generator, ItemsView, Iterable, List, Optional, Set, Tuple
10
10
 
11
11
  from followthemoney.mapping.source import Record, Source
12
+ from followthemoney.settings import USER_AGENT
12
13
  from followthemoney.util import sanitize_text
13
14
  from followthemoney.exc import InvalidMapping
14
15
 
@@ -64,7 +65,8 @@ class CSVSource(Source):
64
65
  parsed_url = urlparse(url)
65
66
  log.info("Loading: %s", url)
66
67
  if parsed_url.scheme in ["http", "https"]:
67
- res = requests.get(url, stream=True)
68
+ headers = {"User-Agent": USER_AGENT}
69
+ res = requests.get(url, stream=True, headers=headers)
68
70
  if not res.ok:
69
71
  raise InvalidMapping("Failed to open CSV: %s" % url)
70
72
  # if res.encoding is None:
followthemoney/model.py CHANGED
@@ -3,12 +3,15 @@ import yaml
3
3
  from functools import cache
4
4
  from typing import TYPE_CHECKING, Any
5
5
  from typing import Dict, Generator, Iterator, Optional, Set, TypedDict, Union
6
+ from rigour.env import ENCODING
6
7
 
7
8
  from followthemoney.types import registry
8
9
  from followthemoney.types.common import PropertyType, PropertyTypeToDict
9
10
  from followthemoney.schema import Schema, SchemaToDict
10
11
  from followthemoney.property import Property
11
12
  from followthemoney.exc import InvalidModel, InvalidData
13
+ from followthemoney.settings import MODEL_PATH
14
+ from followthemoney.util import const
12
15
 
13
16
  if TYPE_CHECKING:
14
17
  from followthemoney.proxy import EntityProxy
@@ -46,10 +49,7 @@ class Model(object):
46
49
  @classmethod
47
50
  def instance(cls) -> "Model":
48
51
  if cls._instance is None:
49
- model_path = os.path.dirname(__file__)
50
- model_path = os.path.join(model_path, "schema")
51
- model_path = os.environ.get("FTM_MODEL_PATH", model_path)
52
- cls._instance = cls(model_path)
52
+ cls._instance = cls(MODEL_PATH)
53
53
  return cls._instance
54
54
 
55
55
  def generate(self) -> None:
@@ -67,11 +67,12 @@ class Model(object):
67
67
  schema.properties[prop.name] = prop
68
68
 
69
69
  def _load(self, filepath: str) -> None:
70
- with open(filepath, "r", encoding="utf-8") as fh:
70
+ with open(filepath, "r", encoding=ENCODING) as fh:
71
71
  data = yaml.safe_load(fh)
72
72
  if not isinstance(data, dict):
73
73
  raise InvalidModel("Model file is not a mapping: %s" % filepath)
74
74
  for name, config in data.items():
75
+ name = const(name)
75
76
  self.schemata[name] = Schema(self, name, config)
76
77
 
77
78
  def get(self, name: Union[str, Schema]) -> Optional[Schema]:
@@ -1,8 +1,9 @@
1
1
  import re
2
2
  from banal import is_mapping, as_bool
3
+ from rigour.ids import get_identifier_format
3
4
  from typing import TYPE_CHECKING, Any, List, Optional, TypedDict
4
5
 
5
- from followthemoney.exc import InvalidModel
6
+ from followthemoney.exc import InvalidData, InvalidModel
6
7
  from followthemoney.types import registry
7
8
  from followthemoney.util import gettext, get_entity_id, const
8
9
 
@@ -86,17 +87,16 @@ class Property:
86
87
  self.schema = schema
87
88
 
88
89
  #: Machine-readable name for this property.
89
- self.name = const(name)
90
+ self.name = name
90
91
  if not check_property_name(self.name):
91
92
  raise InvalidModel("Invalid name: %s" % self.name)
92
93
 
93
94
  #: Qualified property name, which also includes the schema name.
94
95
  self.qname = const("%s:%s" % (schema.name, self.name))
95
96
 
96
- self._hash = hash("<Property(%r)>" % self.qname)
97
-
98
97
  self._label = data.get("label", name)
99
98
  self._description = data.get("description")
99
+ self._hash = hash("<Property(%r)>" % self.qname)
100
100
 
101
101
  #: This property is deprecated and should not be used.
102
102
  self.deprecated = as_bool(data.get("deprecated", False))
@@ -157,6 +157,13 @@ class Property:
157
157
  raise InvalidModel("Invalid reverse: %s" % self)
158
158
  self.reverse = self.range._add_reverse(model, self._reverse, self)
159
159
 
160
+ if self.type == registry.identifier and self.format is not None:
161
+ format_ = get_identifier_format(self.format)
162
+ if format_ is None or format_.NAME != self.format:
163
+ raise InvalidModel("Invalid identifier format: %s" % self.format)
164
+ # Internalize the string:
165
+ self.format = format_.NAME
166
+
160
167
  @property
161
168
  def label(self) -> str:
162
169
  """User-facing title for this property."""
@@ -229,6 +236,18 @@ class Property:
229
236
  data["format"] = self.format
230
237
  return data
231
238
 
239
def __reduce__(self) -> Any:
    """Describe how to pickle this property.

    Only the qualified name is stored; unpickling hands it to
    ``_reconstruct`` instead of restoring this object's attributes.
    """
    args = (self.qname,)
    return (self._reconstruct, args)
241
+
242
@classmethod
def _reconstruct(cls, qname: str) -> "Property":
    """Return the property registered under ``qname`` on the model singleton.

    Raises:
        InvalidData: if the model has no property with that qualified name.
    """
    # NOTE(review): imported locally, presumably to avoid a circular import
    # between the model and property modules - confirm.
    from followthemoney.model import Model

    found = Model.instance().get_qname(qname)
    if found is not None:
        return found
    raise InvalidData("Unknown property: %r" % qname)
250
+
232
251
  def __repr__(self) -> str:
233
252
  return "<Property(%r)>" % self.qname
234
253
 
followthemoney/proxy.py CHANGED
@@ -1,3 +1,4 @@
1
+ import hashlib
1
2
  import logging
2
3
  from typing import TYPE_CHECKING, cast, Any
3
4
  from typing import Dict, Generator, List, Optional, Set, Tuple, Union, Type, TypeVar
@@ -10,13 +11,14 @@ from followthemoney.types import registry
10
11
  from followthemoney.types.common import PropertyType
11
12
  from followthemoney.property import Property
12
13
  from followthemoney.value import string_list, Values
13
- from followthemoney.util import sanitize_text, gettext
14
+ from followthemoney.util import HASH_ENCODING, sanitize_text, gettext
14
15
  from followthemoney.util import merge_context, make_entity_id
15
16
  from followthemoney.model import Model
16
17
  from followthemoney.schema import Schema
17
18
 
18
19
  if TYPE_CHECKING:
19
20
  from followthemoney.model import Model
21
+ from hashlib import _Hash
20
22
 
21
23
  log = logging.getLogger(__name__)
22
24
  P = Union[Property, str]
@@ -403,13 +405,10 @@ class EntityProxy(object):
403
405
  schema and any contextual values that were handed in initially. The resulting
404
406
  dictionary can be used to make a new proxy, and it is commonly written to disk
405
407
  or a database."""
406
- data = dict(self.context)
407
- extra = {
408
- "id": self.id,
409
- "schema": self.schema.name,
410
- "properties": self.properties,
411
- }
412
- data.update(extra)
408
+ data: Dict[str, Any] = dict(self.context)
409
+ data["id"] = self.id
410
+ data["schema"] = self.schema.name
411
+ data["properties"] = self.properties
413
412
  return data
414
413
 
415
414
  def to_full_dict(self, matchable: bool = False) -> Dict[str, Any]:
@@ -440,6 +439,28 @@ class EntityProxy(object):
440
439
  self.add(prop, values, cleaned=True, quiet=True)
441
440
  return self
442
441
 
442
+ def _checksum_digest(self) -> "_Hash":
443
+ """Create a SHA1 digest of the entity's ID, schema and properties for
444
+ change detection. This is returned as a hashlib digest object so that
445
+ it can be subclassed."""
446
+ digest = hashlib.sha1()
447
+ if self.id is not None:
448
+ digest.update(self.id.encode(HASH_ENCODING))
449
+ digest.update(self.schema.name.encode(HASH_ENCODING))
450
+ for prop in sorted(self._properties.keys()):
451
+ digest.update(prop.encode(HASH_ENCODING))
452
+ for value in sorted(self._properties[prop]):
453
+ digest.update(value.encode(HASH_ENCODING))
454
+ digest.update(b"\x1e")
455
+ digest.update(b"\x1f")
456
+ return digest
457
+
458
@property
def checksum(self) -> str:
    """Hex-encoded SHA1 checksum of the entity proxy's current state.

    The value is the hex digest of whatever ``_checksum_digest()`` returns
    and can be used for change detection.
    """
    hasher = self._checksum_digest()
    return hasher.hexdigest()
463
+
443
464
  def __getstate__(self) -> Dict[str, Any]:
444
465
  data = {slot: getattr(self, slot) for slot in self.__slots__}
445
466
  data["schema"] = self.schema.name
@@ -462,14 +483,14 @@ class EntityProxy(object):
462
483
  return self._size
463
484
 
464
485
  def __hash__(self) -> int:
465
- if not self.id:
466
- raise RuntimeError("Cannot hash entity without an ID")
486
+ if self.id is None:
487
+ raise RuntimeError("Unhashable entity proxy without ID.")
467
488
  return hash(self.id)
468
489
 
469
490
  def __eq__(self, other: Any) -> bool:
470
491
  try:
471
492
  if self.id is None or other.id is None:
472
- raise RuntimeError("Cannot compare entities without IDs.")
493
+ raise RuntimeError("Cannot compare entity proxies without IDs.")
473
494
  return bool(self.id == other.id)
474
495
  except AttributeError:
475
496
  return False
@@ -18,6 +18,11 @@ Company:
18
18
  - name
19
19
  caption:
20
20
  - name
21
+ - alias
22
+ - abbreviation
23
+ - weakAlias
24
+ - previousName
25
+ - registrationNumber
21
26
  properties:
22
27
  jurisdiction:
23
28
  label: Jurisdiction
@@ -26,6 +26,10 @@ CryptoWallet:
26
26
  maxLength: 128
27
27
  privateKey:
28
28
  label: Private key
29
+ accountId:
30
+ label: Account ID
31
+ description: Platform-specific user/account identifier
32
+ type: identifier
29
33
  creationDate:
30
34
  label: Creation date
31
35
  type: date
@@ -1,4 +1,7 @@
1
1
  Image:
2
+ # This schema defines an image file entity within the FollowTheMoney data model.
3
+ # If a `checksum` property is present, consider loading it from an Aleph archive
4
+ # or FtM data lake. Otherwise, use `sourceUrl` to fetch the image directly.
2
5
  extends:
3
6
  - Document
4
7
  label: Image
@@ -23,3 +26,7 @@ Image:
23
26
  label: "Images"
24
27
  type: entity
25
28
  range: Person
29
+ credit:
30
+ label: "Credit"
31
+ description: "The credit or attribution for the image."
32
+ type: string
@@ -17,6 +17,10 @@ LegalEntity:
17
17
  - name
18
18
  caption:
19
19
  - name
20
+ - alias
21
+ - abbreviation
22
+ - weakAlias
23
+ - previousName
20
24
  - email
21
25
  - phone
22
26
  - registrationNumber
@@ -26,6 +30,12 @@ LegalEntity:
26
30
  end:
27
31
  - dissolutionDate
28
32
  properties:
33
+ abbreviation:
34
+ label: Abbreviation
35
+ type: name
36
+ description: "Abbreviated name or acronym"
37
+ # TODO: is un-matchable wise? The idea is to handle it like `weakAlias` rather than `alias`.
38
+ matchable: false
29
39
  email:
30
40
  label: E-Mail
31
41
  type: email
@@ -17,6 +17,11 @@ Organization:
17
17
  - name
18
18
  caption:
19
19
  - name
20
+ - alias
21
+ - abbreviation
22
+ - weakAlias
23
+ - previousName
24
+ - registrationNumber
20
25
  properties:
21
26
  cageCode:
22
27
  label: CAGE
@@ -14,6 +14,10 @@ Person:
14
14
  - name
15
15
  caption:
16
16
  - name
17
+ - alias
18
+ - previousName
19
+ - weakAlias
20
+ - abbreviation
17
21
  - lastName
18
22
  - email
19
23
  - phone
@@ -13,5 +13,9 @@ PublicBody:
13
13
  - status
14
14
  caption:
15
15
  - name
16
+ - alias
17
+ - abbreviation
18
+ - weakAlias
19
+ - previousName
16
20
  required:
17
21
  - name
@@ -24,7 +24,7 @@ Thing:
24
24
  label: Country
25
25
  type: country
26
26
  alias:
27
- label: Other name
27
+ label: Alias
28
28
  type: name
29
29
  previousName:
30
30
  label: Previous name
@@ -32,6 +32,7 @@ Thing:
32
32
  weakAlias:
33
33
  label: Weak alias
34
34
  type: name
35
+ description: "A relatively broad or generic alias that should not be used for matching in screening systems. It may still be useful for identification purposes, particularly in confirming a possible match triggered by other identifier information."
35
36
  matchable: false
36
37
  sourceUrl:
37
38
  label: Source link
@@ -55,7 +56,7 @@ Thing:
55
56
  wikidataId:
56
57
  label: Wikidata ID
57
58
  type: identifier
58
- format: qid
59
+ format: wikidata
59
60
  maxLength: 32
60
61
  keywords:
61
62
  label: Keywords
followthemoney/schema.py CHANGED
@@ -106,7 +106,7 @@ class Schema:
106
106
 
107
107
  def __init__(self, model: "Model", name: str, data: SchemaSpec) -> None:
108
108
  #: Machine-readable name of the schema, used for identification.
109
- self.name = const(name)
109
+ self.name = name
110
110
  self.model = model
111
111
  self._label = data.get("label", name)
112
112
  self._plural = data.get("plural", self.label)
@@ -191,6 +191,7 @@ class Schema:
191
191
  #: inherited from parent schemata.
192
192
  self.properties: Dict[str, Property] = {}
193
193
  for pname, prop in data.get("properties", {}).items():
194
+ pname = const(pname)
194
195
  self.properties[pname] = Property(self, pname, prop)
195
196
 
196
197
  def generate(self, model: "Model") -> None:
@@ -264,6 +265,7 @@ class Schema:
264
265
  name = data.get("name")
265
266
  if name is None:
266
267
  raise InvalidModel("Unnamed reverse: %s" % other)
268
+ name = const(name)
267
269
 
268
270
  prop = self.get(name)
269
271
  if prop is None:
@@ -272,7 +274,7 @@ class Schema:
272
274
  "type": registry.entity.name,
273
275
  "reverse": {"name": other.name},
274
276
  "range": other.schema.name,
275
- "hidden": data.get("hidden", other.hidden),
277
+ "hidden": as_bool(data.get("hidden", other.hidden)),
276
278
  }
277
279
  prop = Property(self, name, spec)
278
280
  prop.stub = True
@@ -466,6 +468,18 @@ class Schema:
466
468
  data["properties"] = properties
467
469
  return data
468
470
 
471
def __reduce__(self) -> Any:
    """Describe how to pickle this schema.

    Only the schema name is stored; unpickling hands it to ``_reconstruct``
    instead of restoring this object's attributes.
    """
    args = (self.name,)
    return (self._reconstruct, args)
473
+
474
@classmethod
def _reconstruct(cls, name: str) -> "Schema":
    """Return the schema registered under ``name`` on the model singleton.

    Raises:
        InvalidData: if the model has no schema with that name.
    """
    # NOTE(review): imported locally, presumably to avoid a circular import
    # between the model and schema modules - confirm.
    from followthemoney.model import Model

    found = Model.instance().get(name)
    if found is not None:
        return found
    raise InvalidData("Unknown schema: %r" % name)
482
+
469
483
  def __eq__(self, other: Any) -> bool:
470
484
  """Compare two schemata (via hash)."""
471
485
  try:
@@ -0,0 +1,19 @@
1
+ import os
2
+ import requests
3
+ from typing import List
4
+ from rigour.env import env_opt, env_str
5
+
6
+
7
def get_env_list(name: str, default: List[str] = []) -> List[str]:
    """Read a colon-separated list from the environment variable ``name``.

    Args:
        name: name of the environment variable to read.
        default: values to fall back to when the variable is unset or holds
            no non-empty segment. The list is never mutated here.

    Returns:
        The non-empty segments of the variable, or a copy of ``default``.
    """
    value = env_opt(name)
    if value is not None:
        # str.split() always yields at least one element ([""] for an empty
        # string), so empty segments must be dropped for the fallback below
        # to ever apply.
        values = [part for part in value.split(":") if part]
        if values:
            return values
    # Return a copy: the default list object is shared between calls and
    # must not be exposed to mutation by callers.
    return list(default)
14
+
15
+
16
+ MODEL_PATH = os.path.join(os.path.dirname(__file__), "schema")
17
+ MODEL_PATH = env_str("FTM_MODEL_PATH", MODEL_PATH)
18
+
19
+ USER_AGENT = env_str("FTM_USER_AGENT", requests.utils.default_user_agent())
@@ -1,6 +1,6 @@
1
1
  from hashlib import sha1
2
2
  from collections.abc import Mapping
3
- from typing import Any, Dict, List, Optional, Set, Type
3
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Type
4
4
  from typing import Generator, Iterable, Tuple, TypeVar
5
5
  from rigour.langs import LangStr
6
6
  from rigour.names.pick import pick_lang_name
@@ -10,7 +10,7 @@ from followthemoney.exc import InvalidData
10
10
  from followthemoney.schema import Schema
11
11
  from followthemoney.types.common import PropertyType
12
12
  from followthemoney.property import Property
13
- from followthemoney.util import gettext
13
+ from followthemoney.util import HASH_ENCODING, gettext
14
14
  from followthemoney.proxy import P
15
15
  from followthemoney.types import registry
16
16
  from followthemoney.value import string_list, Values
@@ -21,6 +21,9 @@ from followthemoney.statement.util import BASE_ID
21
21
 
22
22
  SE = TypeVar("SE", bound="StatementEntity")
23
23
 
24
+ if TYPE_CHECKING:
25
+ from hashlib import _Hash
26
+
24
27
 
25
28
  class StatementEntity(EntityProxy):
26
29
  """An entity object that can link to a set of datasets that it is sourced from."""
@@ -35,7 +38,12 @@ class StatementEntity(EntityProxy):
35
38
  "_statements",
36
39
  )
37
40
 
38
- def __init__(self, dataset: Dataset, data: Dict[str, Any], cleaned: bool = True):
41
+ def __init__(
42
+ self,
43
+ dataset: Dataset,
44
+ data: Dict[str, Any],
45
+ cleaned: bool = True,
46
+ ) -> None:
39
47
  data = dict(data or {})
40
48
  schema = Model.instance().get(data.pop("schema", None))
41
49
  if schema is None:
@@ -76,8 +84,7 @@ class StatementEntity(EntityProxy):
76
84
  for stmts in self._statements.values():
77
85
  for stmt in stmts:
78
86
  if stmt.entity_id is None and self.id is not None:
79
- stmt.entity_id = self.id
80
- stmt.id = stmt.generate_key()
87
+ stmt = stmt.clone(entity_id=self.id)
81
88
  if stmt.id is None:
82
89
  stmt.id = stmt.generate_key()
83
90
  yield stmt
@@ -97,9 +104,9 @@ class StatementEntity(EntityProxy):
97
104
  if stmt.first_seen is not None:
98
105
  first_seen.add(stmt.first_seen)
99
106
  if self.id is not None:
100
- digest = sha1(self.schema.name.encode("utf-8"))
107
+ digest = sha1(self.schema.name.encode(HASH_ENCODING))
101
108
  for id in sorted(ids):
102
- digest.update(id.encode("utf-8"))
109
+ digest.update(id.encode(HASH_ENCODING))
103
110
  checksum = digest.hexdigest()
104
111
  # This is to make the last_change value stable across
105
112
  # serialisation:
@@ -449,6 +456,23 @@ class StatementEntity(EntityProxy):
449
456
  data["statements"] = [stmt.to_dict() for stmt in self.statements]
450
457
  return data
451
458
 
459
+ def _checksum_digest(self) -> "_Hash":
460
+ """Create a SHA1 digest of the entity's ID, schema and properties for
461
+ change detection. This is returned as a hashlib digest object so that
462
+ it can be subclassed."""
463
+ digest = sha1()
464
+ if self.id is not None:
465
+ digest.update(self.id.encode(HASH_ENCODING))
466
+ statement_ids: List[str] = []
467
+ for stmts in self._statements.values():
468
+ for stmt in stmts:
469
+ if stmt.id is not None:
470
+ statement_ids.append(stmt.id)
471
+ for stmt_id in sorted(statement_ids):
472
+ digest.update(stmt_id.encode(HASH_ENCODING))
473
+ digest.update(b"\x1e")
474
+ return digest
475
+
452
476
  def __len__(self) -> int:
453
477
  return len(list(self._iter_stmt())) + 1
454
478