followthemoney 4.3.4__py3-none-any.whl → 4.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,14 +2,14 @@ from followthemoney.entity import ValueEntity, VE
2
2
  from followthemoney.model import Model
3
3
  from followthemoney.schema import Schema
4
4
  from followthemoney.property import Property
5
- from followthemoney.types import registry
5
+ from followthemoney.types import registry, PropertyType
6
6
  from followthemoney.value import Value, Values
7
7
  from followthemoney.proxy import EntityProxy, E
8
8
  from followthemoney.statement import Statement, StatementEntity, SE
9
- from followthemoney.dataset import Dataset, DefaultDataset, DS
9
+ from followthemoney.dataset import Dataset, UndefinedDataset, DS
10
10
  from followthemoney.util import set_model_locale
11
11
 
12
- __version__ = "4.3.4"
12
+ __version__ = "4.5.1"
13
13
 
14
14
  # Data model singleton
15
15
  model = Model.instance()
@@ -20,13 +20,14 @@ __all__ = [
20
20
  "Model",
21
21
  "Schema",
22
22
  "Property",
23
+ "PropertyType",
23
24
  "Value",
24
25
  "Values",
25
26
  "EntityProxy",
26
27
  "E",
27
28
  "registry",
28
29
  "Dataset",
29
- "DefaultDataset",
30
+ "UndefinedDataset",
30
31
  "DS",
31
32
  "Statement",
32
33
  "StatementEntity",
@@ -1,12 +1,12 @@
1
1
  import click
2
2
  from pathlib import Path
3
- from typing import Generator, List
3
+ from typing import Generator, List, Optional
4
4
 
5
5
 
6
6
  from followthemoney.cli.cli import cli
7
7
  from followthemoney.cli.util import InPath, OutPath
8
8
  from followthemoney.cli.util import path_entities, write_entity, path_writer
9
- from followthemoney.dataset import Dataset, DefaultDataset
9
+ from followthemoney.dataset import Dataset, UndefinedDataset
10
10
  from followthemoney.statement import Statement, StatementEntity
11
11
  from followthemoney.statement import FORMATS, CSV
12
12
  from followthemoney.statement import write_statements
@@ -16,12 +16,18 @@ from followthemoney.statement import read_path_statements
16
16
  @cli.command("statements", help="Export entities to statements")
17
17
  @click.argument("path", type=InPath)
18
18
  @click.option("-o", "--outpath", type=OutPath, default="-")
19
- @click.option("-d", "--dataset", type=str, required=True)
19
+ @click.option("-d", "--dataset", type=str)
20
20
  @click.option("-f", "--format", type=click.Choice(FORMATS), default=CSV)
21
- def entity_statements(path: Path, outpath: Path, dataset: str, format: str) -> None:
21
+ def entity_statements(
22
+ path: Path, outpath: Path, dataset: Optional[str], format: str
23
+ ) -> None:
22
24
  def make_statements() -> Generator[Statement, None, None]:
25
+ dataset_ = dataset or Dataset.UNDEFINED
23
26
  for entity in path_entities(path, StatementEntity):
24
- yield from Statement.from_entity(entity, dataset=dataset)
27
+ for stmt in Statement.from_entity(entity, dataset=dataset_):
28
+ if dataset is not None:
29
+ stmt = stmt.clone(dataset=dataset)
30
+ yield stmt
25
31
 
26
32
  with path_writer(outpath) as outfh:
27
33
  write_statements(outfh, format, make_statements())
@@ -43,12 +49,12 @@ def format_statements(
43
49
  @cli.command("aggregate-statements", help="Roll up statements into entities")
44
50
  @click.option("-i", "--infile", type=InPath, default="-")
45
51
  @click.option("-o", "--outpath", type=OutPath, default="-")
46
- @click.option("-d", "--dataset", type=str, default=DefaultDataset.name)
52
+ @click.option("-d", "--dataset", type=str, default=UndefinedDataset.name)
47
53
  @click.option("-f", "--format", type=click.Choice(FORMATS), default=CSV)
48
54
  def statements_aggregate(
49
55
  infile: Path, outpath: Path, dataset: str, format: str
50
56
  ) -> None:
51
- dataset_ = Dataset.make({"name": dataset, "title": dataset})
57
+ dataset_ = Dataset.make({"name": dataset})
52
58
  with path_writer(outpath) as outfh:
53
59
  statements: List[Statement] = []
54
60
  for stmt in read_path_statements(infile, format=format):
@@ -6,7 +6,7 @@ import click
6
6
  import orjson
7
7
  from pathlib import Path
8
8
  from warnings import warn
9
- from typing import Any, BinaryIO, Generator, Optional, TextIO, Type
9
+ from typing import Any, BinaryIO, Generator, List, Optional, TextIO, Type
10
10
  from banal import is_mapping, is_listish, ensure_list
11
11
 
12
12
  from followthemoney.export.common import Exporter
@@ -26,7 +26,7 @@ def write_object(stream: TextIO, obj: Any) -> None:
26
26
  stream.write(data + "\n")
27
27
 
28
28
 
29
- def write_entity(fh: BinaryIO, entity: E) -> None:
29
+ def write_entity(fh: BinaryIO, entity: EntityProxy) -> None:
30
30
  data = entity.to_dict()
31
31
  entity_id = data.pop("id")
32
32
  assert entity_id is not None, data
@@ -131,7 +131,7 @@ def resolve_includes(file_path: PathLike, data: Any) -> Any:
131
131
  if is_listish(data):
132
132
  return [resolve_includes(file_path, i) for i in data]
133
133
  if is_mapping(data):
134
- include_paths = ensure_list(data.pop("include", []))
134
+ include_paths: List[str] = ensure_list(data.pop("include", []))
135
135
  for include_path in include_paths:
136
136
  dir_prefix = os.path.dirname(file_path)
137
137
  include_path = os.path.join(dir_prefix, include_path)
followthemoney/compare.py CHANGED
@@ -71,31 +71,18 @@ def _compare(scores: Scores, weights: Weights, n_std: int = 1) -> float:
71
71
  return 1.0 / (1.0 + math.exp(-prob))
72
72
 
73
73
 
74
- def entity_is_same(left: EntityProxy, right: EntityProxy) -> bool:
75
- """Check if two entities are the same apart from their ID."""
76
- if left.schema != right.schema:
77
- return False
78
-
79
- props = set(left.properties.keys()).union(right.properties.keys())
80
- if 0 == len(props):
81
- return False
82
-
83
- for prop in props:
84
- left_vals = sorted(left.get(prop))
85
- right_vals = sorted(right.get(prop))
86
- if left_vals != right_vals:
87
- return False
88
- return True
89
-
90
-
91
74
  def compare(
92
75
  left: EntityProxy,
93
76
  right: EntityProxy,
94
77
  weights: Weights = COMPARE_WEIGHTS,
95
78
  ) -> float:
96
79
  """Compare two entities and return a match score."""
97
- if entity_is_same(left, right):
98
- return 1.0
80
+ if left.checksum == right.checksum:
81
+ # Check if there is any data at all (ie any basis for making a decision),
82
+ # if so, return a perfect match. This avoids marking two empty entities
83
+ # as matching. Bit ambiguous, but practical.
84
+ if len(left.properties) > 0 and len(right.properties) > 0:
85
+ return 1.0
99
86
  scores = compare_scores(left, right)
100
87
  return _compare(scores, weights)
101
88
 
@@ -4,11 +4,11 @@ from followthemoney.dataset.resource import DataResource
4
4
  from followthemoney.dataset.publisher import DataPublisher
5
5
  from followthemoney.dataset.coverage import DataCoverage
6
6
 
7
- DefaultDataset = Dataset.make({"name": "default"})
7
+ UndefinedDataset = Dataset.make({"name": Dataset.UNDEFINED})
8
8
 
9
9
  __all__ = [
10
10
  "Dataset",
11
- "DefaultDataset",
11
+ "UndefinedDataset",
12
12
  "DataCatalog",
13
13
  "DataResource",
14
14
  "DataPublisher",
@@ -1,3 +1,4 @@
1
+ from pathlib import Path
1
2
  import yaml
2
3
  import logging
3
4
  from functools import cached_property
@@ -38,6 +39,8 @@ class DatasetModel(BaseModel):
38
39
  coverage: DataCoverage | None = None
39
40
  resources: List[DataResource] = []
40
41
  children: Set[str] = set()
42
+ deprecation: Optional[str] = None
43
+ deprecated: bool = False
41
44
 
42
45
  @field_validator("name", mode="after")
43
46
  @classmethod
@@ -57,6 +60,18 @@ class DatasetModel(BaseModel):
57
60
  data["children"] = children
58
61
  return data
59
62
 
63
+ @model_validator(mode="after")
64
+ def evaluate_data(self) -> "DatasetModel":
65
+ # derive deprecated from deprecation notice:
66
+ if self.deprecation is not None:
67
+ self.deprecation = self.deprecation.strip()
68
+ if not len(self.deprecation):
69
+ self.deprecation = None
70
+ self.deprecated = self.deprecation is not None or self.deprecated
71
+ if self.deprecated and (self.coverage is None or self.coverage.end is None):
72
+ raise ValueError("Deprecated dataset coverage must have an end date.")
73
+ return self
74
+
60
75
  def get_resource(self, name: str) -> DataResource:
61
76
  for res in self.resources:
62
77
  if res.name == name:
@@ -68,6 +83,8 @@ class Dataset:
68
83
  """A container for entities, often from one source or related to one topic.
69
84
  A dataset is a set of data, sez W3C."""
70
85
 
86
+ UNDEFINED = "undefined"
87
+
71
88
  def __init__(self: Self, data: Dict[str, Any]) -> None:
72
89
  self.model = DatasetModel.model_validate(data)
73
90
  self.name = self.model.name
@@ -121,10 +138,13 @@ class Dataset:
121
138
  ) -> DS:
122
139
  from followthemoney.dataset.catalog import DataCatalog
123
140
 
141
+ path = Path(path)
124
142
  with open(path, "r") as fh:
125
143
  data = yaml.safe_load(fh)
126
144
  if catalog is None:
127
145
  catalog = DataCatalog(cls, {})
146
+ if "name" not in data:
147
+ data["name"] = path.stem
128
148
  return catalog.make_dataset(data)
129
149
 
130
150
  @classmethod
followthemoney/entity.py CHANGED
@@ -5,6 +5,7 @@ from rigour.names import pick_name
5
5
  from followthemoney.proxy import EntityProxy
6
6
  from followthemoney.schema import Schema
7
7
  from followthemoney.statement import BASE_ID, Statement
8
+ from followthemoney.util import HASH_ENCODING
8
9
 
9
10
  VE = TypeVar("VE", bound="ValueEntity")
10
11
 
@@ -81,6 +82,19 @@ class ValueEntity(EntityProxy):
81
82
  merged.last_change = max(changed, default=None)
82
83
  return merged
83
84
 
85
+ @property
86
+ def checksum(self) -> str:
87
+ digest = self._checksum_digest()
88
+ for dataset in sorted(self.datasets):
89
+ digest.update(dataset.encode(HASH_ENCODING))
90
+ digest.update(b"\x1e")
91
+ for referent in sorted(self.referents):
92
+ digest.update(referent.encode(HASH_ENCODING))
93
+ digest.update(b"\x1e")
94
+ if self.last_change is not None:
95
+ digest.update(self.last_change.encode(HASH_ENCODING))
96
+ return digest.hexdigest()
97
+
84
98
  def to_dict(self) -> Dict[str, Any]:
85
99
  data = super().to_dict()
86
100
  data["referents"] = list(self.referents)
@@ -9,6 +9,7 @@ from typing import TYPE_CHECKING, cast
9
9
  from typing import Any, Dict, Generator, ItemsView, Iterable, List, Optional, Set, Tuple
10
10
 
11
11
  from followthemoney.mapping.source import Record, Source
12
+ from followthemoney.settings import USER_AGENT
12
13
  from followthemoney.util import sanitize_text
13
14
  from followthemoney.exc import InvalidMapping
14
15
 
@@ -64,7 +65,8 @@ class CSVSource(Source):
64
65
  parsed_url = urlparse(url)
65
66
  log.info("Loading: %s", url)
66
67
  if parsed_url.scheme in ["http", "https"]:
67
- res = requests.get(url, stream=True)
68
+ headers = {"User-Agent": USER_AGENT}
69
+ res = requests.get(url, stream=True, headers=headers)
68
70
  if not res.ok:
69
71
  raise InvalidMapping("Failed to open CSV: %s" % url)
70
72
  # if res.encoding is None:
followthemoney/model.py CHANGED
@@ -3,12 +3,14 @@ import yaml
3
3
  from functools import cache
4
4
  from typing import TYPE_CHECKING, Any
5
5
  from typing import Dict, Generator, Iterator, Optional, Set, TypedDict, Union
6
+ from rigour.env import ENCODING
6
7
 
7
8
  from followthemoney.types import registry
8
9
  from followthemoney.types.common import PropertyType, PropertyTypeToDict
9
10
  from followthemoney.schema import Schema, SchemaToDict
10
11
  from followthemoney.property import Property
11
12
  from followthemoney.exc import InvalidModel, InvalidData
13
+ from followthemoney.settings import MODEL_PATH
12
14
  from followthemoney.util import const
13
15
 
14
16
  if TYPE_CHECKING:
@@ -47,10 +49,7 @@ class Model(object):
47
49
  @classmethod
48
50
  def instance(cls) -> "Model":
49
51
  if cls._instance is None:
50
- model_path = os.path.dirname(__file__)
51
- model_path = os.path.join(model_path, "schema")
52
- model_path = os.environ.get("FTM_MODEL_PATH", model_path)
53
- cls._instance = cls(model_path)
52
+ cls._instance = cls(MODEL_PATH)
54
53
  return cls._instance
55
54
 
56
55
  def generate(self) -> None:
@@ -68,7 +67,7 @@ class Model(object):
68
67
  schema.properties[prop.name] = prop
69
68
 
70
69
  def _load(self, filepath: str) -> None:
71
- with open(filepath, "r", encoding="utf-8") as fh:
70
+ with open(filepath, "r", encoding=ENCODING) as fh:
72
71
  data = yaml.safe_load(fh)
73
72
  if not isinstance(data, dict):
74
73
  raise InvalidModel("Model file is not a mapping: %s" % filepath)
followthemoney/proxy.py CHANGED
@@ -1,3 +1,4 @@
1
+ import hashlib
1
2
  import logging
2
3
  from typing import TYPE_CHECKING, cast, Any
3
4
  from typing import Dict, Generator, List, Optional, Set, Tuple, Union, Type, TypeVar
@@ -10,13 +11,14 @@ from followthemoney.types import registry
10
11
  from followthemoney.types.common import PropertyType
11
12
  from followthemoney.property import Property
12
13
  from followthemoney.value import string_list, Values
13
- from followthemoney.util import sanitize_text, gettext
14
+ from followthemoney.util import HASH_ENCODING, sanitize_text, gettext
14
15
  from followthemoney.util import merge_context, make_entity_id
15
16
  from followthemoney.model import Model
16
17
  from followthemoney.schema import Schema
17
18
 
18
19
  if TYPE_CHECKING:
19
20
  from followthemoney.model import Model
21
+ from hashlib import _Hash
20
22
 
21
23
  log = logging.getLogger(__name__)
22
24
  P = Union[Property, str]
@@ -437,6 +439,28 @@ class EntityProxy(object):
437
439
  self.add(prop, values, cleaned=True, quiet=True)
438
440
  return self
439
441
 
442
+ def _checksum_digest(self) -> "_Hash":
443
+ """Create a SHA1 digest of the entity's ID, schema and properties for
444
+ change detection. This is returned as a hashlib digest object so that
445
+ it can be subclassed."""
446
+ digest = hashlib.sha1()
447
+ if self.id is not None:
448
+ digest.update(self.id.encode(HASH_ENCODING))
449
+ digest.update(self.schema.name.encode(HASH_ENCODING))
450
+ for prop in sorted(self._properties.keys()):
451
+ digest.update(prop.encode(HASH_ENCODING))
452
+ for value in sorted(self._properties[prop]):
453
+ digest.update(value.encode(HASH_ENCODING))
454
+ digest.update(b"\x1e")
455
+ digest.update(b"\x1f")
456
+ return digest
457
+
458
+ @property
459
+ def checksum(self) -> str:
460
+ """A SHA1 checksum hexdigest representing the current state of the
461
+ entity proxy. This can be used for change detection."""
462
+ return self._checksum_digest().hexdigest()
463
+
440
464
  def __getstate__(self) -> Dict[str, Any]:
441
465
  data = {slot: getattr(self, slot) for slot in self.__slots__}
442
466
  data["schema"] = self.schema.name
@@ -460,13 +484,13 @@ class EntityProxy(object):
460
484
 
461
485
  def __hash__(self) -> int:
462
486
  if self.id is None:
463
- raise RuntimeError("Cannot hash entity without an ID")
487
+ raise RuntimeError("Unhashable entity proxy without ID.")
464
488
  return hash(self.id)
465
489
 
466
490
  def __eq__(self, other: Any) -> bool:
467
491
  try:
468
492
  if self.id is None or other.id is None:
469
- raise RuntimeError("Cannot compare entities without IDs.")
493
+ raise RuntimeError("Cannot compare entity proxies without IDs.")
470
494
  return bool(self.id == other.id)
471
495
  except AttributeError:
472
496
  return False
@@ -19,6 +19,7 @@ Company:
19
19
  caption:
20
20
  - name
21
21
  - alias
22
+ - abbreviation
22
23
  - weakAlias
23
24
  - previousName
24
25
  - registrationNumber
@@ -26,6 +26,10 @@ CryptoWallet:
26
26
  maxLength: 128
27
27
  privateKey:
28
28
  label: Private key
29
+ accountId:
30
+ label: Account ID
31
+ description: Platform-specific user/account identifier
32
+ type: identifier
29
33
  creationDate:
30
34
  label: Creation date
31
35
  type: date
@@ -1,4 +1,7 @@
1
1
  Image:
2
+ # This schema defines an image file entity within the FollowTheMoney data model.
3
+ # If a `checksum` property is present, consider loading it from an Aleph archive
4
+ # or FtM data lake. Otherwise, use `sourceUrl` to fetch the image directly.
2
5
  extends:
3
6
  - Document
4
7
  label: Image
@@ -23,3 +26,7 @@ Image:
23
26
  label: "Images"
24
27
  type: entity
25
28
  range: Person
29
+ credit:
30
+ label: "Credit"
31
+ description: "The credit or attribution for the image."
32
+ type: string
@@ -18,6 +18,7 @@ LegalEntity:
18
18
  caption:
19
19
  - name
20
20
  - alias
21
+ - abbreviation
21
22
  - weakAlias
22
23
  - previousName
23
24
  - email
@@ -29,6 +30,12 @@ LegalEntity:
29
30
  end:
30
31
  - dissolutionDate
31
32
  properties:
33
+ abbreviation:
34
+ label: Abbreviation
35
+ type: name
36
+ description: "Abbreviated name or acronym"
37
+ # TODO: is un-matchable wise? The idea is to handle it like `weakAlias` rather than `alias`.
38
+ matchable: false
32
39
  email:
33
40
  label: E-Mail
34
41
  type: email
@@ -18,6 +18,7 @@ Organization:
18
18
  caption:
19
19
  - name
20
20
  - alias
21
+ - abbreviation
21
22
  - weakAlias
22
23
  - previousName
23
24
  - registrationNumber
@@ -15,8 +15,9 @@ Person:
15
15
  caption:
16
16
  - name
17
17
  - alias
18
- - weakAlias
19
18
  - previousName
19
+ - weakAlias
20
+ - abbreviation
20
21
  - lastName
21
22
  - email
22
23
  - phone
@@ -14,6 +14,7 @@ PublicBody:
14
14
  caption:
15
15
  - name
16
16
  - alias
17
+ - abbreviation
17
18
  - weakAlias
18
19
  - previousName
19
20
  required:
@@ -0,0 +1,19 @@
1
+ import os
2
+ import requests
3
+ from typing import List
4
+ from rigour.env import env_opt, env_str
5
+
6
+
7
+ def get_env_list(name: str, default: List[str] = []) -> List[str]:
8
+ value = env_opt(name)
9
+ if value is not None:
10
+ values = value.split(":")
11
+ if len(values):
12
+ return values
13
+ return default
14
+
15
+
16
+ MODEL_PATH = os.path.join(os.path.dirname(__file__), "schema")
17
+ MODEL_PATH = env_str("FTM_MODEL_PATH", MODEL_PATH)
18
+
19
+ USER_AGENT = env_str("FTM_USER_AGENT", requests.utils.default_user_agent())
@@ -1,6 +1,6 @@
1
1
  from hashlib import sha1
2
2
  from collections.abc import Mapping
3
- from typing import Any, Dict, List, Optional, Set, Type
3
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Type
4
4
  from typing import Generator, Iterable, Tuple, TypeVar
5
5
  from rigour.langs import LangStr
6
6
  from rigour.names.pick import pick_lang_name
@@ -10,17 +10,20 @@ from followthemoney.exc import InvalidData
10
10
  from followthemoney.schema import Schema
11
11
  from followthemoney.types.common import PropertyType
12
12
  from followthemoney.property import Property
13
- from followthemoney.util import gettext
13
+ from followthemoney.util import HASH_ENCODING, gettext
14
14
  from followthemoney.proxy import P
15
15
  from followthemoney.types import registry
16
16
  from followthemoney.value import string_list, Values
17
17
  from followthemoney.proxy import EntityProxy
18
- from followthemoney.dataset import Dataset, DefaultDataset
18
+ from followthemoney.dataset import Dataset, UndefinedDataset
19
19
  from followthemoney.statement.statement import Statement
20
20
  from followthemoney.statement.util import BASE_ID
21
21
 
22
22
  SE = TypeVar("SE", bound="StatementEntity")
23
23
 
24
+ if TYPE_CHECKING:
25
+ from hashlib import _Hash
26
+
24
27
 
25
28
  class StatementEntity(EntityProxy):
26
29
  """An entity object that can link to a set of datasets that it is sourced from."""
@@ -35,7 +38,12 @@ class StatementEntity(EntityProxy):
35
38
  "_statements",
36
39
  )
37
40
 
38
- def __init__(self, dataset: Dataset, data: Dict[str, Any], cleaned: bool = True):
41
+ def __init__(
42
+ self,
43
+ dataset: Dataset,
44
+ data: Dict[str, Any],
45
+ cleaned: bool = True,
46
+ ) -> None:
39
47
  data = dict(data or {})
40
48
  schema = Model.instance().get(data.pop("schema", None))
41
49
  if schema is None:
@@ -76,8 +84,7 @@ class StatementEntity(EntityProxy):
76
84
  for stmts in self._statements.values():
77
85
  for stmt in stmts:
78
86
  if stmt.entity_id is None and self.id is not None:
79
- stmt.entity_id = self.id
80
- stmt.id = stmt.generate_key()
87
+ stmt = stmt.clone(entity_id=self.id)
81
88
  if stmt.id is None:
82
89
  stmt.id = stmt.generate_key()
83
90
  yield stmt
@@ -97,9 +104,9 @@ class StatementEntity(EntityProxy):
97
104
  if stmt.first_seen is not None:
98
105
  first_seen.add(stmt.first_seen)
99
106
  if self.id is not None:
100
- digest = sha1(self.schema.name.encode("utf-8"))
107
+ digest = sha1(self.schema.name.encode(HASH_ENCODING))
101
108
  for id in sorted(ids):
102
- digest.update(id.encode("utf-8"))
109
+ digest.update(id.encode(HASH_ENCODING))
103
110
  checksum = digest.hexdigest()
104
111
  # This is to make the last_change value stable across
105
112
  # serialisation:
@@ -183,6 +190,11 @@ class StatementEntity(EntityProxy):
183
190
  return []
184
191
  return list(self._statements[prop_name])
185
192
 
193
+ @property
194
+ def has_statements(self) -> bool:
195
+ """Return whether the entity has any statements."""
196
+ return len(self._statements) > 0
197
+
186
198
  def set(
187
199
  self,
188
200
  prop: P,
@@ -426,7 +438,7 @@ class StatementEntity(EntityProxy):
426
438
  origins.add(stmt.origin)
427
439
 
428
440
  data["referents"] = list(referents)
429
- data["datasets"] = list(datasets)
441
+ data["datasets"] = [d for d in datasets if d != Dataset.UNDEFINED]
430
442
  if origins:
431
443
  data["origin"] = list(origins)
432
444
 
@@ -449,6 +461,23 @@ class StatementEntity(EntityProxy):
449
461
  data["statements"] = [stmt.to_dict() for stmt in self.statements]
450
462
  return data
451
463
 
464
+ def _checksum_digest(self) -> "_Hash":
465
+ """Create a SHA1 digest of the entity's ID, schema and properties for
466
+ change detection. This is returned as a hashlib digest object so that
467
+ it can be subclassed."""
468
+ digest = sha1()
469
+ if self.id is not None:
470
+ digest.update(self.id.encode(HASH_ENCODING))
471
+ statement_ids: List[str] = []
472
+ for stmts in self._statements.values():
473
+ for stmt in stmts:
474
+ if stmt.id is not None:
475
+ statement_ids.append(stmt.id)
476
+ for stmt_id in sorted(statement_ids):
477
+ digest.update(stmt_id.encode(HASH_ENCODING))
478
+ digest.update(b"\x1e")
479
+ return digest
480
+
452
481
  def __len__(self) -> int:
453
482
  return len(list(self._iter_stmt())) + 1
454
483
 
@@ -460,7 +489,7 @@ class StatementEntity(EntityProxy):
460
489
  default_dataset: Optional[Dataset] = None,
461
490
  ) -> SE:
462
491
  # Exists only for backwards compatibility.
463
- dataset = default_dataset or DefaultDataset
492
+ dataset = default_dataset or UndefinedDataset
464
493
  return cls(dataset, data, cleaned=cleaned)
465
494
 
466
495
  @classmethod