followthemoney 4.3.0__py3-none-any.whl → 4.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- followthemoney/__init__.py +1 -1
- followthemoney/compare.py +6 -0
- followthemoney/dataset/dataset.py +18 -0
- followthemoney/entity.py +29 -15
- followthemoney/mapping/csv.py +3 -1
- followthemoney/model.py +6 -5
- followthemoney/property.py +23 -4
- followthemoney/proxy.py +32 -11
- followthemoney/schema/Company.yaml +5 -0
- followthemoney/schema/CryptoWallet.yaml +4 -0
- followthemoney/schema/Image.yaml +7 -0
- followthemoney/schema/LegalEntity.yaml +10 -0
- followthemoney/schema/Organization.yaml +5 -0
- followthemoney/schema/Person.yaml +4 -0
- followthemoney/schema/PublicBody.yaml +4 -0
- followthemoney/schema/Thing.yaml +3 -2
- followthemoney/schema.py +16 -2
- followthemoney/settings.py +19 -0
- followthemoney/statement/entity.py +31 -7
- followthemoney/statement/serialize.py +18 -13
- followthemoney/statement/statement.py +151 -42
- followthemoney/statement/util.py +23 -2
- followthemoney/types/address.py +3 -3
- followthemoney/types/checksum.py +3 -3
- followthemoney/types/country.py +19 -4
- followthemoney/types/date.py +13 -3
- followthemoney/types/entity.py +3 -3
- followthemoney/types/gender.py +6 -6
- followthemoney/types/identifier.py +8 -8
- followthemoney/types/ip.py +3 -3
- followthemoney/types/json.py +2 -2
- followthemoney/types/language.py +3 -3
- followthemoney/types/mimetype.py +3 -3
- followthemoney/types/name.py +3 -3
- followthemoney/types/number.py +2 -2
- followthemoney/types/phone.py +3 -3
- followthemoney/types/string.py +2 -2
- followthemoney/types/topic.py +6 -3
- followthemoney/types/url.py +3 -3
- followthemoney/util.py +6 -14
- {followthemoney-4.3.0.dist-info → followthemoney-4.5.0.dist-info}/METADATA +3 -3
- {followthemoney-4.3.0.dist-info → followthemoney-4.5.0.dist-info}/RECORD +45 -44
- {followthemoney-4.3.0.dist-info → followthemoney-4.5.0.dist-info}/WHEEL +1 -1
- {followthemoney-4.3.0.dist-info → followthemoney-4.5.0.dist-info}/entry_points.txt +0 -0
- {followthemoney-4.3.0.dist-info → followthemoney-4.5.0.dist-info}/licenses/LICENSE +0 -0
followthemoney/__init__.py
CHANGED
|
@@ -9,7 +9,7 @@ from followthemoney.statement import Statement, StatementEntity, SE
|
|
|
9
9
|
from followthemoney.dataset import Dataset, DefaultDataset, DS
|
|
10
10
|
from followthemoney.util import set_model_locale
|
|
11
11
|
|
|
12
|
-
__version__ = "4.3.0"
|
|
12
|
+
__version__ = "4.5.0"
|
|
13
13
|
|
|
14
14
|
# Data model singleton
|
|
15
15
|
model = Model.instance()
|
followthemoney/compare.py
CHANGED
|
@@ -77,6 +77,12 @@ def compare(
|
|
|
77
77
|
weights: Weights = COMPARE_WEIGHTS,
|
|
78
78
|
) -> float:
|
|
79
79
|
"""Compare two entities and return a match score."""
|
|
80
|
+
if left.checksum == right.checksum:
|
|
81
|
+
# Check if there is any data at all (ie any basis for making a decision),
|
|
82
|
+
# if so, return a perfect match. This avoids marking two empty entities
|
|
83
|
+
# as matching. Bit ambiguous, but practical.
|
|
84
|
+
if len(left.properties) > 0 and len(right.properties) > 0:
|
|
85
|
+
return 1.0
|
|
80
86
|
scores = compare_scores(left, right)
|
|
81
87
|
return _compare(scores, weights)
|
|
82
88
|
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
from pathlib import Path
|
|
1
2
|
import yaml
|
|
2
3
|
import logging
|
|
3
4
|
from functools import cached_property
|
|
@@ -38,6 +39,8 @@ class DatasetModel(BaseModel):
|
|
|
38
39
|
coverage: DataCoverage | None = None
|
|
39
40
|
resources: List[DataResource] = []
|
|
40
41
|
children: Set[str] = set()
|
|
42
|
+
deprecation: Optional[str] = None
|
|
43
|
+
deprecated: bool = False
|
|
41
44
|
|
|
42
45
|
@field_validator("name", mode="after")
|
|
43
46
|
@classmethod
|
|
@@ -57,6 +60,18 @@ class DatasetModel(BaseModel):
|
|
|
57
60
|
data["children"] = children
|
|
58
61
|
return data
|
|
59
62
|
|
|
63
|
+
@model_validator(mode="after")
|
|
64
|
+
def evaluate_data(self) -> "DatasetModel":
|
|
65
|
+
# derive deprecated from deprecation notice:
|
|
66
|
+
if self.deprecation is not None:
|
|
67
|
+
self.deprecation = self.deprecation.strip()
|
|
68
|
+
if not len(self.deprecation):
|
|
69
|
+
self.deprecation = None
|
|
70
|
+
self.deprecated = self.deprecation is not None or self.deprecated
|
|
71
|
+
if self.deprecated and (self.coverage is None or self.coverage.end is None):
|
|
72
|
+
raise ValueError("Deprecated dataset coverage must have an end date.")
|
|
73
|
+
return self
|
|
74
|
+
|
|
60
75
|
def get_resource(self, name: str) -> DataResource:
|
|
61
76
|
for res in self.resources:
|
|
62
77
|
if res.name == name:
|
|
@@ -121,10 +136,13 @@ class Dataset:
|
|
|
121
136
|
) -> DS:
|
|
122
137
|
from followthemoney.dataset.catalog import DataCatalog
|
|
123
138
|
|
|
139
|
+
path = Path(path)
|
|
124
140
|
with open(path, "r") as fh:
|
|
125
141
|
data = yaml.safe_load(fh)
|
|
126
142
|
if catalog is None:
|
|
127
143
|
catalog = DataCatalog(cls, {})
|
|
144
|
+
if "name" not in data:
|
|
145
|
+
data["name"] = path.stem
|
|
128
146
|
return catalog.make_dataset(data)
|
|
129
147
|
|
|
130
148
|
@classmethod
|
followthemoney/entity.py
CHANGED
|
@@ -5,6 +5,7 @@ from rigour.names import pick_name
|
|
|
5
5
|
from followthemoney.proxy import EntityProxy
|
|
6
6
|
from followthemoney.schema import Schema
|
|
7
7
|
from followthemoney.statement import BASE_ID, Statement
|
|
8
|
+
from followthemoney.util import HASH_ENCODING
|
|
8
9
|
|
|
9
10
|
VE = TypeVar("VE", bound="ValueEntity")
|
|
10
11
|
|
|
@@ -42,25 +43,28 @@ class ValueEntity(EntityProxy):
|
|
|
42
43
|
key_prefix: Optional[str] = None,
|
|
43
44
|
cleaned: bool = True,
|
|
44
45
|
):
|
|
46
|
+
self._caption: Optional[str] = data.pop("caption", None)
|
|
47
|
+
self.datasets: Set[str] = set(data.pop("datasets", []))
|
|
48
|
+
self.referents: Set[str] = set(data.pop("referents", []))
|
|
49
|
+
self.first_seen: Optional[str] = data.pop("first_seen", None)
|
|
50
|
+
self.last_seen: Optional[str] = data.pop("last_seen", None)
|
|
51
|
+
self.last_change: Optional[str] = data.pop("last_change", None)
|
|
45
52
|
super().__init__(schema, data, key_prefix=key_prefix, cleaned=cleaned)
|
|
46
|
-
self._caption: Optional[str] = data.get("caption")
|
|
47
|
-
self.datasets: Set[str] = set(data.get("datasets", []))
|
|
48
|
-
self.referents: Set[str] = set(data.get("referents", []))
|
|
49
|
-
self.first_seen: Optional[str] = data.get("first_seen")
|
|
50
|
-
self.last_seen: Optional[str] = data.get("last_seen")
|
|
51
|
-
self.last_change: Optional[str] = data.get("last_change")
|
|
52
53
|
|
|
53
54
|
# add data from statement dict if present.
|
|
54
55
|
# this updates the dataset and referents set
|
|
55
56
|
for stmt_data in data.pop("statements", []):
|
|
56
57
|
stmt = Statement.from_dict(stmt_data)
|
|
58
|
+
prop = schema.get(stmt.prop)
|
|
59
|
+
if prop is None:
|
|
60
|
+
continue
|
|
57
61
|
self.datasets.add(stmt.dataset)
|
|
58
62
|
if stmt.schema != self.schema.name:
|
|
59
63
|
self.schema = schema.model.common_schema(self.schema, stmt.schema)
|
|
60
64
|
if stmt.entity_id != self.id:
|
|
61
65
|
self.referents.add(stmt.entity_id)
|
|
62
66
|
if stmt.prop != BASE_ID:
|
|
63
|
-
self.
|
|
67
|
+
self.unsafe_add(prop, stmt.value, cleaned=cleaned)
|
|
64
68
|
|
|
65
69
|
def merge(self: VE, other: EntityProxy) -> VE:
|
|
66
70
|
merged = super().merge(other)
|
|
@@ -78,15 +82,25 @@ class ValueEntity(EntityProxy):
|
|
|
78
82
|
merged.last_change = max(changed, default=None)
|
|
79
83
|
return merged
|
|
80
84
|
|
|
85
|
+
@property
|
|
86
|
+
def checksum(self) -> str:
|
|
87
|
+
digest = self._checksum_digest()
|
|
88
|
+
for dataset in sorted(self.datasets):
|
|
89
|
+
digest.update(dataset.encode(HASH_ENCODING))
|
|
90
|
+
digest.update(b"\x1e")
|
|
91
|
+
for referent in sorted(self.referents):
|
|
92
|
+
digest.update(referent.encode(HASH_ENCODING))
|
|
93
|
+
digest.update(b"\x1e")
|
|
94
|
+
if self.last_change is not None:
|
|
95
|
+
digest.update(self.last_change.encode(HASH_ENCODING))
|
|
96
|
+
return digest.hexdigest()
|
|
97
|
+
|
|
81
98
|
def to_dict(self) -> Dict[str, Any]:
|
|
82
|
-
data
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
"
|
|
87
|
-
"referents": list(self.referents),
|
|
88
|
-
"datasets": list(self.datasets),
|
|
89
|
-
}
|
|
99
|
+
data = super().to_dict()
|
|
100
|
+
data["referents"] = list(self.referents)
|
|
101
|
+
data["datasets"] = list(self.datasets)
|
|
102
|
+
if self._caption is not None:
|
|
103
|
+
data["caption"] = self._caption
|
|
90
104
|
if self.first_seen is not None:
|
|
91
105
|
data["first_seen"] = self.first_seen
|
|
92
106
|
if self.last_seen is not None:
|
followthemoney/mapping/csv.py
CHANGED
|
@@ -9,6 +9,7 @@ from typing import TYPE_CHECKING, cast
|
|
|
9
9
|
from typing import Any, Dict, Generator, ItemsView, Iterable, List, Optional, Set, Tuple
|
|
10
10
|
|
|
11
11
|
from followthemoney.mapping.source import Record, Source
|
|
12
|
+
from followthemoney.settings import USER_AGENT
|
|
12
13
|
from followthemoney.util import sanitize_text
|
|
13
14
|
from followthemoney.exc import InvalidMapping
|
|
14
15
|
|
|
@@ -64,7 +65,8 @@ class CSVSource(Source):
|
|
|
64
65
|
parsed_url = urlparse(url)
|
|
65
66
|
log.info("Loading: %s", url)
|
|
66
67
|
if parsed_url.scheme in ["http", "https"]:
|
|
67
|
-
|
|
68
|
+
headers = {"User-Agent": USER_AGENT}
|
|
69
|
+
res = requests.get(url, stream=True, headers=headers)
|
|
68
70
|
if not res.ok:
|
|
69
71
|
raise InvalidMapping("Failed to open CSV: %s" % url)
|
|
70
72
|
# if res.encoding is None:
|
followthemoney/model.py
CHANGED
|
@@ -3,12 +3,15 @@ import yaml
|
|
|
3
3
|
from functools import cache
|
|
4
4
|
from typing import TYPE_CHECKING, Any
|
|
5
5
|
from typing import Dict, Generator, Iterator, Optional, Set, TypedDict, Union
|
|
6
|
+
from rigour.env import ENCODING
|
|
6
7
|
|
|
7
8
|
from followthemoney.types import registry
|
|
8
9
|
from followthemoney.types.common import PropertyType, PropertyTypeToDict
|
|
9
10
|
from followthemoney.schema import Schema, SchemaToDict
|
|
10
11
|
from followthemoney.property import Property
|
|
11
12
|
from followthemoney.exc import InvalidModel, InvalidData
|
|
13
|
+
from followthemoney.settings import MODEL_PATH
|
|
14
|
+
from followthemoney.util import const
|
|
12
15
|
|
|
13
16
|
if TYPE_CHECKING:
|
|
14
17
|
from followthemoney.proxy import EntityProxy
|
|
@@ -46,10 +49,7 @@ class Model(object):
|
|
|
46
49
|
@classmethod
|
|
47
50
|
def instance(cls) -> "Model":
|
|
48
51
|
if cls._instance is None:
|
|
49
|
-
|
|
50
|
-
model_path = os.path.join(model_path, "schema")
|
|
51
|
-
model_path = os.environ.get("FTM_MODEL_PATH", model_path)
|
|
52
|
-
cls._instance = cls(model_path)
|
|
52
|
+
cls._instance = cls(MODEL_PATH)
|
|
53
53
|
return cls._instance
|
|
54
54
|
|
|
55
55
|
def generate(self) -> None:
|
|
@@ -67,11 +67,12 @@ class Model(object):
|
|
|
67
67
|
schema.properties[prop.name] = prop
|
|
68
68
|
|
|
69
69
|
def _load(self, filepath: str) -> None:
|
|
70
|
-
with open(filepath, "r", encoding=
|
|
70
|
+
with open(filepath, "r", encoding=ENCODING) as fh:
|
|
71
71
|
data = yaml.safe_load(fh)
|
|
72
72
|
if not isinstance(data, dict):
|
|
73
73
|
raise InvalidModel("Model file is not a mapping: %s" % filepath)
|
|
74
74
|
for name, config in data.items():
|
|
75
|
+
name = const(name)
|
|
75
76
|
self.schemata[name] = Schema(self, name, config)
|
|
76
77
|
|
|
77
78
|
def get(self, name: Union[str, Schema]) -> Optional[Schema]:
|
followthemoney/property.py
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
import re
|
|
2
2
|
from banal import is_mapping, as_bool
|
|
3
|
+
from rigour.ids import get_identifier_format
|
|
3
4
|
from typing import TYPE_CHECKING, Any, List, Optional, TypedDict
|
|
4
5
|
|
|
5
|
-
from followthemoney.exc import InvalidModel
|
|
6
|
+
from followthemoney.exc import InvalidData, InvalidModel
|
|
6
7
|
from followthemoney.types import registry
|
|
7
8
|
from followthemoney.util import gettext, get_entity_id, const
|
|
8
9
|
|
|
@@ -86,17 +87,16 @@ class Property:
|
|
|
86
87
|
self.schema = schema
|
|
87
88
|
|
|
88
89
|
#: Machine-readable name for this property.
|
|
89
|
-
self.name =
|
|
90
|
+
self.name = name
|
|
90
91
|
if not check_property_name(self.name):
|
|
91
92
|
raise InvalidModel("Invalid name: %s" % self.name)
|
|
92
93
|
|
|
93
94
|
#: Qualified property name, which also includes the schema name.
|
|
94
95
|
self.qname = const("%s:%s" % (schema.name, self.name))
|
|
95
96
|
|
|
96
|
-
self._hash = hash("<Property(%r)>" % self.qname)
|
|
97
|
-
|
|
98
97
|
self._label = data.get("label", name)
|
|
99
98
|
self._description = data.get("description")
|
|
99
|
+
self._hash = hash("<Property(%r)>" % self.qname)
|
|
100
100
|
|
|
101
101
|
#: This property is deprecated and should not be used.
|
|
102
102
|
self.deprecated = as_bool(data.get("deprecated", False))
|
|
@@ -157,6 +157,13 @@ class Property:
|
|
|
157
157
|
raise InvalidModel("Invalid reverse: %s" % self)
|
|
158
158
|
self.reverse = self.range._add_reverse(model, self._reverse, self)
|
|
159
159
|
|
|
160
|
+
if self.type == registry.identifier and self.format is not None:
|
|
161
|
+
format_ = get_identifier_format(self.format)
|
|
162
|
+
if format_ is None or format_.NAME != self.format:
|
|
163
|
+
raise InvalidModel("Invalid identifier format: %s" % self.format)
|
|
164
|
+
# Internalize the string:
|
|
165
|
+
self.format = format_.NAME
|
|
166
|
+
|
|
160
167
|
@property
|
|
161
168
|
def label(self) -> str:
|
|
162
169
|
"""User-facing title for this property."""
|
|
@@ -229,6 +236,18 @@ class Property:
|
|
|
229
236
|
data["format"] = self.format
|
|
230
237
|
return data
|
|
231
238
|
|
|
239
|
+
def __reduce__(self) -> Any:
|
|
240
|
+
return (self._reconstruct, (self.qname,))
|
|
241
|
+
|
|
242
|
+
@classmethod
|
|
243
|
+
def _reconstruct(cls, qname: str) -> "Property":
|
|
244
|
+
from followthemoney.model import Model
|
|
245
|
+
|
|
246
|
+
prop = Model.instance().get_qname(qname)
|
|
247
|
+
if prop is None:
|
|
248
|
+
raise InvalidData("Unknown property: %r" % qname)
|
|
249
|
+
return prop
|
|
250
|
+
|
|
232
251
|
def __repr__(self) -> str:
|
|
233
252
|
return "<Property(%r)>" % self.qname
|
|
234
253
|
|
followthemoney/proxy.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import hashlib
|
|
1
2
|
import logging
|
|
2
3
|
from typing import TYPE_CHECKING, cast, Any
|
|
3
4
|
from typing import Dict, Generator, List, Optional, Set, Tuple, Union, Type, TypeVar
|
|
@@ -10,13 +11,14 @@ from followthemoney.types import registry
|
|
|
10
11
|
from followthemoney.types.common import PropertyType
|
|
11
12
|
from followthemoney.property import Property
|
|
12
13
|
from followthemoney.value import string_list, Values
|
|
13
|
-
from followthemoney.util import sanitize_text, gettext
|
|
14
|
+
from followthemoney.util import HASH_ENCODING, sanitize_text, gettext
|
|
14
15
|
from followthemoney.util import merge_context, make_entity_id
|
|
15
16
|
from followthemoney.model import Model
|
|
16
17
|
from followthemoney.schema import Schema
|
|
17
18
|
|
|
18
19
|
if TYPE_CHECKING:
|
|
19
20
|
from followthemoney.model import Model
|
|
21
|
+
from hashlib import _Hash
|
|
20
22
|
|
|
21
23
|
log = logging.getLogger(__name__)
|
|
22
24
|
P = Union[Property, str]
|
|
@@ -403,13 +405,10 @@ class EntityProxy(object):
|
|
|
403
405
|
schema and any contextual values that were handed in initially. The resulting
|
|
404
406
|
dictionary can be used to make a new proxy, and it is commonly written to disk
|
|
405
407
|
or a database."""
|
|
406
|
-
data = dict(self.context)
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
"properties": self.properties,
|
|
411
|
-
}
|
|
412
|
-
data.update(extra)
|
|
408
|
+
data: Dict[str, Any] = dict(self.context)
|
|
409
|
+
data["id"] = self.id
|
|
410
|
+
data["schema"] = self.schema.name
|
|
411
|
+
data["properties"] = self.properties
|
|
413
412
|
return data
|
|
414
413
|
|
|
415
414
|
def to_full_dict(self, matchable: bool = False) -> Dict[str, Any]:
|
|
@@ -440,6 +439,28 @@ class EntityProxy(object):
|
|
|
440
439
|
self.add(prop, values, cleaned=True, quiet=True)
|
|
441
440
|
return self
|
|
442
441
|
|
|
442
|
+
def _checksum_digest(self) -> "_Hash":
|
|
443
|
+
"""Create a SHA1 digest of the entity's ID, schema and properties for
|
|
444
|
+
change detection. This is returned as a hashlib digest object so that
|
|
445
|
+
it can be subclassed."""
|
|
446
|
+
digest = hashlib.sha1()
|
|
447
|
+
if self.id is not None:
|
|
448
|
+
digest.update(self.id.encode(HASH_ENCODING))
|
|
449
|
+
digest.update(self.schema.name.encode(HASH_ENCODING))
|
|
450
|
+
for prop in sorted(self._properties.keys()):
|
|
451
|
+
digest.update(prop.encode(HASH_ENCODING))
|
|
452
|
+
for value in sorted(self._properties[prop]):
|
|
453
|
+
digest.update(value.encode(HASH_ENCODING))
|
|
454
|
+
digest.update(b"\x1e")
|
|
455
|
+
digest.update(b"\x1f")
|
|
456
|
+
return digest
|
|
457
|
+
|
|
458
|
+
@property
|
|
459
|
+
def checksum(self) -> str:
|
|
460
|
+
"""A SHA1 checksum hexdigest representing the current state of the
|
|
461
|
+
entity proxy. This can be used for change detection."""
|
|
462
|
+
return self._checksum_digest().hexdigest()
|
|
463
|
+
|
|
443
464
|
def __getstate__(self) -> Dict[str, Any]:
|
|
444
465
|
data = {slot: getattr(self, slot) for slot in self.__slots__}
|
|
445
466
|
data["schema"] = self.schema.name
|
|
@@ -462,14 +483,14 @@ class EntityProxy(object):
|
|
|
462
483
|
return self._size
|
|
463
484
|
|
|
464
485
|
def __hash__(self) -> int:
|
|
465
|
-
if
|
|
466
|
-
raise RuntimeError("
|
|
486
|
+
if self.id is None:
|
|
487
|
+
raise RuntimeError("Unhashable entity proxy without ID.")
|
|
467
488
|
return hash(self.id)
|
|
468
489
|
|
|
469
490
|
def __eq__(self, other: Any) -> bool:
|
|
470
491
|
try:
|
|
471
492
|
if self.id is None or other.id is None:
|
|
472
|
-
raise RuntimeError("Cannot compare
|
|
493
|
+
raise RuntimeError("Cannot compare entity proxies without IDs.")
|
|
473
494
|
return bool(self.id == other.id)
|
|
474
495
|
except AttributeError:
|
|
475
496
|
return False
|
followthemoney/schema/Image.yaml
CHANGED
|
@@ -1,4 +1,7 @@
|
|
|
1
1
|
Image:
|
|
2
|
+
# This schema defines an image file entity within the FollowTheMoney data model.
|
|
3
|
+
# If a `checksum` property is present, consider loading it from an Aleph archive
|
|
4
|
+
# or FtM data lake. Otherwise, use `sourceUrl` to fetch the image directly.
|
|
2
5
|
extends:
|
|
3
6
|
- Document
|
|
4
7
|
label: Image
|
|
@@ -23,3 +26,7 @@ Image:
|
|
|
23
26
|
label: "Images"
|
|
24
27
|
type: entity
|
|
25
28
|
range: Person
|
|
29
|
+
credit:
|
|
30
|
+
label: "Credit"
|
|
31
|
+
description: "The credit or attribution for the image."
|
|
32
|
+
type: string
|
|
@@ -17,6 +17,10 @@ LegalEntity:
|
|
|
17
17
|
- name
|
|
18
18
|
caption:
|
|
19
19
|
- name
|
|
20
|
+
- alias
|
|
21
|
+
- abbreviation
|
|
22
|
+
- weakAlias
|
|
23
|
+
- previousName
|
|
20
24
|
- email
|
|
21
25
|
- phone
|
|
22
26
|
- registrationNumber
|
|
@@ -26,6 +30,12 @@ LegalEntity:
|
|
|
26
30
|
end:
|
|
27
31
|
- dissolutionDate
|
|
28
32
|
properties:
|
|
33
|
+
abbreviation:
|
|
34
|
+
label: Abbreviation
|
|
35
|
+
type: name
|
|
36
|
+
description: "Abbreviated name or acronym"
|
|
37
|
+
# TODO: is un-matchable wise? The idea is to handle it like `weakAlias` rather than `alias`.
|
|
38
|
+
matchable: false
|
|
29
39
|
email:
|
|
30
40
|
label: E-Mail
|
|
31
41
|
type: email
|
followthemoney/schema/Thing.yaml
CHANGED
|
@@ -24,7 +24,7 @@ Thing:
|
|
|
24
24
|
label: Country
|
|
25
25
|
type: country
|
|
26
26
|
alias:
|
|
27
|
-
label:
|
|
27
|
+
label: Alias
|
|
28
28
|
type: name
|
|
29
29
|
previousName:
|
|
30
30
|
label: Previous name
|
|
@@ -32,6 +32,7 @@ Thing:
|
|
|
32
32
|
weakAlias:
|
|
33
33
|
label: Weak alias
|
|
34
34
|
type: name
|
|
35
|
+
description: "A relatively broad or generic alias that should not be used for matching in screening systems. It may still may be useful for identification purposes, particularly in confirming a possible match triggered by other identifier information."
|
|
35
36
|
matchable: false
|
|
36
37
|
sourceUrl:
|
|
37
38
|
label: Source link
|
|
@@ -55,7 +56,7 @@ Thing:
|
|
|
55
56
|
wikidataId:
|
|
56
57
|
label: Wikidata ID
|
|
57
58
|
type: identifier
|
|
58
|
-
format:
|
|
59
|
+
format: wikidata
|
|
59
60
|
maxLength: 32
|
|
60
61
|
keywords:
|
|
61
62
|
label: Keywords
|
followthemoney/schema.py
CHANGED
|
@@ -106,7 +106,7 @@ class Schema:
|
|
|
106
106
|
|
|
107
107
|
def __init__(self, model: "Model", name: str, data: SchemaSpec) -> None:
|
|
108
108
|
#: Machine-readable name of the schema, used for identification.
|
|
109
|
-
self.name =
|
|
109
|
+
self.name = name
|
|
110
110
|
self.model = model
|
|
111
111
|
self._label = data.get("label", name)
|
|
112
112
|
self._plural = data.get("plural", self.label)
|
|
@@ -191,6 +191,7 @@ class Schema:
|
|
|
191
191
|
#: inherited from parent schemata.
|
|
192
192
|
self.properties: Dict[str, Property] = {}
|
|
193
193
|
for pname, prop in data.get("properties", {}).items():
|
|
194
|
+
pname = const(pname)
|
|
194
195
|
self.properties[pname] = Property(self, pname, prop)
|
|
195
196
|
|
|
196
197
|
def generate(self, model: "Model") -> None:
|
|
@@ -264,6 +265,7 @@ class Schema:
|
|
|
264
265
|
name = data.get("name")
|
|
265
266
|
if name is None:
|
|
266
267
|
raise InvalidModel("Unnamed reverse: %s" % other)
|
|
268
|
+
name = const(name)
|
|
267
269
|
|
|
268
270
|
prop = self.get(name)
|
|
269
271
|
if prop is None:
|
|
@@ -272,7 +274,7 @@ class Schema:
|
|
|
272
274
|
"type": registry.entity.name,
|
|
273
275
|
"reverse": {"name": other.name},
|
|
274
276
|
"range": other.schema.name,
|
|
275
|
-
"hidden": data.get("hidden", other.hidden),
|
|
277
|
+
"hidden": as_bool(data.get("hidden", other.hidden)),
|
|
276
278
|
}
|
|
277
279
|
prop = Property(self, name, spec)
|
|
278
280
|
prop.stub = True
|
|
@@ -466,6 +468,18 @@ class Schema:
|
|
|
466
468
|
data["properties"] = properties
|
|
467
469
|
return data
|
|
468
470
|
|
|
471
|
+
def __reduce__(self) -> Any:
|
|
472
|
+
return (self._reconstruct, (self.name,))
|
|
473
|
+
|
|
474
|
+
@classmethod
|
|
475
|
+
def _reconstruct(cls, name: str) -> "Schema":
|
|
476
|
+
from followthemoney.model import Model
|
|
477
|
+
|
|
478
|
+
schema = Model.instance().get(name)
|
|
479
|
+
if schema is None:
|
|
480
|
+
raise InvalidData("Unknown schema: %r" % name)
|
|
481
|
+
return schema
|
|
482
|
+
|
|
469
483
|
def __eq__(self, other: Any) -> bool:
|
|
470
484
|
"""Compare two schemata (via hash)."""
|
|
471
485
|
try:
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import requests
|
|
3
|
+
from typing import List
|
|
4
|
+
from rigour.env import env_opt, env_str
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def get_env_list(name: str, default: List[str] = []) -> List[str]:
|
|
8
|
+
value = env_opt(name)
|
|
9
|
+
if value is not None:
|
|
10
|
+
values = value.split(":")
|
|
11
|
+
if len(values):
|
|
12
|
+
return values
|
|
13
|
+
return default
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
MODEL_PATH = os.path.join(os.path.dirname(__file__), "schema")
|
|
17
|
+
MODEL_PATH = env_str("FTM_MODEL_PATH", MODEL_PATH)
|
|
18
|
+
|
|
19
|
+
USER_AGENT = env_str("FTM_USER_AGENT", requests.utils.default_user_agent())
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from hashlib import sha1
|
|
2
2
|
from collections.abc import Mapping
|
|
3
|
-
from typing import Any, Dict, List, Optional, Set, Type
|
|
3
|
+
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Type
|
|
4
4
|
from typing import Generator, Iterable, Tuple, TypeVar
|
|
5
5
|
from rigour.langs import LangStr
|
|
6
6
|
from rigour.names.pick import pick_lang_name
|
|
@@ -10,7 +10,7 @@ from followthemoney.exc import InvalidData
|
|
|
10
10
|
from followthemoney.schema import Schema
|
|
11
11
|
from followthemoney.types.common import PropertyType
|
|
12
12
|
from followthemoney.property import Property
|
|
13
|
-
from followthemoney.util import gettext
|
|
13
|
+
from followthemoney.util import HASH_ENCODING, gettext
|
|
14
14
|
from followthemoney.proxy import P
|
|
15
15
|
from followthemoney.types import registry
|
|
16
16
|
from followthemoney.value import string_list, Values
|
|
@@ -21,6 +21,9 @@ from followthemoney.statement.util import BASE_ID
|
|
|
21
21
|
|
|
22
22
|
SE = TypeVar("SE", bound="StatementEntity")
|
|
23
23
|
|
|
24
|
+
if TYPE_CHECKING:
|
|
25
|
+
from hashlib import _Hash
|
|
26
|
+
|
|
24
27
|
|
|
25
28
|
class StatementEntity(EntityProxy):
|
|
26
29
|
"""An entity object that can link to a set of datasets that it is sourced from."""
|
|
@@ -35,7 +38,12 @@ class StatementEntity(EntityProxy):
|
|
|
35
38
|
"_statements",
|
|
36
39
|
)
|
|
37
40
|
|
|
38
|
-
def __init__(
|
|
41
|
+
def __init__(
|
|
42
|
+
self,
|
|
43
|
+
dataset: Dataset,
|
|
44
|
+
data: Dict[str, Any],
|
|
45
|
+
cleaned: bool = True,
|
|
46
|
+
) -> None:
|
|
39
47
|
data = dict(data or {})
|
|
40
48
|
schema = Model.instance().get(data.pop("schema", None))
|
|
41
49
|
if schema is None:
|
|
@@ -76,8 +84,7 @@ class StatementEntity(EntityProxy):
|
|
|
76
84
|
for stmts in self._statements.values():
|
|
77
85
|
for stmt in stmts:
|
|
78
86
|
if stmt.entity_id is None and self.id is not None:
|
|
79
|
-
stmt.entity_id
|
|
80
|
-
stmt.id = stmt.generate_key()
|
|
87
|
+
stmt = stmt.clone(entity_id=self.id)
|
|
81
88
|
if stmt.id is None:
|
|
82
89
|
stmt.id = stmt.generate_key()
|
|
83
90
|
yield stmt
|
|
@@ -97,9 +104,9 @@ class StatementEntity(EntityProxy):
|
|
|
97
104
|
if stmt.first_seen is not None:
|
|
98
105
|
first_seen.add(stmt.first_seen)
|
|
99
106
|
if self.id is not None:
|
|
100
|
-
digest = sha1(self.schema.name.encode(
|
|
107
|
+
digest = sha1(self.schema.name.encode(HASH_ENCODING))
|
|
101
108
|
for id in sorted(ids):
|
|
102
|
-
digest.update(id.encode(
|
|
109
|
+
digest.update(id.encode(HASH_ENCODING))
|
|
103
110
|
checksum = digest.hexdigest()
|
|
104
111
|
# This is to make the last_change value stable across
|
|
105
112
|
# serialisation:
|
|
@@ -449,6 +456,23 @@ class StatementEntity(EntityProxy):
|
|
|
449
456
|
data["statements"] = [stmt.to_dict() for stmt in self.statements]
|
|
450
457
|
return data
|
|
451
458
|
|
|
459
|
+
def _checksum_digest(self) -> "_Hash":
|
|
460
|
+
"""Create a SHA1 digest of the entity's ID, schema and properties for
|
|
461
|
+
change detection. This is returned as a hashlib digest object so that
|
|
462
|
+
it can be subclassed."""
|
|
463
|
+
digest = sha1()
|
|
464
|
+
if self.id is not None:
|
|
465
|
+
digest.update(self.id.encode(HASH_ENCODING))
|
|
466
|
+
statement_ids: List[str] = []
|
|
467
|
+
for stmts in self._statements.values():
|
|
468
|
+
for stmt in stmts:
|
|
469
|
+
if stmt.id is not None:
|
|
470
|
+
statement_ids.append(stmt.id)
|
|
471
|
+
for stmt_id in sorted(statement_ids):
|
|
472
|
+
digest.update(stmt_id.encode(HASH_ENCODING))
|
|
473
|
+
digest.update(b"\x1e")
|
|
474
|
+
return digest
|
|
475
|
+
|
|
452
476
|
def __len__(self) -> int:
|
|
453
477
|
return len(list(self._iter_stmt())) + 1
|
|
454
478
|
|