followthemoney 4.3.4__py3-none-any.whl → 4.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- followthemoney/__init__.py +5 -4
- followthemoney/cli/statement.py +13 -7
- followthemoney/cli/util.py +3 -3
- followthemoney/compare.py +6 -19
- followthemoney/dataset/__init__.py +2 -2
- followthemoney/dataset/dataset.py +20 -0
- followthemoney/entity.py +14 -0
- followthemoney/mapping/csv.py +3 -1
- followthemoney/model.py +4 -5
- followthemoney/proxy.py +27 -3
- followthemoney/schema/Company.yaml +1 -0
- followthemoney/schema/CryptoWallet.yaml +4 -0
- followthemoney/schema/Image.yaml +7 -0
- followthemoney/schema/LegalEntity.yaml +7 -0
- followthemoney/schema/Organization.yaml +1 -0
- followthemoney/schema/Person.yaml +2 -1
- followthemoney/schema/PublicBody.yaml +1 -0
- followthemoney/settings.py +19 -0
- followthemoney/statement/entity.py +39 -10
- followthemoney/statement/serialize.py +23 -14
- followthemoney/statement/statement.py +151 -42
- followthemoney/statement/util.py +21 -0
- followthemoney/types/country.py +16 -1
- followthemoney/types/date.py +10 -0
- followthemoney/types/language.py +1 -1
- followthemoney/util.py +6 -14
- {followthemoney-4.3.4.dist-info → followthemoney-4.5.1.dist-info}/METADATA +3 -3
- {followthemoney-4.3.4.dist-info → followthemoney-4.5.1.dist-info}/RECORD +31 -30
- {followthemoney-4.3.4.dist-info → followthemoney-4.5.1.dist-info}/WHEEL +0 -0
- {followthemoney-4.3.4.dist-info → followthemoney-4.5.1.dist-info}/entry_points.txt +0 -0
- {followthemoney-4.3.4.dist-info → followthemoney-4.5.1.dist-info}/licenses/LICENSE +0 -0
followthemoney/__init__.py
CHANGED
|
@@ -2,14 +2,14 @@ from followthemoney.entity import ValueEntity, VE
|
|
|
2
2
|
from followthemoney.model import Model
|
|
3
3
|
from followthemoney.schema import Schema
|
|
4
4
|
from followthemoney.property import Property
|
|
5
|
-
from followthemoney.types import registry
|
|
5
|
+
from followthemoney.types import registry, PropertyType
|
|
6
6
|
from followthemoney.value import Value, Values
|
|
7
7
|
from followthemoney.proxy import EntityProxy, E
|
|
8
8
|
from followthemoney.statement import Statement, StatementEntity, SE
|
|
9
|
-
from followthemoney.dataset import Dataset,
|
|
9
|
+
from followthemoney.dataset import Dataset, UndefinedDataset, DS
|
|
10
10
|
from followthemoney.util import set_model_locale
|
|
11
11
|
|
|
12
|
-
__version__ = "4.
|
|
12
|
+
__version__ = "4.5.1"
|
|
13
13
|
|
|
14
14
|
# Data model singleton
|
|
15
15
|
model = Model.instance()
|
|
@@ -20,13 +20,14 @@ __all__ = [
|
|
|
20
20
|
"Model",
|
|
21
21
|
"Schema",
|
|
22
22
|
"Property",
|
|
23
|
+
"PropertyType",
|
|
23
24
|
"Value",
|
|
24
25
|
"Values",
|
|
25
26
|
"EntityProxy",
|
|
26
27
|
"E",
|
|
27
28
|
"registry",
|
|
28
29
|
"Dataset",
|
|
29
|
-
"
|
|
30
|
+
"UndefinedDataset",
|
|
30
31
|
"DS",
|
|
31
32
|
"Statement",
|
|
32
33
|
"StatementEntity",
|
followthemoney/cli/statement.py
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
import click
|
|
2
2
|
from pathlib import Path
|
|
3
|
-
from typing import Generator, List
|
|
3
|
+
from typing import Generator, List, Optional
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
from followthemoney.cli.cli import cli
|
|
7
7
|
from followthemoney.cli.util import InPath, OutPath
|
|
8
8
|
from followthemoney.cli.util import path_entities, write_entity, path_writer
|
|
9
|
-
from followthemoney.dataset import Dataset,
|
|
9
|
+
from followthemoney.dataset import Dataset, UndefinedDataset
|
|
10
10
|
from followthemoney.statement import Statement, StatementEntity
|
|
11
11
|
from followthemoney.statement import FORMATS, CSV
|
|
12
12
|
from followthemoney.statement import write_statements
|
|
@@ -16,12 +16,18 @@ from followthemoney.statement import read_path_statements
|
|
|
16
16
|
@cli.command("statements", help="Export entities to statements")
|
|
17
17
|
@click.argument("path", type=InPath)
|
|
18
18
|
@click.option("-o", "--outpath", type=OutPath, default="-")
|
|
19
|
-
@click.option("-d", "--dataset", type=str
|
|
19
|
+
@click.option("-d", "--dataset", type=str)
|
|
20
20
|
@click.option("-f", "--format", type=click.Choice(FORMATS), default=CSV)
|
|
21
|
-
def entity_statements(
|
|
21
|
+
def entity_statements(
|
|
22
|
+
path: Path, outpath: Path, dataset: Optional[str], format: str
|
|
23
|
+
) -> None:
|
|
22
24
|
def make_statements() -> Generator[Statement, None, None]:
|
|
25
|
+
dataset_ = dataset or Dataset.UNDEFINED
|
|
23
26
|
for entity in path_entities(path, StatementEntity):
|
|
24
|
-
|
|
27
|
+
for stmt in Statement.from_entity(entity, dataset=dataset_):
|
|
28
|
+
if dataset is not None:
|
|
29
|
+
stmt = stmt.clone(dataset=dataset)
|
|
30
|
+
yield stmt
|
|
25
31
|
|
|
26
32
|
with path_writer(outpath) as outfh:
|
|
27
33
|
write_statements(outfh, format, make_statements())
|
|
@@ -43,12 +49,12 @@ def format_statements(
|
|
|
43
49
|
@cli.command("aggregate-statements", help="Roll up statements into entities")
|
|
44
50
|
@click.option("-i", "--infile", type=InPath, default="-")
|
|
45
51
|
@click.option("-o", "--outpath", type=OutPath, default="-")
|
|
46
|
-
@click.option("-d", "--dataset", type=str, default=
|
|
52
|
+
@click.option("-d", "--dataset", type=str, default=UndefinedDataset.name)
|
|
47
53
|
@click.option("-f", "--format", type=click.Choice(FORMATS), default=CSV)
|
|
48
54
|
def statements_aggregate(
|
|
49
55
|
infile: Path, outpath: Path, dataset: str, format: str
|
|
50
56
|
) -> None:
|
|
51
|
-
dataset_ = Dataset.make({"name": dataset
|
|
57
|
+
dataset_ = Dataset.make({"name": dataset})
|
|
52
58
|
with path_writer(outpath) as outfh:
|
|
53
59
|
statements: List[Statement] = []
|
|
54
60
|
for stmt in read_path_statements(infile, format=format):
|
followthemoney/cli/util.py
CHANGED
|
@@ -6,7 +6,7 @@ import click
|
|
|
6
6
|
import orjson
|
|
7
7
|
from pathlib import Path
|
|
8
8
|
from warnings import warn
|
|
9
|
-
from typing import Any, BinaryIO, Generator, Optional, TextIO, Type
|
|
9
|
+
from typing import Any, BinaryIO, Generator, List, Optional, TextIO, Type
|
|
10
10
|
from banal import is_mapping, is_listish, ensure_list
|
|
11
11
|
|
|
12
12
|
from followthemoney.export.common import Exporter
|
|
@@ -26,7 +26,7 @@ def write_object(stream: TextIO, obj: Any) -> None:
|
|
|
26
26
|
stream.write(data + "\n")
|
|
27
27
|
|
|
28
28
|
|
|
29
|
-
def write_entity(fh: BinaryIO, entity:
|
|
29
|
+
def write_entity(fh: BinaryIO, entity: EntityProxy) -> None:
|
|
30
30
|
data = entity.to_dict()
|
|
31
31
|
entity_id = data.pop("id")
|
|
32
32
|
assert entity_id is not None, data
|
|
@@ -131,7 +131,7 @@ def resolve_includes(file_path: PathLike, data: Any) -> Any:
|
|
|
131
131
|
if is_listish(data):
|
|
132
132
|
return [resolve_includes(file_path, i) for i in data]
|
|
133
133
|
if is_mapping(data):
|
|
134
|
-
include_paths = ensure_list(data.pop("include", []))
|
|
134
|
+
include_paths: List[str] = ensure_list(data.pop("include", []))
|
|
135
135
|
for include_path in include_paths:
|
|
136
136
|
dir_prefix = os.path.dirname(file_path)
|
|
137
137
|
include_path = os.path.join(dir_prefix, include_path)
|
followthemoney/compare.py
CHANGED
|
@@ -71,31 +71,18 @@ def _compare(scores: Scores, weights: Weights, n_std: int = 1) -> float:
|
|
|
71
71
|
return 1.0 / (1.0 + math.exp(-prob))
|
|
72
72
|
|
|
73
73
|
|
|
74
|
-
def entity_is_same(left: EntityProxy, right: EntityProxy) -> bool:
|
|
75
|
-
"""Check if two entities are the same apart from their ID."""
|
|
76
|
-
if left.schema != right.schema:
|
|
77
|
-
return False
|
|
78
|
-
|
|
79
|
-
props = set(left.properties.keys()).union(right.properties.keys())
|
|
80
|
-
if 0 == len(props):
|
|
81
|
-
return False
|
|
82
|
-
|
|
83
|
-
for prop in props:
|
|
84
|
-
left_vals = sorted(left.get(prop))
|
|
85
|
-
right_vals = sorted(right.get(prop))
|
|
86
|
-
if left_vals != right_vals:
|
|
87
|
-
return False
|
|
88
|
-
return True
|
|
89
|
-
|
|
90
|
-
|
|
91
74
|
def compare(
|
|
92
75
|
left: EntityProxy,
|
|
93
76
|
right: EntityProxy,
|
|
94
77
|
weights: Weights = COMPARE_WEIGHTS,
|
|
95
78
|
) -> float:
|
|
96
79
|
"""Compare two entities and return a match score."""
|
|
97
|
-
if
|
|
98
|
-
|
|
80
|
+
if left.checksum == right.checksum:
|
|
81
|
+
# Check if there is any data at all (ie any basis for making a decision),
|
|
82
|
+
# if so, return a perfect match. This avoids marking two empty entities
|
|
83
|
+
# as matching. Bit ambiguous, but practical.
|
|
84
|
+
if len(left.properties) > 0 and len(right.properties) > 0:
|
|
85
|
+
return 1.0
|
|
99
86
|
scores = compare_scores(left, right)
|
|
100
87
|
return _compare(scores, weights)
|
|
101
88
|
|
|
@@ -4,11 +4,11 @@ from followthemoney.dataset.resource import DataResource
|
|
|
4
4
|
from followthemoney.dataset.publisher import DataPublisher
|
|
5
5
|
from followthemoney.dataset.coverage import DataCoverage
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
UndefinedDataset = Dataset.make({"name": Dataset.UNDEFINED})
|
|
8
8
|
|
|
9
9
|
__all__ = [
|
|
10
10
|
"Dataset",
|
|
11
|
-
"
|
|
11
|
+
"UndefinedDataset",
|
|
12
12
|
"DataCatalog",
|
|
13
13
|
"DataResource",
|
|
14
14
|
"DataPublisher",
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
from pathlib import Path
|
|
1
2
|
import yaml
|
|
2
3
|
import logging
|
|
3
4
|
from functools import cached_property
|
|
@@ -38,6 +39,8 @@ class DatasetModel(BaseModel):
|
|
|
38
39
|
coverage: DataCoverage | None = None
|
|
39
40
|
resources: List[DataResource] = []
|
|
40
41
|
children: Set[str] = set()
|
|
42
|
+
deprecation: Optional[str] = None
|
|
43
|
+
deprecated: bool = False
|
|
41
44
|
|
|
42
45
|
@field_validator("name", mode="after")
|
|
43
46
|
@classmethod
|
|
@@ -57,6 +60,18 @@ class DatasetModel(BaseModel):
|
|
|
57
60
|
data["children"] = children
|
|
58
61
|
return data
|
|
59
62
|
|
|
63
|
+
@model_validator(mode="after")
|
|
64
|
+
def evaluate_data(self) -> "DatasetModel":
|
|
65
|
+
# derive deprecated from deprecation notice:
|
|
66
|
+
if self.deprecation is not None:
|
|
67
|
+
self.deprecation = self.deprecation.strip()
|
|
68
|
+
if not len(self.deprecation):
|
|
69
|
+
self.deprecation = None
|
|
70
|
+
self.deprecated = self.deprecation is not None or self.deprecated
|
|
71
|
+
if self.deprecated and (self.coverage is None or self.coverage.end is None):
|
|
72
|
+
raise ValueError("Deprecated dataset coverage must have an end date.")
|
|
73
|
+
return self
|
|
74
|
+
|
|
60
75
|
def get_resource(self, name: str) -> DataResource:
|
|
61
76
|
for res in self.resources:
|
|
62
77
|
if res.name == name:
|
|
@@ -68,6 +83,8 @@ class Dataset:
|
|
|
68
83
|
"""A container for entities, often from one source or related to one topic.
|
|
69
84
|
A dataset is a set of data, sez W3C."""
|
|
70
85
|
|
|
86
|
+
UNDEFINED = "undefined"
|
|
87
|
+
|
|
71
88
|
def __init__(self: Self, data: Dict[str, Any]) -> None:
|
|
72
89
|
self.model = DatasetModel.model_validate(data)
|
|
73
90
|
self.name = self.model.name
|
|
@@ -121,10 +138,13 @@ class Dataset:
|
|
|
121
138
|
) -> DS:
|
|
122
139
|
from followthemoney.dataset.catalog import DataCatalog
|
|
123
140
|
|
|
141
|
+
path = Path(path)
|
|
124
142
|
with open(path, "r") as fh:
|
|
125
143
|
data = yaml.safe_load(fh)
|
|
126
144
|
if catalog is None:
|
|
127
145
|
catalog = DataCatalog(cls, {})
|
|
146
|
+
if "name" not in data:
|
|
147
|
+
data["name"] = path.stem
|
|
128
148
|
return catalog.make_dataset(data)
|
|
129
149
|
|
|
130
150
|
@classmethod
|
followthemoney/entity.py
CHANGED
|
@@ -5,6 +5,7 @@ from rigour.names import pick_name
|
|
|
5
5
|
from followthemoney.proxy import EntityProxy
|
|
6
6
|
from followthemoney.schema import Schema
|
|
7
7
|
from followthemoney.statement import BASE_ID, Statement
|
|
8
|
+
from followthemoney.util import HASH_ENCODING
|
|
8
9
|
|
|
9
10
|
VE = TypeVar("VE", bound="ValueEntity")
|
|
10
11
|
|
|
@@ -81,6 +82,19 @@ class ValueEntity(EntityProxy):
|
|
|
81
82
|
merged.last_change = max(changed, default=None)
|
|
82
83
|
return merged
|
|
83
84
|
|
|
85
|
+
@property
|
|
86
|
+
def checksum(self) -> str:
|
|
87
|
+
digest = self._checksum_digest()
|
|
88
|
+
for dataset in sorted(self.datasets):
|
|
89
|
+
digest.update(dataset.encode(HASH_ENCODING))
|
|
90
|
+
digest.update(b"\x1e")
|
|
91
|
+
for referent in sorted(self.referents):
|
|
92
|
+
digest.update(referent.encode(HASH_ENCODING))
|
|
93
|
+
digest.update(b"\x1e")
|
|
94
|
+
if self.last_change is not None:
|
|
95
|
+
digest.update(self.last_change.encode(HASH_ENCODING))
|
|
96
|
+
return digest.hexdigest()
|
|
97
|
+
|
|
84
98
|
def to_dict(self) -> Dict[str, Any]:
|
|
85
99
|
data = super().to_dict()
|
|
86
100
|
data["referents"] = list(self.referents)
|
followthemoney/mapping/csv.py
CHANGED
|
@@ -9,6 +9,7 @@ from typing import TYPE_CHECKING, cast
|
|
|
9
9
|
from typing import Any, Dict, Generator, ItemsView, Iterable, List, Optional, Set, Tuple
|
|
10
10
|
|
|
11
11
|
from followthemoney.mapping.source import Record, Source
|
|
12
|
+
from followthemoney.settings import USER_AGENT
|
|
12
13
|
from followthemoney.util import sanitize_text
|
|
13
14
|
from followthemoney.exc import InvalidMapping
|
|
14
15
|
|
|
@@ -64,7 +65,8 @@ class CSVSource(Source):
|
|
|
64
65
|
parsed_url = urlparse(url)
|
|
65
66
|
log.info("Loading: %s", url)
|
|
66
67
|
if parsed_url.scheme in ["http", "https"]:
|
|
67
|
-
|
|
68
|
+
headers = {"User-Agent": USER_AGENT}
|
|
69
|
+
res = requests.get(url, stream=True, headers=headers)
|
|
68
70
|
if not res.ok:
|
|
69
71
|
raise InvalidMapping("Failed to open CSV: %s" % url)
|
|
70
72
|
# if res.encoding is None:
|
followthemoney/model.py
CHANGED
|
@@ -3,12 +3,14 @@ import yaml
|
|
|
3
3
|
from functools import cache
|
|
4
4
|
from typing import TYPE_CHECKING, Any
|
|
5
5
|
from typing import Dict, Generator, Iterator, Optional, Set, TypedDict, Union
|
|
6
|
+
from rigour.env import ENCODING
|
|
6
7
|
|
|
7
8
|
from followthemoney.types import registry
|
|
8
9
|
from followthemoney.types.common import PropertyType, PropertyTypeToDict
|
|
9
10
|
from followthemoney.schema import Schema, SchemaToDict
|
|
10
11
|
from followthemoney.property import Property
|
|
11
12
|
from followthemoney.exc import InvalidModel, InvalidData
|
|
13
|
+
from followthemoney.settings import MODEL_PATH
|
|
12
14
|
from followthemoney.util import const
|
|
13
15
|
|
|
14
16
|
if TYPE_CHECKING:
|
|
@@ -47,10 +49,7 @@ class Model(object):
|
|
|
47
49
|
@classmethod
|
|
48
50
|
def instance(cls) -> "Model":
|
|
49
51
|
if cls._instance is None:
|
|
50
|
-
|
|
51
|
-
model_path = os.path.join(model_path, "schema")
|
|
52
|
-
model_path = os.environ.get("FTM_MODEL_PATH", model_path)
|
|
53
|
-
cls._instance = cls(model_path)
|
|
52
|
+
cls._instance = cls(MODEL_PATH)
|
|
54
53
|
return cls._instance
|
|
55
54
|
|
|
56
55
|
def generate(self) -> None:
|
|
@@ -68,7 +67,7 @@ class Model(object):
|
|
|
68
67
|
schema.properties[prop.name] = prop
|
|
69
68
|
|
|
70
69
|
def _load(self, filepath: str) -> None:
|
|
71
|
-
with open(filepath, "r", encoding=
|
|
70
|
+
with open(filepath, "r", encoding=ENCODING) as fh:
|
|
72
71
|
data = yaml.safe_load(fh)
|
|
73
72
|
if not isinstance(data, dict):
|
|
74
73
|
raise InvalidModel("Model file is not a mapping: %s" % filepath)
|
followthemoney/proxy.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import hashlib
|
|
1
2
|
import logging
|
|
2
3
|
from typing import TYPE_CHECKING, cast, Any
|
|
3
4
|
from typing import Dict, Generator, List, Optional, Set, Tuple, Union, Type, TypeVar
|
|
@@ -10,13 +11,14 @@ from followthemoney.types import registry
|
|
|
10
11
|
from followthemoney.types.common import PropertyType
|
|
11
12
|
from followthemoney.property import Property
|
|
12
13
|
from followthemoney.value import string_list, Values
|
|
13
|
-
from followthemoney.util import sanitize_text, gettext
|
|
14
|
+
from followthemoney.util import HASH_ENCODING, sanitize_text, gettext
|
|
14
15
|
from followthemoney.util import merge_context, make_entity_id
|
|
15
16
|
from followthemoney.model import Model
|
|
16
17
|
from followthemoney.schema import Schema
|
|
17
18
|
|
|
18
19
|
if TYPE_CHECKING:
|
|
19
20
|
from followthemoney.model import Model
|
|
21
|
+
from hashlib import _Hash
|
|
20
22
|
|
|
21
23
|
log = logging.getLogger(__name__)
|
|
22
24
|
P = Union[Property, str]
|
|
@@ -437,6 +439,28 @@ class EntityProxy(object):
|
|
|
437
439
|
self.add(prop, values, cleaned=True, quiet=True)
|
|
438
440
|
return self
|
|
439
441
|
|
|
442
|
+
def _checksum_digest(self) -> "_Hash":
|
|
443
|
+
"""Create a SHA1 digest of the entity's ID, schema and properties for
|
|
444
|
+
change detection. This is returned as a hashlib digest object so that
|
|
445
|
+
it can be subclassed."""
|
|
446
|
+
digest = hashlib.sha1()
|
|
447
|
+
if self.id is not None:
|
|
448
|
+
digest.update(self.id.encode(HASH_ENCODING))
|
|
449
|
+
digest.update(self.schema.name.encode(HASH_ENCODING))
|
|
450
|
+
for prop in sorted(self._properties.keys()):
|
|
451
|
+
digest.update(prop.encode(HASH_ENCODING))
|
|
452
|
+
for value in sorted(self._properties[prop]):
|
|
453
|
+
digest.update(value.encode(HASH_ENCODING))
|
|
454
|
+
digest.update(b"\x1e")
|
|
455
|
+
digest.update(b"\x1f")
|
|
456
|
+
return digest
|
|
457
|
+
|
|
458
|
+
@property
|
|
459
|
+
def checksum(self) -> str:
|
|
460
|
+
"""A SHA1 checksum hexdigest representing the current state of the
|
|
461
|
+
entity proxy. This can be used for change detection."""
|
|
462
|
+
return self._checksum_digest().hexdigest()
|
|
463
|
+
|
|
440
464
|
def __getstate__(self) -> Dict[str, Any]:
|
|
441
465
|
data = {slot: getattr(self, slot) for slot in self.__slots__}
|
|
442
466
|
data["schema"] = self.schema.name
|
|
@@ -460,13 +484,13 @@ class EntityProxy(object):
|
|
|
460
484
|
|
|
461
485
|
def __hash__(self) -> int:
|
|
462
486
|
if self.id is None:
|
|
463
|
-
raise RuntimeError("
|
|
487
|
+
raise RuntimeError("Unhashable entity proxy without ID.")
|
|
464
488
|
return hash(self.id)
|
|
465
489
|
|
|
466
490
|
def __eq__(self, other: Any) -> bool:
|
|
467
491
|
try:
|
|
468
492
|
if self.id is None or other.id is None:
|
|
469
|
-
raise RuntimeError("Cannot compare
|
|
493
|
+
raise RuntimeError("Cannot compare entity proxies without IDs.")
|
|
470
494
|
return bool(self.id == other.id)
|
|
471
495
|
except AttributeError:
|
|
472
496
|
return False
|
followthemoney/schema/Image.yaml
CHANGED
|
@@ -1,4 +1,7 @@
|
|
|
1
1
|
Image:
|
|
2
|
+
# This schema defines an image file entity within the FollowTheMoney data model.
|
|
3
|
+
# If a `checksum` property is present, consider loading it from an Aleph archive
|
|
4
|
+
# or FtM data lake. Otherwise, use `sourceUrl` to fetch the image directly.
|
|
2
5
|
extends:
|
|
3
6
|
- Document
|
|
4
7
|
label: Image
|
|
@@ -23,3 +26,7 @@ Image:
|
|
|
23
26
|
label: "Images"
|
|
24
27
|
type: entity
|
|
25
28
|
range: Person
|
|
29
|
+
credit:
|
|
30
|
+
label: "Credit"
|
|
31
|
+
description: "The credit or attribution for the image."
|
|
32
|
+
type: string
|
|
@@ -18,6 +18,7 @@ LegalEntity:
|
|
|
18
18
|
caption:
|
|
19
19
|
- name
|
|
20
20
|
- alias
|
|
21
|
+
- abbreviation
|
|
21
22
|
- weakAlias
|
|
22
23
|
- previousName
|
|
23
24
|
- email
|
|
@@ -29,6 +30,12 @@ LegalEntity:
|
|
|
29
30
|
end:
|
|
30
31
|
- dissolutionDate
|
|
31
32
|
properties:
|
|
33
|
+
abbreviation:
|
|
34
|
+
label: Abbreviation
|
|
35
|
+
type: name
|
|
36
|
+
description: "Abbreviated name or acronym"
|
|
37
|
+
# TODO: is un-matchable wise? The idea is to handle it like `weakAlias` rather than `alias`.
|
|
38
|
+
matchable: false
|
|
32
39
|
email:
|
|
33
40
|
label: E-Mail
|
|
34
41
|
type: email
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import requests
|
|
3
|
+
from typing import List
|
|
4
|
+
from rigour.env import env_opt, env_str
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def get_env_list(name: str, default: List[str] = []) -> List[str]:
|
|
8
|
+
value = env_opt(name)
|
|
9
|
+
if value is not None:
|
|
10
|
+
values = value.split(":")
|
|
11
|
+
if len(values):
|
|
12
|
+
return values
|
|
13
|
+
return default
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
MODEL_PATH = os.path.join(os.path.dirname(__file__), "schema")
|
|
17
|
+
MODEL_PATH = env_str("FTM_MODEL_PATH", MODEL_PATH)
|
|
18
|
+
|
|
19
|
+
USER_AGENT = env_str("FTM_USER_AGENT", requests.utils.default_user_agent())
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from hashlib import sha1
|
|
2
2
|
from collections.abc import Mapping
|
|
3
|
-
from typing import Any, Dict, List, Optional, Set, Type
|
|
3
|
+
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Type
|
|
4
4
|
from typing import Generator, Iterable, Tuple, TypeVar
|
|
5
5
|
from rigour.langs import LangStr
|
|
6
6
|
from rigour.names.pick import pick_lang_name
|
|
@@ -10,17 +10,20 @@ from followthemoney.exc import InvalidData
|
|
|
10
10
|
from followthemoney.schema import Schema
|
|
11
11
|
from followthemoney.types.common import PropertyType
|
|
12
12
|
from followthemoney.property import Property
|
|
13
|
-
from followthemoney.util import gettext
|
|
13
|
+
from followthemoney.util import HASH_ENCODING, gettext
|
|
14
14
|
from followthemoney.proxy import P
|
|
15
15
|
from followthemoney.types import registry
|
|
16
16
|
from followthemoney.value import string_list, Values
|
|
17
17
|
from followthemoney.proxy import EntityProxy
|
|
18
|
-
from followthemoney.dataset import Dataset,
|
|
18
|
+
from followthemoney.dataset import Dataset, UndefinedDataset
|
|
19
19
|
from followthemoney.statement.statement import Statement
|
|
20
20
|
from followthemoney.statement.util import BASE_ID
|
|
21
21
|
|
|
22
22
|
SE = TypeVar("SE", bound="StatementEntity")
|
|
23
23
|
|
|
24
|
+
if TYPE_CHECKING:
|
|
25
|
+
from hashlib import _Hash
|
|
26
|
+
|
|
24
27
|
|
|
25
28
|
class StatementEntity(EntityProxy):
|
|
26
29
|
"""An entity object that can link to a set of datasets that it is sourced from."""
|
|
@@ -35,7 +38,12 @@ class StatementEntity(EntityProxy):
|
|
|
35
38
|
"_statements",
|
|
36
39
|
)
|
|
37
40
|
|
|
38
|
-
def __init__(
|
|
41
|
+
def __init__(
|
|
42
|
+
self,
|
|
43
|
+
dataset: Dataset,
|
|
44
|
+
data: Dict[str, Any],
|
|
45
|
+
cleaned: bool = True,
|
|
46
|
+
) -> None:
|
|
39
47
|
data = dict(data or {})
|
|
40
48
|
schema = Model.instance().get(data.pop("schema", None))
|
|
41
49
|
if schema is None:
|
|
@@ -76,8 +84,7 @@ class StatementEntity(EntityProxy):
|
|
|
76
84
|
for stmts in self._statements.values():
|
|
77
85
|
for stmt in stmts:
|
|
78
86
|
if stmt.entity_id is None and self.id is not None:
|
|
79
|
-
stmt.entity_id
|
|
80
|
-
stmt.id = stmt.generate_key()
|
|
87
|
+
stmt = stmt.clone(entity_id=self.id)
|
|
81
88
|
if stmt.id is None:
|
|
82
89
|
stmt.id = stmt.generate_key()
|
|
83
90
|
yield stmt
|
|
@@ -97,9 +104,9 @@ class StatementEntity(EntityProxy):
|
|
|
97
104
|
if stmt.first_seen is not None:
|
|
98
105
|
first_seen.add(stmt.first_seen)
|
|
99
106
|
if self.id is not None:
|
|
100
|
-
digest = sha1(self.schema.name.encode(
|
|
107
|
+
digest = sha1(self.schema.name.encode(HASH_ENCODING))
|
|
101
108
|
for id in sorted(ids):
|
|
102
|
-
digest.update(id.encode(
|
|
109
|
+
digest.update(id.encode(HASH_ENCODING))
|
|
103
110
|
checksum = digest.hexdigest()
|
|
104
111
|
# This is to make the last_change value stable across
|
|
105
112
|
# serialisation:
|
|
@@ -183,6 +190,11 @@ class StatementEntity(EntityProxy):
|
|
|
183
190
|
return []
|
|
184
191
|
return list(self._statements[prop_name])
|
|
185
192
|
|
|
193
|
+
@property
|
|
194
|
+
def has_statements(self) -> bool:
|
|
195
|
+
"""Return whether the entity has any statements."""
|
|
196
|
+
return len(self._statements) > 0
|
|
197
|
+
|
|
186
198
|
def set(
|
|
187
199
|
self,
|
|
188
200
|
prop: P,
|
|
@@ -426,7 +438,7 @@ class StatementEntity(EntityProxy):
|
|
|
426
438
|
origins.add(stmt.origin)
|
|
427
439
|
|
|
428
440
|
data["referents"] = list(referents)
|
|
429
|
-
data["datasets"] =
|
|
441
|
+
data["datasets"] = [d for d in datasets if d != Dataset.UNDEFINED]
|
|
430
442
|
if origins:
|
|
431
443
|
data["origin"] = list(origins)
|
|
432
444
|
|
|
@@ -449,6 +461,23 @@ class StatementEntity(EntityProxy):
|
|
|
449
461
|
data["statements"] = [stmt.to_dict() for stmt in self.statements]
|
|
450
462
|
return data
|
|
451
463
|
|
|
464
|
+
def _checksum_digest(self) -> "_Hash":
|
|
465
|
+
"""Create a SHA1 digest of the entity's ID, schema and properties for
|
|
466
|
+
change detection. This is returned as a hashlib digest object so that
|
|
467
|
+
it can be subclassed."""
|
|
468
|
+
digest = sha1()
|
|
469
|
+
if self.id is not None:
|
|
470
|
+
digest.update(self.id.encode(HASH_ENCODING))
|
|
471
|
+
statement_ids: List[str] = []
|
|
472
|
+
for stmts in self._statements.values():
|
|
473
|
+
for stmt in stmts:
|
|
474
|
+
if stmt.id is not None:
|
|
475
|
+
statement_ids.append(stmt.id)
|
|
476
|
+
for stmt_id in sorted(statement_ids):
|
|
477
|
+
digest.update(stmt_id.encode(HASH_ENCODING))
|
|
478
|
+
digest.update(b"\x1e")
|
|
479
|
+
return digest
|
|
480
|
+
|
|
452
481
|
def __len__(self) -> int:
|
|
453
482
|
return len(list(self._iter_stmt())) + 1
|
|
454
483
|
|
|
@@ -460,7 +489,7 @@ class StatementEntity(EntityProxy):
|
|
|
460
489
|
default_dataset: Optional[Dataset] = None,
|
|
461
490
|
) -> SE:
|
|
462
491
|
# Exists only for backwards compatibility.
|
|
463
|
-
dataset = default_dataset or
|
|
492
|
+
dataset = default_dataset or UndefinedDataset
|
|
464
493
|
return cls(dataset, data, cleaned=cleaned)
|
|
465
494
|
|
|
466
495
|
@classmethod
|