followthemoney 3.8.4 (py3-none-any.whl) → 4.0.0 (py3-none-any.whl)
This diff compares the contents of two package versions that were publicly released to a supported registry, as they appear in that registry. It is provided for informational purposes only.
- followthemoney/__init__.py +30 -10
- followthemoney/cli/__init__.py +3 -12
- followthemoney/cli/aggregate.py +1 -1
- followthemoney/cli/cli.py +1 -1
- followthemoney/cli/exports.py +6 -2
- followthemoney/cli/mapping.py +6 -4
- followthemoney/cli/sieve.py +1 -1
- followthemoney/cli/statement.py +62 -0
- followthemoney/cli/util.py +2 -3
- followthemoney/compare.py +26 -16
- followthemoney/dataset/__init__.py +17 -0
- followthemoney/dataset/catalog.py +77 -0
- followthemoney/dataset/coverage.py +29 -0
- followthemoney/dataset/dataset.py +137 -0
- followthemoney/dataset/publisher.py +25 -0
- followthemoney/dataset/resource.py +30 -0
- followthemoney/dataset/util.py +58 -0
- followthemoney/entity.py +73 -0
- followthemoney/exc.py +6 -0
- followthemoney/export/common.py +3 -3
- followthemoney/export/csv.py +10 -12
- followthemoney/export/neo4j.py +1 -1
- followthemoney/export/rdf.py +57 -5
- followthemoney/graph.py +6 -4
- followthemoney/mapping/csv.py +6 -18
- followthemoney/mapping/sql.py +3 -4
- followthemoney/model.py +36 -9
- followthemoney/namespace.py +3 -1
- followthemoney/ontology.py +18 -16
- followthemoney/property.py +12 -15
- followthemoney/proxy.py +44 -65
- followthemoney/schema/Analyzable.yaml +2 -3
- followthemoney/schema/BankAccount.yaml +2 -3
- followthemoney/schema/Company.yaml +0 -6
- followthemoney/schema/Contract.yaml +0 -1
- followthemoney/schema/CryptoWallet.yaml +1 -1
- followthemoney/schema/Document.yaml +0 -6
- followthemoney/schema/Interval.yaml +7 -0
- followthemoney/schema/LegalEntity.yaml +6 -0
- followthemoney/schema/License.yaml +2 -0
- followthemoney/schema/Page.yaml +0 -1
- followthemoney/schema/Person.yaml +0 -5
- followthemoney/schema/Sanction.yaml +1 -0
- followthemoney/schema/Thing.yaml +0 -2
- followthemoney/schema/UserAccount.yaml +6 -3
- followthemoney/schema.py +27 -39
- followthemoney/statement/__init__.py +19 -0
- followthemoney/statement/entity.py +437 -0
- followthemoney/statement/serialize.py +245 -0
- followthemoney/statement/statement.py +256 -0
- followthemoney/statement/util.py +31 -0
- followthemoney/types/__init__.py +66 -23
- followthemoney/types/address.py +3 -3
- followthemoney/types/checksum.py +3 -7
- followthemoney/types/common.py +9 -14
- followthemoney/types/country.py +3 -7
- followthemoney/types/date.py +21 -11
- followthemoney/types/email.py +0 -4
- followthemoney/types/entity.py +5 -11
- followthemoney/types/gender.py +6 -10
- followthemoney/types/identifier.py +9 -3
- followthemoney/types/ip.py +5 -9
- followthemoney/types/json.py +2 -2
- followthemoney/types/language.py +3 -7
- followthemoney/types/mimetype.py +4 -8
- followthemoney/types/name.py +7 -8
- followthemoney/types/number.py +88 -6
- followthemoney/types/phone.py +4 -11
- followthemoney/types/string.py +4 -4
- followthemoney/types/topic.py +3 -7
- followthemoney/types/url.py +5 -10
- followthemoney/util.py +12 -13
- followthemoney/value.py +67 -0
- {followthemoney-3.8.4.dist-info → followthemoney-4.0.0.dist-info}/METADATA +38 -34
- {followthemoney-3.8.4.dist-info → followthemoney-4.0.0.dist-info}/RECORD +78 -69
- {followthemoney-3.8.4.dist-info → followthemoney-4.0.0.dist-info}/entry_points.txt +1 -0
- {followthemoney-3.8.4.dist-info → followthemoney-4.0.0.dist-info}/licenses/LICENSE +1 -0
- followthemoney/offshore.py +0 -48
- followthemoney/rdf.py +0 -9
- followthemoney/schema/Assessment.yaml +0 -32
- followthemoney/schema/Post.yaml +0 -42
- followthemoney/types/iban.py +0 -58
- followthemoney/types/registry.py +0 -52
- {followthemoney-3.8.4.dist-info → followthemoney-4.0.0.dist-info}/WHEEL +0 -0
followthemoney/schema.py
CHANGED
@@ -1,22 +1,12 @@
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    Dict,
-    List,
-    Optional,
-    Set,
-    TypedDict,
-    Union,
-    cast,
-)
+from typing import TYPE_CHECKING, Any, cast
+from typing import Dict, List, Optional, Set, TypedDict, Union
 from banal import ensure_list, ensure_dict, as_bool
 from functools import lru_cache
 
 from followthemoney.property import Property, PropertySpec, PropertyToDict, ReverseSpec
 from followthemoney.types import registry
 from followthemoney.exc import InvalidData, InvalidModel
-from followthemoney.
-from followthemoney.util import gettext
+from followthemoney.util import gettext, const
 
 if TYPE_CHECKING:
     from followthemoney.model import Model
@@ -47,7 +37,6 @@ class SchemaSpec(TypedDict, total=False):
     edge: EdgeSpec
     temporalExtent: TemporalExtentSpec
     description: Optional[str]
-    rdf: Optional[str]
     abstract: bool
     hidden: bool
     generated: bool
@@ -90,7 +79,6 @@ class Schema:
         "_plural",
         "_description",
         "_hash",
-        "uri",
         "abstract",
         "hidden",
         "generated",
@@ -118,15 +106,12 @@ class Schema:
 
     def __init__(self, model: "Model", name: str, data: SchemaSpec) -> None:
         #: Machine-readable name of the schema, used for identification.
-        self.name = name
+        self.name = const(name)
         self.model = model
         self._label = data.get("label", name)
         self._plural = data.get("plural", self.label)
         self._description = data.get("description")
-        self._hash = hash("<Schema(%r)>" % name)
-
-        #: RDF identifier for this schema when it is transformed to a triple term.
-        self.uri = URIRef(cast(str, data.get("rdf", NS[name])))
+        self._hash = hash("<Schema(%r)>" % self.name)
 
         #: Do not store or emit entities of this type, it is used only for
         #: inheritance.
@@ -152,17 +137,17 @@ class Schema:
         #: Mark a set of properties as important, i.e. they should be shown
         #: first, or in an abridged view of the entity. In Aleph, these properties
         #: are included in tabular entity listings.
-        self.featured =
+        self.featured = [const(f) for f in data.get("featured", [])]
 
         #: Mark a set of properties as required. This is applied only when
         #: an entity is created by the user - bulk created entities will
         #: slip through even if it is technically invalid.
-        self.required =
+        self.required = [const(r) for r in data.get("required", [])]
 
         #: Mark a set of properties to be used for the entity's caption.
         #: They will be checked in order and the first existent value will
         #: be used.
-        self.caption =
+        self.caption = [const(s) for s in data.get("caption", [])]
 
         # A transform of the entity into an edge for its representation in
         # the context of a property graph representation like Neo4J/Gephi.
@@ -173,7 +158,7 @@ class Schema:
         #: Flag to indicate if this schema should be represented by an edge (rather than
         #: a node) when the data is converted into a property graph.
         self.edge: bool = self.edge_source is not None and self.edge_target is not None
-        self.edge_caption =
+        self.edge_caption = [const(p) for p in edge.get("caption", [])]
         self._edge_label = edge.get("label", self._label)
 
         #: Flag to indicate if the edge should be presented as directed to the user,
@@ -183,16 +168,16 @@ class Schema:
         #: Specify which properties should be used to represent this schema in a
         #: timeline.
         temporal_extent = data.get("temporalExtent", {})
-        self._temporal_start =
-        self._temporal_end =
+        self._temporal_start = [const(s) for s in temporal_extent.get("start", [])]
+        self._temporal_end = [const(e) for e in temporal_extent.get("end", [])]
 
         #: Direct parent schemata of this schema.
-        self._extends =
+        self._extends = [const(s) for s in data.get("extends", [])]
         self.extends: Set["Schema"] = set()
 
         #: All parents of this schema (including indirect parents and the schema
         #: itself).
-        self.schemata = set([self])
+        self.schemata: Set[Schema] = set([self])
 
         #: All names of :attr:`~schemata`.
         self.names = set([self.name])
@@ -205,8 +190,8 @@ class Schema:
         #: The full list of properties defined for the entity, including those
         #: inherited from parent schemata.
         self.properties: Dict[str, Property] = {}
-        for
-        self.properties[
+        for pname, prop in data.get("properties", {}).items():
+            self.properties[pname] = Property(self, pname, prop)
 
     def generate(self, model: "Model") -> None:
         """While loading the schema, this function will validate and
@@ -317,12 +302,18 @@ class Schema:
 
     @property
     def source_prop(self) -> Optional[Property]:
-        """The entity property to be used as an edge source
+        """The entity property to be used as an edge source when the schema is
+        considered as a relationship."""
+        if self.edge_source is None:
+            return None
         return self.get(self.edge_source)
 
     @property
     def target_prop(self) -> Optional[Property]:
-        """The entity property to be used as an edge target
+        """The entity property to be used as an edge target when the schema is transformed
+        into a relationship."""
+        if self.edge_target is None:
+            return None
         return self.get(self.edge_target)
 
     @property
@@ -404,13 +395,13 @@ class Schema:
             other = other.name
         return other in self.names
 
-    def get(self, name:
+    def get(self, name: str) -> Optional[Property]:
        """Retrieve a property defined for this schema by its name."""
         if name is None:
             return None
         return self.properties.get(name)
 
-    def validate(self, data: Any) -> Optional[str]:
+    def validate(self, data: Dict[str, Any]) -> Optional[str]:
         """Validate a dictionary against the given schema.
         This will also drop keys which are not valid as properties.
         """
@@ -478,7 +469,7 @@ class Schema:
     def __eq__(self, other: Any) -> bool:
         """Compare two schemata (via hash)."""
         try:
-            return self._hash ==
+            return self._hash == other._hash  # type: ignore
         except AttributeError:
            return False
 
@@ -486,10 +477,7 @@ class Schema:
         return self.name.__lt__(other.name)
 
     def __hash__(self) -> int:
-        try:
-            return self._hash
-        except AttributeError:
-            return super().__hash__()
+        return self._hash
 
     def __repr__(self) -> str:
        return "<Schema(%r)>" % self.name
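Net effect of these hunks: Schema drops its RDF URI, interns its string members through the new const() helper, and makes get(), source_prop and target_prop stricter about None. Below is a minimal sketch of the reworked accessors; Model.instance(), Schema.get() and source_prop/target_prop are taken from this diff, while the "Ownership" schema and its "owner" property are assumed to ship with the bundled model as in earlier releases.

```python
# Sketch only: assumes the Model.instance() singleton used elsewhere in this
# diff and the stock "Ownership" schema from the bundled model.
from followthemoney.model import Model

model = Model.instance()
ownership = model.get("Ownership")
if ownership is not None:
    owner = ownership.get("owner")   # get() now takes a plain str, returns Optional[Property]
    print(ownership.edge)            # True when both edge_source and edge_target are set
    print(ownership.source_prop)     # now returns None instead of failing when unset
    print(owner)
```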
followthemoney/statement/__init__.py
ADDED
@@ -0,0 +1,19 @@
+from followthemoney.statement.statement import Statement, StatementDict
+from followthemoney.statement.serialize import CSV, JSON, PACK, FORMATS
+from followthemoney.statement.serialize import write_statements
+from followthemoney.statement.serialize import read_statements, read_path_statements
+from followthemoney.statement.entity import SE, StatementEntity
+
+__all__ = [
+    "Statement",
+    "StatementDict",
+    "StatementEntity",
+    "SE",
+    "CSV",
+    "JSON",
+    "PACK",
+    "FORMATS",
+    "write_statements",
+    "read_statements",
+    "read_path_statements",
+]
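This new package is the public entry point for the statement-based data model introduced in 4.0.0. A hedged sketch of constructing a Statement directly is shown below; the keyword arguments mirror those used by StatementEntity.unsafe_add() later in this diff, and the field values are invented. The serialization helpers (write_statements, read_statements, read_path_statements) and the CSV/JSON/PACK format constants come from followthemoney.statement.serialize, whose body is not shown here.

```python
# Sketch only: Statement keyword arguments are taken from this diff; the
# concrete values ("acme-inc", "default", ...) are illustrative.
from followthemoney.statement import Statement

stmt = Statement(
    entity_id="acme-inc",   # hypothetical entity ID
    prop="name",
    schema="Company",
    value="ACME, Inc.",
    dataset="default",      # the dataset is passed by name
    lang="eng",
)
print(stmt.prop, stmt.value, stmt.dataset)
```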
followthemoney/statement/entity.py
ADDED
@@ -0,0 +1,437 @@
+from hashlib import sha1
+from collections.abc import Mapping
+from typing import Any, Dict, List, Optional, Set, Type
+from typing import Generator, Iterable, Tuple, TypeVar
+
+from followthemoney.model import Model
+from followthemoney.exc import InvalidData
+from followthemoney.types.common import PropertyType
+from followthemoney.property import Property
+from followthemoney.util import gettext
+from followthemoney.proxy import P
+from followthemoney.types import registry
+from followthemoney.value import string_list, Values
+from followthemoney.proxy import EntityProxy
+from followthemoney.dataset import Dataset, DefaultDataset
+from followthemoney.statement.statement import Statement
+from followthemoney.statement.util import BASE_ID
+
+SE = TypeVar("SE", bound="StatementEntity")
+
+
+class StatementEntity(EntityProxy):
+    """An entity object that can link to a set of datasets that it is sourced from."""
+
+    __slots__ = (
+        "schema",
+        "id",
+        "_caption",
+        "extra_referents",
+        "dataset",
+        "last_change",
+        "_statements",
+    )
+
+    def __init__(self, dataset: Dataset, data: Dict[str, Any], cleaned: bool = True):
+        data = dict(data or {})
+        schema = Model.instance().get(data.pop("schema", None))
+        if schema is None:
+            raise InvalidData(gettext("No schema for entity."))
+        self.schema = schema
+
+        self._caption: Optional[str] = None
+        """A pre-computed label for this entity."""
+
+        self.extra_referents: Set[str] = set(data.pop("referents", []))
+        """The IDs of all entities which are included in this canonical entity."""
+
+        self.last_change: Optional[str] = data.get("last_change", None)
+        """The last time this entity was changed."""
+
+        self.dataset = dataset
+        """The default dataset for new statements."""
+
+        self.id: Optional[str] = data.pop("id", None)
+        self._statements: Dict[str, Set[Statement]] = {}
+
+        properties = data.pop("properties", None)
+        if isinstance(properties, Mapping):
+            for key, value in properties.items():
+                self.add(key, value, cleaned=cleaned, quiet=True)
+
+        for stmt_data in data.pop("statements", []):
+            stmt = Statement.from_dict(stmt_data)
+            if self.id is not None:
+                stmt.canonical_id = self.id
+            self.add_statement(stmt)
+
+    @property
+    def _properties(self) -> Dict[str, List[str]]:  # type: ignore
+        return {p: [s.value for s in v] for p, v in self._statements.items()}
+
+    def _iter_stmt(self) -> Generator[Statement, None, None]:
+        for stmts in self._statements.values():
+            for stmt in stmts:
+                if stmt.entity_id is None and self.id is not None:
+                    stmt.entity_id = self.id
+                    stmt.id = stmt.generate_key()
+                if stmt.id is None:
+                    stmt.id = stmt.generate_key()
+                yield stmt
+
+    @property
+    def statements(self) -> Generator[Statement, None, None]:
+        """Return all statements for this entity, with extra ID statement."""
+        ids: List[str] = []
+        last_seen: Set[str] = set()
+        first_seen: Set[str] = set()
+        for stmt in self._iter_stmt():
+            yield stmt
+            if stmt.id is not None:
+                ids.append(stmt.id)
+            if stmt.last_seen is not None:
+                last_seen.add(stmt.last_seen)
+            if stmt.first_seen is not None:
+                first_seen.add(stmt.first_seen)
+        if self.id is not None:
+            digest = sha1(self.schema.name.encode("utf-8"))
+            for id in sorted(ids):
+                digest.update(id.encode("utf-8"))
+            checksum = digest.hexdigest()
+            # This is to make the last_change value stable across
+            # serialisation:
+            first = self.last_change or min(first_seen, default=None)
+            yield Statement(
+                canonical_id=self.id,
+                entity_id=self.id,
+                prop=BASE_ID,
+                schema=self.schema.name,
+                value=checksum,
+                dataset=self.dataset.name,
+                first_seen=first,
+                last_seen=max(last_seen, default=None),
+            )
+
+    @property
+    def first_seen(self) -> Optional[str]:
+        seen = (s.first_seen for s in self._iter_stmt() if s.first_seen is not None)
+        return min(seen, default=None)
+
+    @property
+    def last_seen(self) -> Optional[str]:
+        seen = (s.last_seen for s in self._iter_stmt() if s.last_seen is not None)
+        return max(seen, default=None)
+
+    @property
+    def datasets(self) -> Set[str]:
+        datasets: Set[str] = set()
+        for stmt in self._iter_stmt():
+            datasets.add(stmt.dataset)
+        return datasets
+
+    @property
+    def referents(self) -> Set[str]:
+        referents: Set[str] = set(self.extra_referents)
+        for stmt in self._iter_stmt():
+            if stmt.entity_id is not None and stmt.entity_id != self.id:
+                referents.add(stmt.entity_id)
+        return referents
+
+    @property
+    def key_prefix(self) -> Optional[str]:
+        return self.dataset.name
+
+    @key_prefix.setter
+    def key_prefix(self, dataset: Optional[str]) -> None:
+        raise NotImplementedError()
+
+    def add_statement(self, stmt: Statement) -> None:
+        schema = self.schema
+        if not schema.is_a(stmt.schema):
+            try:
+                self.schema = schema.model.common_schema(schema, stmt.schema)
+            except InvalidData as exc:
+                raise InvalidData(f"{self.id}: {exc}") from exc
+
+        if stmt.prop == BASE_ID:
+            if stmt.first_seen is not None:
+                # The last_change attribute describes the latest checksum change
+                # of any emitted component of the entity, which is stored in the BASE
+                # field.
+                if self.last_change is None:
+                    self.last_change = stmt.first_seen
+                else:
+                    self.last_change = max(self.last_change, stmt.first_seen)
+        else:
+            self._statements.setdefault(stmt.prop, set())
+            self._statements[stmt.prop].add(stmt)
+
+    def get(self, prop: P, quiet: bool = False) -> List[str]:
+        prop_name = self._prop_name(prop, quiet=quiet)
+        if prop_name is None or prop_name not in self._statements:
+            return []
+        return list({s.value for s in self._statements[prop_name]})
+
+    def get_statements(self, prop: P, quiet: bool = False) -> List[Statement]:
+        prop_name = self._prop_name(prop, quiet=quiet)
+        if prop_name is None or prop_name not in self._statements:
+            return []
+        return list(self._statements[prop_name])
+
+    def set(
+        self,
+        prop: P,
+        values: Values,
+        cleaned: bool = False,
+        quiet: bool = False,
+        fuzzy: bool = False,
+        format: Optional[str] = None,
+        lang: Optional[str] = None,
+        original_value: Optional[str] = None,
+        origin: Optional[str] = None,
+    ) -> None:
+        prop_name = self._prop_name(prop, quiet=quiet)
+        if prop_name is None:
+            return
+        self._statements.pop(prop_name, None)
+        return self.add(
+            prop,
+            values,
+            cleaned=cleaned,
+            quiet=quiet,
+            fuzzy=fuzzy,
+            format=format,
+            lang=lang,
+            original_value=original_value,
+            origin=origin,
+        )
+
+    def add(
+        self,
+        prop: P,
+        values: Values,
+        cleaned: bool = False,
+        quiet: bool = False,
+        fuzzy: bool = False,
+        format: Optional[str] = None,
+        lang: Optional[str] = None,
+        original_value: Optional[str] = None,
+        origin: Optional[str] = None,
+    ) -> None:
+        prop_name = self._prop_name(prop, quiet=quiet)
+        if prop_name is None:
+            return None
+        prop = self.schema.properties[prop_name]
+        for value in string_list(values, sanitize=not cleaned):
+            self.unsafe_add(
+                prop,
+                value,
+                cleaned=cleaned,
+                fuzzy=fuzzy,
+                format=format,
+                quiet=quiet,
+                lang=lang,
+                original_value=original_value,
+                origin=origin,
+            )
+        return None
+
+    def unsafe_add(
+        self,
+        prop: Property,
+        value: Optional[str],
+        cleaned: bool = False,
+        fuzzy: bool = False,
+        format: Optional[str] = None,
+        quiet: bool = False,
+        schema: Optional[str] = None,
+        dataset: Optional[str] = None,
+        seen: Optional[str] = None,
+        lang: Optional[str] = None,
+        original_value: Optional[str] = None,
+        origin: Optional[str] = None,
+    ) -> Optional[str]:
+        """Add a statement to the entity, possibly the value."""
+        if value is None or len(value) == 0:
+            return None
+
+        # Don't allow setting the reverse properties:
+        if prop.stub:
+            if quiet:
+                return None
+            msg = gettext("Stub property (%s): %s")
+            raise InvalidData(msg % (self.schema, prop))
+
+        if lang is not None:
+            lang = registry.language.clean_text(lang)
+
+        clean: Optional[str] = value
+        if not cleaned:
+            clean = prop.type.clean_text(value, proxy=self, fuzzy=fuzzy, format=format)
+
+        if clean is None:
+            return None
+
+        if original_value is None and clean != value:
+            original_value = value
+
+        if self.id is None:
+            raise InvalidData("Cannot add statement to entity without ID!")
+        stmt = Statement(
+            entity_id=self.id,
+            prop=prop.name,
+            schema=schema or self.schema.name,
+            value=clean,
+            dataset=dataset or self.dataset.name,
+            lang=lang,
+            original_value=original_value,
+            first_seen=seen,
+            origin=origin,
+        )
+        self.add_statement(stmt)
+        return clean
+
+    def pop(self, prop: P, quiet: bool = True) -> List[str]:
+        prop_name = self._prop_name(prop, quiet=quiet)
+        if prop_name is None or prop_name not in self._statements:
+            return []
+        return list({s.value for s in self._statements.pop(prop_name, [])})
+
+    def remove(self, prop: P, value: str, quiet: bool = True) -> None:
+        prop_name = self._prop_name(prop, quiet=quiet)
+        if prop_name is not None and prop_name in self._properties:
+            stmts = {s for s in self._statements[prop_name] if s.value != value}
+            self._statements[prop_name] = stmts
+
+    def itervalues(self) -> Generator[Tuple[Property, str], None, None]:
+        for name, statements in self._statements.items():
+            prop = self.schema.properties[name]
+            for value in set((s.value for s in statements)):
+                yield (prop, value)
+
+    def get_type_values(
+        self, type_: PropertyType, matchable: bool = False
+    ) -> List[str]:
+        combined: Set[str] = set()
+        for stmt in self.get_type_statements(type_, matchable=matchable):
+            combined.add(stmt.value)
+        return list(combined)
+
+    def get_type_statements(
+        self, type_: PropertyType, matchable: bool = False
+    ) -> List[Statement]:
+        combined = []
+        for prop_name, statements in self._statements.items():
+            prop = self.schema.properties[prop_name]
+            if matchable and not prop.matchable:
+                continue
+            if prop.type == type_:
+                for statement in statements:
+                    combined.append(statement)
+        return combined
+
+    @property
+    def properties(self) -> Dict[str, List[str]]:
+        return {p: list({s.value for s in vs}) for p, vs in self._statements.items()}
+
+    def iterprops(self) -> List[Property]:
+        return [self.schema.properties[p] for p in self._statements.keys()]
+
+    def clone(self: SE) -> SE:
+        data = {"schema": self.schema.name, "id": self.id}
+        cloned = type(self)(self.dataset, data)
+        for stmt in self._iter_stmt():
+            cloned.add_statement(stmt)
+        return cloned
+
+    def merge(self: SE, other: EntityProxy) -> SE:
+        try:
+            self.schema = self.schema.model.common_schema(self.schema, other.schema)
+        except InvalidData as e:
+            msg = "Cannot merge entities with id %s: %s"
+            raise InvalidData(msg % (self.id, e))
+
+        if not isinstance(other, StatementEntity):
+            for prop, values in other._properties.items():
+                self.add(prop, values, cleaned=True, quiet=True)
+            return self
+        for stmt in other._iter_stmt():
+            if self.id is not None:
+                stmt.canonical_id = self.id
+            self.add_statement(stmt)
+        self.extra_referents.update(other.extra_referents)
+        return self
+
+    def to_dict(self) -> Dict[str, Any]:
+        data: Dict[str, Any] = {
+            "id": self.id,
+            "caption": self.caption,
+            "schema": self.schema.name,
+            "properties": self.properties,
+            "referents": list(self.referents),
+            "datasets": list(self.datasets),
+        }
+        if self.first_seen is not None:
+            data["first_seen"] = self.first_seen
+        if self.last_seen is not None:
+            data["last_seen"] = self.last_seen
+        if self.last_change is not None:
+            data["last_change"] = self.last_change
+        return data
+
+    def to_statement_dict(self) -> Dict[str, Any]:
+        """Return a dictionary representation of the entity's statements."""
+        data: Dict[str, Any] = {
+            "id": self.id,
+            "caption": self.caption,
+            "schema": self.schema.name,
+            "statements": [stmt.to_dict() for stmt in self.statements],
+            "referents": list(self.referents),
+            "datasets": list(self.datasets),
+        }
+        if self.first_seen is not None:
+            data["first_seen"] = self.first_seen
+        if self.last_seen is not None:
+            data["last_seen"] = self.last_seen
+        if self.last_change is not None:
+            data["last_change"] = self.last_change
+        return data
+
+    def __len__(self) -> int:
+        return len(list(self._iter_stmt())) + 1
+
+    @classmethod
+    def from_dict(
+        cls: Type[SE],
+        data: Dict[str, Any],
+        cleaned: bool = True,
+        default_dataset: Optional[Dataset] = None,
+    ) -> SE:
+        # Exists only for backwards compatibility.
+        dataset = default_dataset or DefaultDataset
+        return cls(dataset, data, cleaned=cleaned)
+
+    @classmethod
+    def from_data(
+        cls: Type[SE],
+        dataset: Dataset,
+        data: Dict[str, Any],
+        cleaned: bool = True,
+    ) -> SE:
+        return cls(dataset, data, cleaned=cleaned)
+
+    @classmethod
+    def from_statements(
+        cls: Type[SE],
+        dataset: Dataset,
+        statements: Iterable[Statement],
+    ) -> SE:
+        obj: Optional[SE] = None
+        for stmt in statements:
+            if obj is None:
+                data = {"schema": stmt.schema, "id": stmt.canonical_id}
+                obj = cls(dataset, data)
+            obj.add_statement(stmt)
+        if obj is None:
+            raise ValueError("No statements given!")
+        return obj
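StatementEntity keeps per-value provenance as Statement records while preserving the EntityProxy interface. Below is a hedged sketch of the construction paths shown above, assuming DefaultDataset and the bundled Company schema behave as in earlier releases; the identifiers and values are invented.

```python
# Sketch only, based on the constructors and methods shown in this diff.
from followthemoney.dataset import DefaultDataset
from followthemoney.statement import StatementEntity

data = {
    "id": "acme-inc",                      # invented entity ID
    "schema": "Company",
    "properties": {"name": ["ACME, Inc."]},
}
entity = StatementEntity.from_data(DefaultDataset, data)
entity.add("jurisdiction", "us")           # cleaned and stored as a Statement

for stmt in entity.statements:             # includes the synthetic BASE_ID row
    print(stmt.prop, stmt.value, stmt.dataset, stmt.first_seen)

print(entity.to_dict()["properties"])      # plain {prop: [values]} view
```

from_statements() covers the inverse path, rebuilding a single entity from a stream of Statement rows for a given dataset, and to_statement_dict() round-trips the statement-level view.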