followthemoney 3.8.4__py3-none-any.whl → 4.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- followthemoney/__init__.py +30 -10
- followthemoney/cli/__init__.py +3 -12
- followthemoney/cli/aggregate.py +1 -1
- followthemoney/cli/cli.py +1 -1
- followthemoney/cli/exports.py +6 -2
- followthemoney/cli/mapping.py +6 -4
- followthemoney/cli/sieve.py +1 -1
- followthemoney/cli/statement.py +62 -0
- followthemoney/cli/util.py +2 -3
- followthemoney/compare.py +26 -16
- followthemoney/dataset/__init__.py +17 -0
- followthemoney/dataset/catalog.py +77 -0
- followthemoney/dataset/coverage.py +29 -0
- followthemoney/dataset/dataset.py +137 -0
- followthemoney/dataset/publisher.py +25 -0
- followthemoney/dataset/resource.py +30 -0
- followthemoney/dataset/util.py +58 -0
- followthemoney/entity.py +73 -0
- followthemoney/exc.py +6 -0
- followthemoney/export/common.py +3 -3
- followthemoney/export/csv.py +10 -12
- followthemoney/export/neo4j.py +1 -1
- followthemoney/export/rdf.py +57 -5
- followthemoney/graph.py +6 -4
- followthemoney/mapping/csv.py +6 -18
- followthemoney/mapping/sql.py +3 -4
- followthemoney/model.py +36 -9
- followthemoney/namespace.py +3 -1
- followthemoney/ontology.py +18 -16
- followthemoney/property.py +12 -15
- followthemoney/proxy.py +44 -65
- followthemoney/schema/Analyzable.yaml +2 -3
- followthemoney/schema/BankAccount.yaml +2 -3
- followthemoney/schema/Company.yaml +0 -6
- followthemoney/schema/Contract.yaml +0 -1
- followthemoney/schema/CryptoWallet.yaml +1 -1
- followthemoney/schema/Document.yaml +0 -6
- followthemoney/schema/Interval.yaml +7 -0
- followthemoney/schema/LegalEntity.yaml +6 -0
- followthemoney/schema/License.yaml +2 -0
- followthemoney/schema/Page.yaml +0 -1
- followthemoney/schema/Person.yaml +0 -5
- followthemoney/schema/Sanction.yaml +1 -0
- followthemoney/schema/Thing.yaml +0 -2
- followthemoney/schema/UserAccount.yaml +6 -3
- followthemoney/schema.py +27 -39
- followthemoney/statement/__init__.py +19 -0
- followthemoney/statement/entity.py +437 -0
- followthemoney/statement/serialize.py +245 -0
- followthemoney/statement/statement.py +256 -0
- followthemoney/statement/util.py +31 -0
- followthemoney/types/__init__.py +66 -23
- followthemoney/types/address.py +3 -3
- followthemoney/types/checksum.py +3 -7
- followthemoney/types/common.py +9 -14
- followthemoney/types/country.py +3 -7
- followthemoney/types/date.py +21 -11
- followthemoney/types/email.py +0 -4
- followthemoney/types/entity.py +5 -11
- followthemoney/types/gender.py +6 -10
- followthemoney/types/identifier.py +9 -3
- followthemoney/types/ip.py +5 -9
- followthemoney/types/json.py +2 -2
- followthemoney/types/language.py +3 -7
- followthemoney/types/mimetype.py +4 -8
- followthemoney/types/name.py +7 -8
- followthemoney/types/number.py +88 -6
- followthemoney/types/phone.py +4 -11
- followthemoney/types/string.py +4 -4
- followthemoney/types/topic.py +3 -7
- followthemoney/types/url.py +5 -10
- followthemoney/util.py +12 -13
- followthemoney/value.py +67 -0
- {followthemoney-3.8.4.dist-info → followthemoney-4.0.0.dist-info}/METADATA +38 -34
- {followthemoney-3.8.4.dist-info → followthemoney-4.0.0.dist-info}/RECORD +78 -69
- {followthemoney-3.8.4.dist-info → followthemoney-4.0.0.dist-info}/entry_points.txt +1 -0
- {followthemoney-3.8.4.dist-info → followthemoney-4.0.0.dist-info}/licenses/LICENSE +1 -0
- followthemoney/offshore.py +0 -48
- followthemoney/rdf.py +0 -9
- followthemoney/schema/Assessment.yaml +0 -32
- followthemoney/schema/Post.yaml +0 -42
- followthemoney/types/iban.py +0 -58
- followthemoney/types/registry.py +0 -52
- {followthemoney-3.8.4.dist-info → followthemoney-4.0.0.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
from normality import slugify
|
|
2
|
+
from typing import Annotated, Any
|
|
3
|
+
from pydantic import BeforeValidator
|
|
4
|
+
|
|
5
|
+
from followthemoney.types import registry
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def dataset_name_check(value: str) -> str:
    """Validate a dataset name without altering it.

    A name is accepted only when slugifying it with ``_`` as the separator
    yields the very same string. Invalid names are never repaired here; the
    error forces the caller to supply a compliant name.

    Returns:
        The unchanged ``value``.

    Raises:
        ValueError: If ``value`` differs from its slugified form.
    """
    normalised = slugify(value, sep="_")
    if normalised == value:
        return value
    raise ValueError("Invalid %s: %r" % ("dataset name", value))
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def type_check_date(value: Any) -> str:
    """Check that the given value is a valid date string.

    Returns:
        The result of ``registry.date.clean(value)``.

    Raises:
        ValueError: If ``registry.date.clean`` returns ``None`` for ``value``.
    """
    cleaned = registry.date.clean(value)
    if cleaned is None:
        # Wrap the operand in a 1-tuple: with a bare ``% value`` a tuple input
        # would be unpacked as several format arguments and raise TypeError
        # instead of the intended ValueError.
        raise ValueError("Invalid date: %r" % (value,))
    return cleaned
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# ``str`` annotated with a ``BeforeValidator`` that runs ``type_check_date`` on the input.
PartialDate = Annotated[str, BeforeValidator(type_check_date)]
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def type_check_country(value: Any) -> str:
    """Check that the given value is a valid country code.

    Returns:
        The result of ``registry.country.clean(value)``.

    Raises:
        ValueError: If ``registry.country.clean`` returns ``None`` for ``value``.
    """
    cleaned = registry.country.clean(value)
    if cleaned is None:
        # Wrap the operand in a 1-tuple: with a bare ``% value`` a tuple input
        # would be unpacked as several format arguments and raise TypeError
        # instead of the intended ValueError.
        raise ValueError("Invalid country code: %r" % (value,))
    return cleaned
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# ``str`` annotated with a ``BeforeValidator`` that runs ``type_check_country`` on the input.
CountryCode = Annotated[str, BeforeValidator(type_check_country)]
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class Named:
    """Base class for objects identified by a ``name`` string.

    Equality, ordering and hashing are all derived from ``name`` alone.
    """

    name: str

    def __init__(self, name: str) -> None:
        self.name = name

    def __eq__(self, other: Any) -> bool:
        """Return True when ``other`` exposes an equal ``name``.

        Objects without a ``name`` attribute compare unequal.
        """
        try:
            return bool(self.name == other.name)
        except AttributeError:
            return False

    def __lt__(self, other: Any) -> bool:
        """Order by ``name``.

        Returns ``NotImplemented`` for operands without a ``name`` attribute,
        so that Python can try the reflected operation and otherwise raise
        its usual ``TypeError``, in line with how ``__eq__`` tolerates them.
        """
        try:
            other_name = other.name
        except AttributeError:
            return NotImplemented
        # Call str.__lt__ directly so that a non-string name yields
        # NotImplemented (letting the other operand have a go) rather than
        # raising from inside this method.
        return self.name.__lt__(other_name)

    def __hash__(self) -> int:
        return hash(self.name)

    def __repr__(self) -> str:
        return f"<{self.__class__.__name__}({self.name!r})>"
|
followthemoney/entity.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
from typing import Any, Dict, List, Optional, Set, TypeVar
|
|
2
|
+
|
|
3
|
+
from rigour.names import pick_name
|
|
4
|
+
|
|
5
|
+
from followthemoney.proxy import EntityProxy
|
|
6
|
+
from followthemoney.schema import Schema
|
|
7
|
+
from followthemoney.statement.util import BASE_ID
|
|
8
|
+
|
|
9
|
+
VE = TypeVar("VE", bound="ValueEntity")
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _defined(*args: Optional[str]) -> List[str]:
|
|
13
|
+
return [arg for arg in args if arg is not None]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class ValueEntity(EntityProxy):
    """
    This class has the extended attributes from `StatementEntity` but without
    statements. Useful for streaming around. Starting from followthemoney 4.0,
    applications should use this entity class as the base class.
    """

    def __init__(
        self,
        schema: Schema,
        data: Dict[str, Any],
        key_prefix: Optional[str] = None,
        cleaned: bool = True,
    ):
        """Build the entity from ``data``.

        Besides what ``EntityProxy`` reads, the optional keys ``caption``,
        ``datasets``, ``referents``, ``first_seen``, ``last_seen`` and
        ``last_change`` are picked up. An optional ``statements`` list is
        folded into the entity as well.
        """
        super().__init__(schema, data, key_prefix=key_prefix, cleaned=cleaned)
        self._caption: Optional[str] = data.get("caption")
        self.datasets: Set[str] = set(data.get("datasets", []))
        self.referents: Set[str] = set(data.get("referents", []))
        self.first_seen: Optional[str] = data.get("first_seen")
        self.last_seen: Optional[str] = data.get("last_seen")
        self.last_change: Optional[str] = data.get("last_change")

        # add data from statement dict if present.
        # this updates the dataset and referents set
        # NOTE: ``pop`` removes the "statements" key from the caller's dict.
        for stmt_data in data.pop("statements", []):
            self.datasets.add(stmt_data["dataset"])
            if stmt_data["entity_id"] != self.id:
                self.referents.add(stmt_data["entity_id"])
            # NOTE(review): BASE_ID presumably marks the statement carrying
            # the entity ID itself rather than a property value; confirm.
            if stmt_data["prop"] != BASE_ID:
                self.add(stmt_data["prop"], stmt_data["value"])

    def merge(self: "ValueEntity", other: "ValueEntity") -> "ValueEntity":
        """Merge ``other`` into this entity and return the merged entity.

        On top of the base-class merge, the caption is re-picked from both
        captions, referents and datasets are unioned, ``first_seen`` becomes
        the smallest and ``last_seen`` / ``last_change`` the largest of the
        defined values (``None`` when neither side defines one).
        """
        merged = super().merge(other)
        merged._caption = pick_name(_defined(self._caption, other._caption))
        merged.referents.update(other.referents)
        merged.datasets.update(other.datasets)
        # Write the timestamps to ``merged`` like every other field above, so
        # the result is correct whichever object the base merge hands back.
        first_seen = _defined(self.first_seen, other.first_seen)
        last_seen = _defined(self.last_seen, other.last_seen)
        changed = _defined(self.last_change, other.last_change)
        merged.first_seen = min(first_seen, default=None)
        merged.last_seen = max(last_seen, default=None)
        merged.last_change = max(changed, default=None)
        return merged

    def to_dict(self) -> Dict[str, Any]:
        """Serialise the entity to a plain dictionary.

        The three timestamp keys are only included when they are set. The
        stored caption wins over the computed ``caption`` when present.
        """
        data: Dict[str, Any] = {
            "id": self.id,
            "caption": self._caption or self.caption,
            "schema": self.schema.name,
            "properties": self.properties,
            "referents": list(self.referents),
            "datasets": list(self.datasets),
        }
        if self.first_seen is not None:
            data["first_seen"] = self.first_seen
        if self.last_seen is not None:
            data["last_seen"] = self.last_seen
        if self.last_change is not None:
            data["last_change"] = self.last_change
        return data
|
followthemoney/exc.py
CHANGED
|
@@ -11,6 +11,12 @@ class FollowTheMoneyException(Exception):
|
|
|
11
11
|
pass
|
|
12
12
|
|
|
13
13
|
|
|
14
|
+
class MetadataException(FollowTheMoneyException):
    """An exception raised by dataset metadata validation."""
|
|
18
|
+
|
|
19
|
+
|
|
14
20
|
class InvalidData(FollowTheMoneyException):
|
|
15
21
|
"""Schema validation errors will be caught by the API."""
|
|
16
22
|
|
followthemoney/export/common.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from typing import Generator, List, Optional, Tuple
|
|
2
2
|
from followthemoney.property import Property
|
|
3
|
-
from followthemoney.proxy import
|
|
3
|
+
from followthemoney.proxy import EntityProxy
|
|
4
4
|
from followthemoney.schema import Schema
|
|
5
5
|
from followthemoney.types import registry
|
|
6
6
|
|
|
@@ -17,12 +17,12 @@ class Exporter(object):
|
|
|
17
17
|
yield prop
|
|
18
18
|
|
|
19
19
|
def exportable_fields(
|
|
20
|
-
self, proxy:
|
|
20
|
+
self, proxy: EntityProxy
|
|
21
21
|
) -> Generator[Tuple[Property, List[str]], None, None]:
|
|
22
22
|
for prop in self.exportable_properties(proxy.schema):
|
|
23
23
|
yield prop, proxy.get(prop)
|
|
24
24
|
|
|
25
|
-
def write(self, proxy:
|
|
25
|
+
def write(self, proxy: EntityProxy, extra: Optional[List[str]] = None) -> None:
|
|
26
26
|
raise NotImplementedError
|
|
27
27
|
|
|
28
28
|
def finalize(self) -> None:
|
followthemoney/export/csv.py
CHANGED
|
@@ -1,21 +1,19 @@
|
|
|
1
1
|
import csv
|
|
2
|
-
|
|
3
|
-
try:
|
|
4
|
-
from _csv import _writer as csv_writer
|
|
5
|
-
except ImportError:
|
|
6
|
-
# Python 3.8/3.9 work-around:
|
|
7
|
-
from _csv import writer as csv_writer # type: ignore
|
|
8
|
-
|
|
9
|
-
from io import TextIOWrapper
|
|
10
2
|
from pathlib import Path
|
|
11
|
-
from
|
|
3
|
+
from io import TextIOWrapper
|
|
4
|
+
from typing import Any, Dict, List, Optional, Protocol, Tuple
|
|
12
5
|
|
|
13
|
-
from followthemoney.proxy import
|
|
6
|
+
from followthemoney.proxy import EntityProxy
|
|
14
7
|
from followthemoney.export.common import Exporter
|
|
15
8
|
from followthemoney.schema import Schema
|
|
16
9
|
from followthemoney.util import PathLike
|
|
17
10
|
|
|
18
|
-
|
|
11
|
+
|
|
12
|
+
class CSVWriter(Protocol):
    """Structural type for the writer objects used in this module.

    It names a ``dialect`` attribute plus ``writerow`` and ``writerows``,
    the same members the standard library documents for the objects
    returned by ``csv.writer``.
    """

    @property
    def dialect(self) -> Any:
        """The dialect in use by the writer."""
        ...

    def writerow(self, row: Any) -> Any:
        """Write a single row."""
        ...

    def writerows(self, rows: Any) -> None:
        """Write several rows."""
        ...
|
|
19
17
|
|
|
20
18
|
|
|
21
19
|
class CSVMixin(object):
|
|
@@ -69,7 +67,7 @@ class CSVExporter(Exporter, CSVMixin):
|
|
|
69
67
|
headers.append(prop.name)
|
|
70
68
|
writer.writerow(headers)
|
|
71
69
|
|
|
72
|
-
def write(self, proxy:
|
|
70
|
+
def write(self, proxy: EntityProxy, extra: Optional[List[str]] = None) -> None:
|
|
73
71
|
writer = self._get_writer(proxy.schema)
|
|
74
72
|
cells = [proxy.id]
|
|
75
73
|
cells.extend(extra or [])
|
followthemoney/export/neo4j.py
CHANGED
|
@@ -150,7 +150,7 @@ class CypherGraphExporter(GraphExporter):
|
|
|
150
150
|
labels = list(node.schema.names)
|
|
151
151
|
else:
|
|
152
152
|
labels = [node.type.name]
|
|
153
|
-
cypher = "MERGE (p { %(id)s })
|
|
153
|
+
cypher = "MERGE (p { %(id)s }) SET p += { %(map)s } SET p :%(label)s;\n"
|
|
154
154
|
self.fh.write(
|
|
155
155
|
cypher
|
|
156
156
|
% {
|
followthemoney/export/rdf.py
CHANGED
|
@@ -1,23 +1,75 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from
|
|
3
|
-
from
|
|
2
|
+
from prefixdate import Precision
|
|
3
|
+
from rdflib import Graph, Namespace
|
|
4
|
+
from rdflib.term import Identifier, URIRef, Literal
|
|
5
|
+
from rdflib import RDF, SKOS, XSD
|
|
6
|
+
from typing import Generator, List, Optional, TextIO, Tuple
|
|
4
7
|
|
|
5
8
|
from followthemoney.export.common import Exporter
|
|
6
|
-
from followthemoney.
|
|
9
|
+
from followthemoney.types import registry
|
|
10
|
+
from followthemoney.proxy import EntityProxy
|
|
7
11
|
|
|
8
12
|
log = logging.getLogger(__name__)
|
|
13
|
+
Triple = Tuple[Identifier, Identifier, Identifier]
|
|
14
|
+
NS = Namespace("https://schema.followthemoney.tech/#")
|
|
9
15
|
|
|
10
16
|
|
|
11
17
|
class RDFExporter(Exporter):
|
|
18
|
+
"""Export the entity as RDF N-Triples."""
|
|
19
|
+
|
|
20
|
+
TYPE_PREFIXES = {
|
|
21
|
+
registry.checksum: "hash:",
|
|
22
|
+
registry.country: "http://id.loc.gov/vocabulary/countries/",
|
|
23
|
+
registry.email: "mailto:",
|
|
24
|
+
registry.entity: "e:",
|
|
25
|
+
registry.gender: "gender:",
|
|
26
|
+
registry.ip: "ip:",
|
|
27
|
+
registry.identifier: "id:",
|
|
28
|
+
registry.language: "http://lexvo.org/id/iso639-3/",
|
|
29
|
+
registry.mimetype: "urn:mimetype:",
|
|
30
|
+
registry.phone: "tel:",
|
|
31
|
+
registry.topic: "ftm:topic:",
|
|
32
|
+
}
|
|
33
|
+
|
|
12
34
|
def __init__(self, fh: TextIO, qualified: bool = True) -> None:
|
|
13
35
|
super(RDFExporter, self).__init__()
|
|
14
36
|
self.fh = fh
|
|
15
37
|
self.qualified = qualified
|
|
16
38
|
|
|
17
|
-
def
|
|
39
|
+
def entity_triples(self, proxy: EntityProxy) -> Generator[Triple, None, None]:
    """Yield the RDF triples describing ``proxy``.

    Nothing is yielded when the proxy has no ID or no schema. Otherwise the
    first triple types the entity with its schema name. In qualified mode a
    ``skos:prefLabel`` follows when the caption differs from the schema
    label. After that comes one triple per property value.

    Objects are built by property type: types listed in ``TYPE_PREFIXES``
    become URIs with that prefix (identifiers also get their ``format``
    appended when set). Dates become ``xsd:date`` literals when the value is
    shorter than ``Precision.HOUR.value`` characters and ``xsd:dateTime``
    otherwise. URLs become URIs as-is; everything else is a plain literal.
    """
    if proxy.id is None or proxy.schema is None:
        return
    subject = URIRef(f"{self.TYPE_PREFIXES[registry.entity]}{proxy.id}")
    yield (subject, RDF.type, NS[proxy.schema.name])
    if self.qualified:
        label = proxy.caption
        if label != proxy.schema.label:
            yield (subject, SKOS.prefLabel, Literal(label))
    for prop, value in proxy.itervalues():
        target: Identifier
        if prop.type in self.TYPE_PREFIXES:
            scheme = self.TYPE_PREFIXES[prop.type]
            if prop.type == registry.identifier and prop.format is not None:
                scheme = f"{scheme}{prop.format}:"
            target = URIRef(f"{scheme}{value}")
        elif prop.type == registry.date:
            coarse = len(value) < Precision.HOUR.value
            target = Literal(value, datatype=XSD.date if coarse else XSD.dateTime)
        elif prop.type == registry.url:
            target = URIRef(value)
        else:
            target = Literal(value)
        # Qualified output uses the namespaced qname, otherwise the bare name.
        predicate = NS[prop.qname] if self.qualified else URIRef(prop.name)
        yield (subject, predicate, target)
|
|
68
|
+
|
|
69
|
+
def write(self, proxy: EntityProxy, extra: Optional[List[str]] = None) -> None:
|
|
18
70
|
graph = Graph()
|
|
19
71
|
|
|
20
|
-
for triple in
|
|
72
|
+
for triple in self.entity_triples(proxy):
|
|
21
73
|
graph.add(triple)
|
|
22
74
|
try:
|
|
23
75
|
nt = graph.serialize(format="nt11").strip()
|
followthemoney/graph.py
CHANGED
|
@@ -5,6 +5,7 @@ This module provides an abstract data object that represents a property
|
|
|
5
5
|
graph. This is used by the exporter modules to convert data
|
|
6
6
|
to a specific output format, like Cypher or NetworkX.
|
|
7
7
|
"""
|
|
8
|
+
|
|
8
9
|
import logging
|
|
9
10
|
from typing import Any, Dict, Generator, Iterable, List, Optional
|
|
10
11
|
|
|
@@ -69,6 +70,8 @@ class Node(object):
|
|
|
69
70
|
def from_proxy(cls, proxy: EntityProxy) -> "Node":
|
|
70
71
|
"""For a given :class:`~followthemoney.proxy.EntityProxy`, return a node
|
|
71
72
|
based on the entity."""
|
|
73
|
+
if proxy.id is None:
|
|
74
|
+
raise InvalidModel("Invalid entity proxy: %r" % proxy)
|
|
72
75
|
return cls(registry.entity, proxy.id, proxy=proxy)
|
|
73
76
|
|
|
74
77
|
def __str__(self) -> str:
|
|
@@ -193,8 +196,7 @@ class Graph(object):
|
|
|
193
196
|
"""
|
|
194
197
|
|
|
195
198
|
def __init__(self, edge_types: Iterable[PropertyType] = registry.pivots) -> None:
|
|
196
|
-
|
|
197
|
-
self.edge_types = [t for t in types if t.matchable]
|
|
199
|
+
self.edge_types = [t for t in edge_types if t.matchable]
|
|
198
200
|
self.flush()
|
|
199
201
|
|
|
200
202
|
def flush(self) -> None:
|
|
@@ -256,11 +258,11 @@ class Graph(object):
|
|
|
256
258
|
"""Add an :class:`~followthemoney.proxy.EntityProxy` to the graph and make
|
|
257
259
|
it either a :class:`~followthemoney.graph.Node` or an
|
|
258
260
|
:class:`~followthemoney.graph.Edge`."""
|
|
259
|
-
if proxy is None:
|
|
261
|
+
if proxy is None or proxy.id is None:
|
|
260
262
|
return
|
|
261
263
|
self.queue(proxy.id, proxy)
|
|
262
264
|
if proxy.schema.edge:
|
|
263
|
-
for
|
|
265
|
+
for source, target in proxy.edgepairs():
|
|
264
266
|
self._add_edge(proxy, source, target)
|
|
265
267
|
else:
|
|
266
268
|
self._add_node(proxy)
|
followthemoney/mapping/csv.py
CHANGED
|
@@ -1,24 +1,12 @@
|
|
|
1
1
|
import io
|
|
2
2
|
import os
|
|
3
3
|
import logging
|
|
4
|
-
from banal.lists import ensure_list
|
|
5
4
|
import requests
|
|
6
5
|
from csv import DictReader
|
|
7
6
|
from urllib.parse import urlparse
|
|
8
|
-
from banal import keys_values
|
|
9
|
-
from typing import
|
|
10
|
-
|
|
11
|
-
Any,
|
|
12
|
-
Dict,
|
|
13
|
-
Generator,
|
|
14
|
-
ItemsView,
|
|
15
|
-
Iterable,
|
|
16
|
-
List,
|
|
17
|
-
Optional,
|
|
18
|
-
Set,
|
|
19
|
-
Tuple,
|
|
20
|
-
cast,
|
|
21
|
-
)
|
|
7
|
+
from banal import keys_values, ensure_list
|
|
8
|
+
from typing import TYPE_CHECKING, cast
|
|
9
|
+
from typing import Any, Dict, Generator, ItemsView, Iterable, List, Optional, Set, Tuple
|
|
22
10
|
|
|
23
11
|
from followthemoney.mapping.source import Record, Source
|
|
24
12
|
from followthemoney.util import sanitize_text
|
|
@@ -48,16 +36,16 @@ class CSVSource(Source):
|
|
|
48
36
|
|
|
49
37
|
def _parse_filters(self, filters: ItemsView[str, Any]) -> FilterList:
|
|
50
38
|
filters_set: FilterList = []
|
|
51
|
-
for
|
|
39
|
+
for key, value in filters:
|
|
52
40
|
values = set(cast(List[Optional[str]], ensure_list(value)))
|
|
53
41
|
filters_set.append((key, values))
|
|
54
42
|
return filters_set
|
|
55
43
|
|
|
56
44
|
def check_filters(self, data: Record) -> bool:
|
|
57
|
-
for
|
|
45
|
+
for k, v in self.filters_set:
|
|
58
46
|
if data.get(k) not in v:
|
|
59
47
|
return False
|
|
60
|
-
for
|
|
48
|
+
for k, v in self.filters_not_set:
|
|
61
49
|
if data.get(k) in v:
|
|
62
50
|
return False
|
|
63
51
|
return True
|
followthemoney/mapping/sql.py
CHANGED
|
@@ -3,8 +3,7 @@ import logging
|
|
|
3
3
|
from uuid import uuid4
|
|
4
4
|
from typing import TYPE_CHECKING, Any, Dict, Generator, List, Optional, Union, cast
|
|
5
5
|
from banal import ensure_list, is_listish, keys_values
|
|
6
|
-
from sqlalchemy import MetaData, func
|
|
7
|
-
from sqlalchemy.future import select
|
|
6
|
+
from sqlalchemy import MetaData, func, select
|
|
8
7
|
from sqlalchemy.engine import Engine, create_engine
|
|
9
8
|
from sqlalchemy.sql.elements import Label
|
|
10
9
|
from sqlalchemy.pool import NullPool
|
|
@@ -68,7 +67,7 @@ class SQLSource(Source):
|
|
|
68
67
|
return table.refs[ref]
|
|
69
68
|
raise InvalidMapping("Missing reference: %s" % ref)
|
|
70
69
|
|
|
71
|
-
def apply_filters(self, q: Select) -> Select:
|
|
70
|
+
def apply_filters(self, q: Select[Any]) -> Select[Any]:
|
|
72
71
|
for col, val in self.filters:
|
|
73
72
|
if is_listish(val):
|
|
74
73
|
q = q.where(self.get_column(col).in_(val))
|
|
@@ -88,7 +87,7 @@ class SQLSource(Source):
|
|
|
88
87
|
q = q.where(left == right)
|
|
89
88
|
return q
|
|
90
89
|
|
|
91
|
-
def compose_query(self) -> Select:
|
|
90
|
+
def compose_query(self) -> Select[Any]:
|
|
92
91
|
columns = [self.get_column(r) for r in self.query.refs]
|
|
93
92
|
q = select(*columns)
|
|
94
93
|
q = q.select_from(*[t.alias for t in self.tables])
|
followthemoney/model.py
CHANGED
|
@@ -1,16 +1,19 @@
|
|
|
1
1
|
import os
|
|
2
2
|
import yaml
|
|
3
3
|
from functools import lru_cache
|
|
4
|
-
from typing import
|
|
4
|
+
from typing import TYPE_CHECKING, Any
|
|
5
|
+
from typing import Dict, Generator, Iterator, Optional, Set, TypedDict, Union
|
|
5
6
|
|
|
6
7
|
from followthemoney.types import registry
|
|
7
8
|
from followthemoney.types.common import PropertyType, PropertyTypeToDict
|
|
8
9
|
from followthemoney.schema import Schema, SchemaToDict
|
|
9
10
|
from followthemoney.property import Property
|
|
10
|
-
from followthemoney.mapping import QueryMapping
|
|
11
|
-
from followthemoney.proxy import EntityProxy
|
|
12
11
|
from followthemoney.exc import InvalidModel, InvalidData
|
|
13
12
|
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from followthemoney.proxy import EntityProxy
|
|
15
|
+
from followthemoney.mapping import QueryMapping
|
|
16
|
+
|
|
14
17
|
|
|
15
18
|
class ModelToDict(TypedDict):
|
|
16
19
|
schemata: Dict[str, SchemaToDict]
|
|
@@ -22,6 +25,8 @@ class Model(object):
|
|
|
22
25
|
provides some helper functions to find schemata, properties or to instantiate
|
|
23
26
|
entity proxies based on the schema metadata."""
|
|
24
27
|
|
|
28
|
+
_instance: Optional["Model"] = None
|
|
29
|
+
|
|
25
30
|
__slots__ = ("path", "schemata", "properties", "qnames")
|
|
26
31
|
|
|
27
32
|
def __init__(self, path: str) -> None:
|
|
@@ -38,6 +43,15 @@ class Model(object):
|
|
|
38
43
|
self._load(os.path.join(path, filename))
|
|
39
44
|
self.generate()
|
|
40
45
|
|
|
46
|
+
@classmethod
def instance(cls) -> "Model":
    """Return the shared model, creating it on the first call.

    The model is loaded from the ``schema`` directory next to this module,
    unless the ``FTM_MODEL_PATH`` environment variable names another path.
    The created object is kept on ``cls._instance`` and reused afterwards.
    """
    if cls._instance is None:
        bundled = os.path.join(os.path.dirname(__file__), "schema")
        cls._instance = cls(os.environ.get("FTM_MODEL_PATH", bundled))
    return cls._instance
|
|
54
|
+
|
|
41
55
|
def generate(self) -> None:
|
|
42
56
|
"""Loading the model is a weird process because the schemata reference
|
|
43
57
|
each other in complex ways, so the generation process cannot be fully
|
|
@@ -89,13 +103,15 @@ class Model(object):
|
|
|
89
103
|
|
|
90
104
|
def make_mapping(
|
|
91
105
|
self, mapping: Dict[str, Any], key_prefix: Optional[str] = None
|
|
92
|
-
) -> QueryMapping:
|
|
106
|
+
) -> "QueryMapping":
|
|
93
107
|
"""Parse a mapping that applies (tabular) source data to the model."""
|
|
108
|
+
from followthemoney.mapping import QueryMapping
|
|
109
|
+
|
|
94
110
|
return QueryMapping(self, mapping, key_prefix=key_prefix)
|
|
95
111
|
|
|
96
112
|
def map_entities(
|
|
97
113
|
self, mapping: Dict[str, Any], key_prefix: Optional[str] = None
|
|
98
|
-
) -> Generator[EntityProxy, None, None]:
|
|
114
|
+
) -> Generator["EntityProxy", None, None]:
|
|
99
115
|
"""Given a mapping, yield a series of entities from the data source."""
|
|
100
116
|
gen = self.make_mapping(mapping, key_prefix=key_prefix)
|
|
101
117
|
for record in gen.source.records:
|
|
@@ -127,20 +143,31 @@ class Model(object):
|
|
|
127
143
|
msg = "No common schema: %s and %s"
|
|
128
144
|
raise InvalidData(msg % (left, right))
|
|
129
145
|
|
|
146
|
+
def matchable_schemata(self) -> Set[Schema]:
    """Return the set of all schemata that are matchable."""
    return {s for s in self.schemata.values() if s.matchable}
|
|
149
|
+
|
|
130
150
|
def make_entity(
|
|
131
151
|
self, schema: Union[str, Schema], key_prefix: Optional[str] = None
|
|
132
|
-
) -> EntityProxy:
|
|
152
|
+
) -> "EntityProxy":
|
|
133
153
|
"""Instantiate an empty entity proxy of the given schema type."""
|
|
134
|
-
|
|
154
|
+
from followthemoney.proxy import EntityProxy
|
|
155
|
+
|
|
156
|
+
schema_ = self.get(schema)
|
|
157
|
+
if schema_ is None:
|
|
158
|
+
raise InvalidData("Schema does not exist: %s" % schema)
|
|
159
|
+
return EntityProxy(schema_, {}, key_prefix=key_prefix)
|
|
135
160
|
|
|
136
|
-
def get_proxy(self, data: Dict[str, Any], cleaned: bool = True) -> EntityProxy:
|
|
161
|
+
def get_proxy(self, data: Dict[str, Any], cleaned: bool = True) -> "EntityProxy":
|
|
137
162
|
"""Create an entity proxy to reflect the entity data in the given
|
|
138
163
|
dictionary. If ``cleaned`` is disabled, all property values are
|
|
139
164
|
fully re-validated and normalised. Use this if handling input data
|
|
140
165
|
from an untrusted source."""
|
|
166
|
+
from followthemoney.proxy import EntityProxy
|
|
167
|
+
|
|
141
168
|
if isinstance(data, EntityProxy):
|
|
142
169
|
return data
|
|
143
|
-
return EntityProxy.from_dict(
|
|
170
|
+
return EntityProxy.from_dict(data, cleaned=cleaned)
|
|
144
171
|
|
|
145
172
|
def to_dict(self) -> ModelToDict:
|
|
146
173
|
"""Return metadata for all schemata and properties, in a serializable form."""
|
followthemoney/namespace.py
CHANGED
|
@@ -22,6 +22,7 @@ that the combined ID is specific to a dataset, without needing an (expensive)
|
|
|
22
22
|
index look up of each ID first. It can also be generated on the client or
|
|
23
23
|
the server without compromising isolation.
|
|
24
24
|
"""
|
|
25
|
+
|
|
25
26
|
import hmac
|
|
26
27
|
from typing import Any, Optional, Tuple, Union
|
|
27
28
|
|
|
@@ -95,7 +96,8 @@ class Namespace(object):
|
|
|
95
96
|
"""Rewrite an entity proxy so all IDs mentioned are limited to
|
|
96
97
|
the namespace."""
|
|
97
98
|
signed = proxy.clone()
|
|
98
|
-
|
|
99
|
+
if proxy.id is not None:
|
|
100
|
+
signed.id = self.sign(proxy.id)
|
|
99
101
|
if not shallow:
|
|
100
102
|
for prop in proxy.iterprops():
|
|
101
103
|
if prop.type != registry.entity:
|
followthemoney/ontology.py
CHANGED
|
@@ -1,15 +1,16 @@
|
|
|
1
1
|
import sys
|
|
2
2
|
from datetime import datetime
|
|
3
|
-
from rdflib import Graph, URIRef, Literal
|
|
3
|
+
from rdflib import Graph, URIRef, Literal, Namespace
|
|
4
4
|
from rdflib.namespace import OWL, DCTERMS, RDF, RDFS, XSD
|
|
5
5
|
|
|
6
6
|
from followthemoney import model
|
|
7
7
|
from followthemoney.property import Property
|
|
8
8
|
from followthemoney.schema import Schema
|
|
9
9
|
from followthemoney.types import registry
|
|
10
|
-
from followthemoney.rdf import NS
|
|
11
10
|
from followthemoney.util import PathLike
|
|
12
11
|
|
|
12
|
+
NS = Namespace("https://schema.followthemoney.tech/#")
|
|
13
|
+
|
|
13
14
|
|
|
14
15
|
class Ontology(object):
|
|
15
16
|
def __init__(self) -> None:
|
|
@@ -32,37 +33,38 @@ class Ontology(object):
|
|
|
32
33
|
self.add_class(schema)
|
|
33
34
|
|
|
34
35
|
def add_class(self, schema: Schema) -> None:
|
|
35
|
-
|
|
36
|
-
self.graph.add((
|
|
36
|
+
suri = NS[schema.name]
|
|
37
|
+
self.graph.add((suri, RDF.type, RDFS.Class))
|
|
38
|
+
self.graph.add((suri, RDFS.isDefinedBy, self.uri))
|
|
37
39
|
for parent in schema.extends:
|
|
38
|
-
self.graph.add((
|
|
40
|
+
self.graph.add((suri, RDFS.subClassOf, NS[parent.name]))
|
|
39
41
|
|
|
40
|
-
self.graph.add((
|
|
42
|
+
self.graph.add((suri, RDFS.label, Literal(schema.label)))
|
|
41
43
|
if schema.description is not None:
|
|
42
44
|
description = Literal(schema.description)
|
|
43
|
-
self.graph.add((
|
|
45
|
+
self.graph.add((suri, RDFS.comment, description))
|
|
44
46
|
|
|
45
47
|
for _, prop in sorted(schema.properties.items()):
|
|
46
48
|
self.add_property(prop)
|
|
47
49
|
|
|
48
50
|
def add_property(self, prop: Property) -> None:
|
|
49
|
-
|
|
50
|
-
self.graph.add((
|
|
51
|
+
puri = NS[prop.qname]
|
|
52
|
+
self.graph.add((puri, RDF.type, RDF.Property))
|
|
53
|
+
self.graph.add((puri, RDFS.isDefinedBy, self.uri))
|
|
51
54
|
|
|
52
|
-
self.graph.add((
|
|
55
|
+
self.graph.add((puri, RDFS.label, Literal(prop.label)))
|
|
53
56
|
if prop.description is not None:
|
|
54
|
-
self.graph.add((
|
|
57
|
+
self.graph.add((puri, RDFS.comment, Literal(prop.description)))
|
|
55
58
|
|
|
56
|
-
self.graph.add((
|
|
59
|
+
self.graph.add((puri, RDFS.domain, NS[prop.schema.name]))
|
|
57
60
|
if prop.range is not None:
|
|
58
61
|
range = model.get(prop.range)
|
|
59
62
|
if range is not None:
|
|
60
|
-
|
|
61
|
-
self.graph.add((prop.uri, RDFS.range, range_uri))
|
|
63
|
+
self.graph.add((puri, RDFS.range, NS[range.name]))
|
|
62
64
|
if prop.reverse is not None:
|
|
63
|
-
self.graph.add((
|
|
65
|
+
self.graph.add((puri, OWL.inverseOf, NS[prop.reverse.qname]))
|
|
64
66
|
if prop.type == registry.date:
|
|
65
|
-
self.graph.add((
|
|
67
|
+
self.graph.add((puri, RDFS.range, XSD.dateTime))
|
|
66
68
|
|
|
67
69
|
def write_namespace_docs(self, path: PathLike) -> None:
|
|
68
70
|
xml_fn = "%s/ftm.xml" % path
|