followthemoney-3.8.5-py3-none-any.whl → followthemoney-4.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- followthemoney/__init__.py +30 -10
- followthemoney/cli/cli.py +1 -1
- followthemoney/cli/exports.py +6 -2
- followthemoney/cli/statement.py +62 -0
- followthemoney/cli/util.py +2 -3
- followthemoney/compare.py +26 -16
- followthemoney/dataset/__init__.py +17 -0
- followthemoney/dataset/catalog.py +77 -0
- followthemoney/dataset/coverage.py +29 -0
- followthemoney/dataset/dataset.py +146 -0
- followthemoney/dataset/publisher.py +25 -0
- followthemoney/dataset/resource.py +30 -0
- followthemoney/dataset/util.py +55 -0
- followthemoney/entity.py +73 -0
- followthemoney/exc.py +6 -0
- followthemoney/export/rdf.py +57 -5
- followthemoney/graph.py +1 -2
- followthemoney/model.py +38 -11
- followthemoney/names.py +33 -0
- followthemoney/ontology.py +18 -16
- followthemoney/property.py +12 -15
- followthemoney/proxy.py +43 -64
- followthemoney/schema/Analyzable.yaml +2 -3
- followthemoney/schema/BankAccount.yaml +2 -3
- followthemoney/schema/Company.yaml +0 -6
- followthemoney/schema/Contract.yaml +0 -1
- followthemoney/schema/CryptoWallet.yaml +1 -1
- followthemoney/schema/Document.yaml +0 -6
- followthemoney/schema/Interval.yaml +7 -0
- followthemoney/schema/LegalEntity.yaml +6 -0
- followthemoney/schema/License.yaml +2 -0
- followthemoney/schema/Page.yaml +0 -1
- followthemoney/schema/Person.yaml +0 -5
- followthemoney/schema/Sanction.yaml +1 -0
- followthemoney/schema/Thing.yaml +0 -2
- followthemoney/schema/UserAccount.yaml +6 -3
- followthemoney/schema.py +30 -42
- followthemoney/statement/__init__.py +19 -0
- followthemoney/statement/entity.py +438 -0
- followthemoney/statement/serialize.py +251 -0
- followthemoney/statement/statement.py +256 -0
- followthemoney/statement/util.py +31 -0
- followthemoney/types/__init__.py +66 -23
- followthemoney/types/address.py +3 -3
- followthemoney/types/checksum.py +3 -7
- followthemoney/types/common.py +9 -14
- followthemoney/types/country.py +3 -7
- followthemoney/types/date.py +21 -11
- followthemoney/types/email.py +0 -4
- followthemoney/types/entity.py +5 -11
- followthemoney/types/gender.py +6 -10
- followthemoney/types/identifier.py +9 -3
- followthemoney/types/ip.py +5 -9
- followthemoney/types/json.py +2 -2
- followthemoney/types/language.py +3 -7
- followthemoney/types/mimetype.py +4 -8
- followthemoney/types/name.py +7 -8
- followthemoney/types/number.py +88 -6
- followthemoney/types/phone.py +4 -11
- followthemoney/types/string.py +4 -4
- followthemoney/types/topic.py +3 -7
- followthemoney/types/url.py +5 -10
- followthemoney/util.py +12 -13
- followthemoney/value.py +67 -0
- {followthemoney-3.8.5.dist-info → followthemoney-4.0.1.dist-info}/METADATA +23 -8
- {followthemoney-3.8.5.dist-info → followthemoney-4.0.1.dist-info}/RECORD +69 -59
- {followthemoney-3.8.5.dist-info → followthemoney-4.0.1.dist-info}/entry_points.txt +1 -0
- followthemoney/offshore.py +0 -48
- followthemoney/rdf.py +0 -9
- followthemoney/schema/Assessment.yaml +0 -32
- followthemoney/schema/Post.yaml +0 -42
- followthemoney/types/iban.py +0 -58
- followthemoney/types/registry.py +0 -52
- {followthemoney-3.8.5.dist-info → followthemoney-4.0.1.dist-info}/WHEEL +0 -0
- {followthemoney-3.8.5.dist-info → followthemoney-4.0.1.dist-info}/licenses/LICENSE +0 -0
followthemoney/__init__.py
CHANGED
@@ -1,16 +1,36 @@
-import os
-
+from followthemoney.entity import ValueEntity, VE
 from followthemoney.model import Model
+from followthemoney.schema import Schema
+from followthemoney.property import Property
+from followthemoney.types import registry
+from followthemoney.value import Value, Values
+from followthemoney.proxy import EntityProxy, E
+from followthemoney.statement import Statement, StatementEntity, SE
+from followthemoney.dataset import Dataset, DefaultDataset, DS
 from followthemoney.util import set_model_locale

-__version__ = "3.8.5"
-
-
-model_path = os.path.dirname(__file__)
-model_path = os.path.join(model_path, "schema")
-model_path = os.environ.get("FTM_MODEL_PATH", model_path)
+__version__ = "4.0.1"

 # Data model singleton
-model = Model(model_path)
+model = Model.instance()

-__all__ = [
+__all__ = [
+    "model",
+    "set_model_locale",
+    "Model",
+    "Schema",
+    "Property",
+    "Value",
+    "Values",
+    "EntityProxy",
+    "E",
+    "registry",
+    "Dataset",
+    "DefaultDataset",
+    "DS",
+    "Statement",
+    "StatementEntity",
+    "SE",
+    "ValueEntity",
+    "VE",
+]
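In 4.x the package root becomes the primary import surface: the data model singleton is obtained via Model.instance() instead of being constructed from a schema directory path, and the entity, statement and dataset types are re-exported at the top level. A minimal usage sketch based on the exports above (the Person data is illustrative, and it assumes familiar 3.x helpers such as Model.get() and EntityProxy.add() behave unchanged):

import followthemoney as ftm

# The model singleton replaces the old Model(model_path) construction:
person = ftm.model.get("Person")
assert person is not None

# EntityProxy.from_dict() no longer takes a Model argument in 4.x:
entity = ftm.EntityProxy.from_dict({"id": "ent1", "schema": "Person"})
entity.add("name", "Jane Doe")  # sample value, for illustration only
print(entity.to_dict())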
followthemoney/cli/cli.py
CHANGED
@@ -56,7 +56,7 @@ def import_vis(infile: Path, outfile: Path) -> None:
     else:
         raise click.ClickException("No entities found in VIS file")
     for entity_data in ensure_list(entities):
-        entity = EntityProxy.from_dict(model, entity_data)
+        entity = EntityProxy.from_dict(entity_data)
         write_entity(outfh, entity)


followthemoney/cli/exports.py
CHANGED
@@ -6,8 +6,6 @@ from contextlib import contextmanager
 from followthemoney.cli.cli import cli
 from followthemoney.cli.util import InPath, OutPath, export_stream
 from followthemoney.export.csv import CSVExporter
-from followthemoney.export.rdf import RDFExporter
-from followthemoney.export.excel import ExcelExporter
 from followthemoney.export.graph import edge_types, DEFAULT_EDGE_TYPES
 from followthemoney.export.graph import NXGraphExporter
 from followthemoney.export.neo4j import Neo4JCSVExporter

@@ -46,6 +44,9 @@ def export_csv(infile: Path, outdir: Path) -> None:
     required=True,
 )
 def export_excel(infile: Path, outfile: Path) -> None:
+    # Lazy load openpyxl
+    from followthemoney.export.excel import ExcelExporter
+
     exporter = ExcelExporter(outfile)
     export_stream(exporter, infile)

@@ -60,6 +61,9 @@ def export_excel(infile: Path, outfile: Path) -> None:
     help="Generate full predicates",
 )
 def export_rdf(infile: Path, outfile: Path, qualified: bool = True) -> None:
+    # Lazy load rdflib
+    from followthemoney.export.rdf import RDFExporter
+
     with text_out(outfile) as fh:
         exporter = RDFExporter(fh, qualified=qualified)
         export_stream(exporter, infile)
followthemoney/cli/statement.py
ADDED
@@ -0,0 +1,62 @@
+import click
+from pathlib import Path
+from typing import Generator, List
+
+
+from followthemoney.cli.cli import cli
+from followthemoney.cli.util import InPath, OutPath
+from followthemoney.cli.util import path_entities, write_entity, path_writer
+from followthemoney.dataset import Dataset, DefaultDataset
+from followthemoney.statement import Statement, StatementEntity
+from followthemoney.statement import FORMATS, CSV
+from followthemoney.statement import write_statements
+from followthemoney.statement import read_path_statements
+
+
+@cli.command("statements", help="Export entities to statements")
+@click.argument("path", type=InPath)
+@click.option("-o", "--outpath", type=OutPath, default="-")
+@click.option("-d", "--dataset", type=str, required=True)
+@click.option("-f", "--format", type=click.Choice(FORMATS), default=CSV)
+def entity_statements(path: Path, outpath: Path, dataset: str, format: str) -> None:
+    def make_statements() -> Generator[Statement, None, None]:
+        for entity in path_entities(path, StatementEntity):
+            yield from Statement.from_entity(entity, dataset=dataset)
+
+    with path_writer(outpath) as outfh:
+        write_statements(outfh, format, make_statements())
+
+
+@cli.command("format-statements", help="Convert entity data formats")
+@click.option("-i", "--infile", type=InPath, default="-")
+@click.option("-o", "--outpath", type=OutPath, default="-")
+@click.option("-f", "--in-format", type=click.Choice(FORMATS), default=CSV)
+@click.option("-x", "--out-format", type=click.Choice(FORMATS), default=CSV)
+def format_statements(
+    infile: Path, outpath: Path, in_format: str, out_format: str
+) -> None:
+    statements = read_path_statements(infile, format=in_format)
+    with path_writer(outpath) as outfh:
+        write_statements(outfh, out_format, statements)
+
+
+@cli.command("aggregate-statements", help="Roll up statements into entities")
+@click.option("-i", "--infile", type=InPath, default="-")
+@click.option("-o", "--outpath", type=OutPath, default="-")
+@click.option("-d", "--dataset", type=str, default=DefaultDataset.name)
+@click.option("-f", "--format", type=click.Choice(FORMATS), default=CSV)
+def statements_aggregate(
+    infile: Path, outpath: Path, dataset: str, format: str
+) -> None:
+    dataset_ = Dataset.make({"name": dataset, "title": dataset})
+    with path_writer(outpath) as outfh:
+        statements: List[Statement] = []
+        for stmt in read_path_statements(infile, format=format):
+            if len(statements) and statements[0].canonical_id != stmt.canonical_id:
+                entity = StatementEntity.from_statements(dataset_, statements)
+                write_entity(outfh, entity)
+                statements = []
+            statements.append(stmt)
+        if len(statements):
+            entity = StatementEntity.from_statements(dataset_, statements)
+            write_entity(outfh, entity)
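The new commands convert between entity streams and per-property Statement rows; aggregate-statements assumes its input is sorted by canonical_id and flushes an entity each time the id changes. A sketch of the same round trip in Python, using only the calls that appear above (the roundtrip helper itself is hypothetical):

from followthemoney.dataset import Dataset
from followthemoney.statement import Statement, StatementEntity

dataset = Dataset.make({"name": "demo", "title": "Demo"})

def roundtrip(entity: StatementEntity) -> StatementEntity:
    # One Statement per property value, then rebuild the entity --
    # the same operation the two commands perform via files.
    stmts = list(Statement.from_entity(entity, dataset=dataset.name))
    return StatementEntity.from_statements(dataset, stmts)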
followthemoney/cli/util.py
CHANGED
@@ -9,7 +9,6 @@ from warnings import warn
 from typing import Any, BinaryIO, Generator, Optional, TextIO, Type
 from banal import is_mapping, is_listish, ensure_list

-from followthemoney import model
 from followthemoney.export.common import Exporter
 from followthemoney.proxy import E, EntityProxy
 from followthemoney.util import MEGABYTE, PathLike

@@ -39,7 +38,7 @@ def write_entity(fh: BinaryIO, entity: E) -> None:

 def _read_one(data: Any, cleaned: bool = True) -> Generator[EntityProxy, None, None]:
     if is_mapping(data) and "schema" in data:
-        yield model.get_proxy(data, cleaned=cleaned)
+        yield EntityProxy.from_dict(data, cleaned=cleaned)


 def read_entities(

@@ -79,7 +78,7 @@ def binary_entities(
 ) -> Generator[E, None, None]:
     while line := fh.readline(max_line):
         data = orjson.loads(line)
-        yield entity_type.from_dict(model, data, cleaned=cleaned)
+        yield entity_type.from_dict(data, cleaned=cleaned)


 def path_entities(
followthemoney/compare.py
CHANGED
@@ -1,10 +1,11 @@
 import math
-import itertools
+from itertools import islice, product
 from typing import Dict, Generator, Iterable, List, Optional
-import fingerprints
 from normality import normalize
+from rigour.names import tokenize_name, remove_person_prefixes
+from rigour.names import replace_org_types_compare
 from followthemoney.exc import InvalidData
-from followthemoney.model import Model
+from followthemoney.schema import Schema
 from followthemoney.types import registry
 from followthemoney.proxy import EntityProxy
 from followthemoney.types.common import PropertyType

@@ -21,16 +22,15 @@ COMPARE_WEIGHTS: Weights = {
     registry.address: 6.456137299747168,
     registry.phone: 3.538892687331418,
     registry.email: 14.115925628770384,
-    registry.iban: 0.019140301711998726,
     registry.url: 3.211995327345834,
     None: -11.91521189545115,
 }


-def compare_scores(model: Model, left: EntityProxy, right: EntityProxy) -> Scores:
+def compare_scores(left: EntityProxy, right: EntityProxy) -> Scores:
     """Compare two entities and return a match score for each property."""
     try:
-        model.common_schema(left.schema, right.schema)
+        common = left.schema.model.common_schema(left.schema, right.schema)
     except InvalidData:
         return {}
     scores: Scores = {}

@@ -42,7 +42,7 @@ def compare_scores(model: Model, left: EntityProxy, right: EntityProxy) -> Scores:
         group = registry.groups[group_name]
         try:
             if group == registry.name:
-                score = compare_names(left, right)
+                score = compare_names(common, left, right)
             elif group == registry.country:
                 score = compare_countries(left, right)
             else:

@@ -71,28 +71,38 @@ def _compare(scores: Scores, weights: Weights, n_std: int = 1) -> float:


 def compare(
-    model: Model,
     left: EntityProxy,
     right: EntityProxy,
     weights: Weights = COMPARE_WEIGHTS,
 ) -> float:
     """Compare two entities and return a match score."""
-    scores = compare_scores(model, left, right)
+    scores = compare_scores(left, right)
     return _compare(scores, weights)


-def _normalize_names(names: Iterable[str]) -> Generator[str, None, None]:
+def _normalize_names(
+    schema: Schema, names: Iterable[str]
+) -> Generator[str, None, None]:
     """Generate a sequence of comparable names for an entity. This also
-    generates a
+    generates a fingerprint, i.e. a version of the name where all tokens
     are sorted alphabetically, and some parts, such as company suffixes,
     have been removed."""
     seen = set()
+    can_person = schema.is_a("LegalEntity") and not schema.is_a("Organization")
+    can_org = schema.is_a("LegalEntity") and not schema.is_a("Person")
     for name in names:
         plain = normalize(name, ascii=True)
         if plain is not None and plain not in seen:
             seen.add(plain)
             yield plain
-        fp = fingerprints.generate(name)
+        if not can_org and not can_person:
+            continue
+        if can_person:
+            name = remove_person_prefixes(name)
+        if can_org:
+            name = replace_org_types_compare(name)
+        tokens = tokenize_name(name.lower())
+        fp = " ".join(sorted(tokens))
         if fp is not None and len(fp) > 6 and fp not in seen:
             seen.add(fp)
             yield fp

@@ -109,16 +119,16 @@ def compare_group(


 def compare_names(
-    left: EntityProxy, right: EntityProxy, max_names: int = 200
+    common: Schema, left: EntityProxy, right: EntityProxy, max_names: int = 200
 ) -> Optional[float]:
     result = 0.0
-    left_list = list(itertools.islice(_normalize_names(left.names), max_names))
-    right_list = list(itertools.islice(_normalize_names(right.names), max_names))
+    left_list = list(islice(_normalize_names(common, left.names), max_names))
+    right_list = list(islice(_normalize_names(common, right.names), max_names))
     if not left_list and not right_list:
         raise ValueError("At least one proxy must have name properties")
     elif not left_list or not right_list:
         return None
-    for left_val, right_val in itertools.product(left_list, right_list):
+    for left_val, right_val in product(left_list, right_list):
         similarity = registry.name.compare(left_val, right_val)
         result = max(result, similarity)
         if result == 1.0:
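compare() and compare_scores() no longer take a Model argument: the common schema is resolved through left.schema.model, and name fingerprinting moves from the fingerprints package to rigour.names, applying person- or organization-specific normalization depending on the common schema. A short sketch of the new call shape (the entity dicts are illustrative):

from followthemoney import EntityProxy
from followthemoney.compare import compare

left = EntityProxy.from_dict(
    {"id": "a", "schema": "Person", "properties": {"name": ["Jane Doe"]}}
)
right = EntityProxy.from_dict(
    {"id": "b", "schema": "Person", "properties": {"name": ["Jane M. Doe"]}}
)
score = compare(left, right)  # no Model argument in 4.x
print(score)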
followthemoney/dataset/__init__.py
ADDED
@@ -0,0 +1,17 @@
+from followthemoney.dataset.dataset import Dataset, DS
+from followthemoney.dataset.catalog import DataCatalog
+from followthemoney.dataset.resource import DataResource
+from followthemoney.dataset.publisher import DataPublisher
+from followthemoney.dataset.coverage import DataCoverage
+
+DefaultDataset = Dataset.make({"name": "default"})
+
+__all__ = [
+    "Dataset",
+    "DefaultDataset",
+    "DataCatalog",
+    "DataResource",
+    "DataPublisher",
+    "DataCoverage",
+    "DS",
+]
followthemoney/dataset/catalog.py
ADDED
@@ -0,0 +1,77 @@
+import yaml
+from typing import Optional, Dict, Any, Generic, Set, Type, List
+
+from followthemoney.types import registry
+from followthemoney.dataset.dataset import DS
+from followthemoney.exc import MetadataException
+from followthemoney.util import PathLike
+
+
+class DataCatalog(Generic[DS]):
+    """A data catalog is a collection of datasets. It provides methods for retrieving or
+    creating datasets, and for checking if a dataset exists in the catalog."""
+
+    def __init__(self, dataset_type: Type[DS], data: Dict[str, Any]) -> None:
+        self.dataset_type = dataset_type
+        self.datasets: List[DS] = []
+        for ddata in data.get("datasets", []):
+            self.make_dataset(ddata)
+        self.updated_at: Optional[str] = None
+        if "updated_at" in data:
+            raw = data.get("updated_at")
+            self.updated_at = registry.date.clean(raw)
+            if self.updated_at is None:
+                raise MetadataException("Invalid update date: %r" % raw)
+
+    def add(self, dataset: "DS") -> None:
+        """Add a dataset to the catalog. If the dataset already exists, it will be updated."""
+        for existing in self.datasets:
+            if existing.name in dataset.model.children:
+                dataset.children.add(existing)
+            if dataset.name in existing.model.children:
+                existing.children.add(dataset)
+        self.datasets.append(dataset)
+
+    def make_dataset(self, data: Dict[str, Any]) -> "DS":
+        """Create a new dataset from the given data. If a dataset with the same name already
+        exists, it will be updated."""
+        dataset = self.dataset_type(data)
+        self.add(dataset)
+        return dataset
+
+    def get(self, name: str) -> Optional["DS"]:
+        """Get a dataset by name. Returns None if the dataset does not exist."""
+        for ds in self.datasets:
+            if ds.name == name:
+                return ds
+        return None
+
+    def require(self, name: str) -> "DS":
+        """Get a dataset by name. Raises MetadataException if the dataset does not exist."""
+        dataset = self.get(name)
+        if dataset is None:
+            raise MetadataException("No such dataset: %s" % name)
+        return dataset
+
+    def has(self, name: str) -> bool:
+        """Check if a dataset exists in the catalog."""
+        return name in self.names
+
+    @property
+    def names(self) -> Set[str]:
+        """Get the names of all datasets in the catalog."""
+        return {d.name for d in self.datasets}
+
+    def __repr__(self) -> str:  # pragma: no cover
+        return f"<DataCatalog[{self.dataset_type.__name__}]({self.names!r})>"
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            "datasets": [d.to_dict() for d in self.datasets],
+            "updated_at": self.updated_at,
+        }
+
+    @classmethod
+    def from_path(cls, dataset_type: Type[DS], path: PathLike) -> "DataCatalog[DS]":
+        with open(path, "r") as fh:
+            return cls(dataset_type, yaml.safe_load(fh))
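DataCatalog.add() links the hierarchy in both directions, matching the incoming dataset's declared children against existing datasets and vice versa, so registration order does not matter. A small sketch (the dataset names are illustrative):

from followthemoney.dataset import DataCatalog, Dataset

catalog = DataCatalog(Dataset, {})
catalog.make_dataset({"name": "sanctions", "title": "Sanctions"})
# "title" defaults to the name via the model validator:
coll = catalog.make_dataset({"name": "all", "children": ["sanctions"]})

assert catalog.has("sanctions")
assert coll.is_collection  # declared children make it a collection
assert catalog.require("all") == coll  # Dataset.__eq__ compares names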
followthemoney/dataset/coverage.py
ADDED
@@ -0,0 +1,29 @@
+from typing import List, Literal, Optional, TypeAlias
+from pydantic import BaseModel
+
+from followthemoney.dataset.util import CountryCode, PartialDate
+
+
+# Derived from Aleph
+FREQUENCY_TYPE: TypeAlias = Literal[
+    "unknown",
+    "never",
+    "hourly",
+    "daily",
+    "weekly",
+    "monthly",
+    "annually",
+]
+
+
+class DataCoverage(BaseModel):
+    """Details on the temporal and geographic scope of a dataset."""
+
+    start: Optional[PartialDate] = None
+    end: Optional[PartialDate] = None
+    countries: List[CountryCode] = []
+    frequency: FREQUENCY_TYPE = "unknown"
+    schedule: Optional[str] = None
+
+    def __repr__(self) -> str:
+        return f"<DataCoverage({self.start!r}, {self.end!r}, {self.countries!r})>"
followthemoney/dataset/dataset.py
ADDED
@@ -0,0 +1,146 @@
+import yaml
+import logging
+from functools import cached_property
+from typing import TYPE_CHECKING
+from typing_extensions import Self
+from typing import Any, Dict, List, Optional, Set, Type, TypeVar
+from pydantic import BaseModel, field_validator, model_validator
+
+from followthemoney.dataset.coverage import DataCoverage
+from followthemoney.dataset.publisher import DataPublisher
+from followthemoney.dataset.resource import DataResource
+from followthemoney.dataset.util import Url, DateTimeISO, dataset_name_check
+from followthemoney.util import PathLike
+
+if TYPE_CHECKING:
+    from followthemoney.dataset.catalog import DataCatalog
+
+DS = TypeVar("DS", bound="Dataset")
+
+log = logging.getLogger(__name__)
+
+
+class DatasetModel(BaseModel):
+    name: str
+    title: str
+    license: Optional[Url] = None
+    summary: Optional[str] = None
+    description: Optional[str] = None
+    url: Optional[Url] = None
+    updated_at: Optional[DateTimeISO] = None
+    last_export: Optional[DateTimeISO] = None
+    entity_count: Optional[int] = None
+    thing_count: Optional[int] = None
+    version: Optional[str] = None
+    category: Optional[str] = None
+    tags: List[str] = []
+    publisher: DataPublisher | None = None
+    coverage: DataCoverage | None = None
+    resources: List[DataResource] = []
+    children: Set[str] = set()
+
+    @field_validator("name", mode="after")
+    @classmethod
+    def check_name(cls, value: str) -> str:
+        return dataset_name_check(value)
+
+    @model_validator(mode="before")
+    @classmethod
+    def ensure_data(cls, data: Any) -> Any:
+        if isinstance(data, dict):
+            if "name" not in data:
+                raise ValueError("Missing dataset name")
+            data["title"] = data.get("title", data["name"])
+            children = set(data.get("children", []))
+            children.update(data.get("datasets", []))
+            children.update(data.get("scopes", []))
+            data["children"] = children
+        return data
+
+    def get_resource(self, name: str) -> DataResource:
+        for res in self.resources:
+            if res.name == name:
+                return res
+        raise ValueError("No resource named %r!" % name)
+
+
+class Dataset:
+    """A container for entities, often from one source or related to one topic.
+    A dataset is a set of data, sez W3C."""
+
+    Model = DatasetModel
+
+    def __init__(self: Self, data: Dict[str, Any]) -> None:
+        self.model = self.Model.model_validate(data)
+        self.name = self.model.name
+        self.children: Set[Self] = set()
+
+    @cached_property
+    def is_collection(self: Self) -> bool:
+        return len(self.model.children) > 0
+
+    @property
+    def datasets(self: Self) -> Set[Self]:
+        current: Set[Self] = set([self])
+        for child in self.children:
+            current.update(child.datasets)
+        return current
+
+    @property
+    def dataset_names(self: Self) -> List[str]:
+        return [d.name for d in self.datasets]
+
+    @property
+    def leaves(self: Self) -> Set[Self]:
+        """All contained datasets which are not collections (can be 'self')."""
+        return set([d for d in self.datasets if not d.is_collection])
+
+    @property
+    def leaf_names(self: Self) -> Set[str]:
+        return {d.name for d in self.leaves}
+
+    def __hash__(self) -> int:
+        return hash(repr(self))
+
+    def __repr__(self) -> str:
+        if not hasattr(self, "name"):
+            return "<Dataset>"
+        return f"<Dataset({self.name})>"  # pragma: no cover
+
+    def get_resource(self, name: str) -> DataResource:
+        for res in self.model.resources:
+            if res.name == name:
+                return res
+        raise ValueError("No resource named %r!" % name)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert the dataset to a dictionary representation."""
+        return self.model.model_dump(mode="json", exclude_none=True)
+
+    @classmethod
+    def from_path(
+        cls: Type[DS], path: PathLike, catalog: Optional["DataCatalog[DS]"] = None
+    ) -> DS:
+        from followthemoney.dataset.catalog import DataCatalog
+
+        with open(path, "r") as fh:
+            data = yaml.safe_load(fh)
+            if catalog is None:
+                catalog = DataCatalog(cls, {})
+            return catalog.make_dataset(data)
+
+    @classmethod
+    def make(cls: Type[DS], data: Dict[str, Any]) -> DS:
+        from followthemoney.dataset.catalog import DataCatalog
+
+        catalog = DataCatalog(cls, {})
+        return catalog.make_dataset(data)
+
+    def __eq__(self, other: Any) -> bool:
+        try:
+            return not not self.name == other.name
+        except AttributeError:
+            return False
+
+    def __lt__(self, other: Any) -> bool:
+        return self.name.__lt__(other.name)
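Dataset.datasets walks the resolved child links recursively (including self), and leaves keeps only the non-collections, so a dataset without declared children is its own single leaf. A sketch (the dataset name is illustrative):

from followthemoney.dataset import Dataset

ds = Dataset.make({"name": "acme_register", "title": "ACME Register"})
assert not ds.is_collection  # no declared children
assert ds.dataset_names == ["acme_register"]
assert ds.leaf_names == {"acme_register"}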
followthemoney/dataset/publisher.py
ADDED
@@ -0,0 +1,25 @@
+from typing import Optional
+
+from pydantic import BaseModel
+
+from followthemoney.dataset.util import CountryCode, Url
+from followthemoney.types import registry
+
+
+class DataPublisher(BaseModel):
+    """Publisher information, eg. the government authority."""
+
+    name: str
+    url: Optional[Url] = None
+    name_en: Optional[str] = None
+    acronym: Optional[str] = None
+    description: Optional[str] = None
+    country: Optional[CountryCode] = None
+    official: Optional[bool] = False
+    logo_url: Optional[Url] = None
+
+    @property
+    def country_label(self) -> Optional[str]:
+        if self.country is None:
+            return None
+        return registry.country.caption(self.country)
followthemoney/dataset/resource.py
ADDED
@@ -0,0 +1,30 @@
+from typing import Optional
+from pydantic import BaseModel, field_validator
+
+from followthemoney.dataset.util import Url, DateTimeISO
+from followthemoney.types import registry
+
+
+class DataResource(BaseModel):
+    """A downloadable resource that is part of a dataset."""
+
+    name: str
+    url: Optional[Url] = None
+    checksum: Optional[str] = None
+    timestamp: Optional[DateTimeISO] = None
+    mime_type: Optional[str] = None
+    title: Optional[str] = None
+    size: Optional[int] = None
+
+    @field_validator("mime_type", mode="after")
+    @classmethod
+    def ensure_mime_type(cls, value: str) -> Optional[str]:
+        if not registry.mimetype.validate(value):
+            raise ValueError(f"Invalid MIME type: {value!r}")
+        return value
+
+    @property
+    def mime_type_label(self) -> Optional[str]:
+        if self.mime_type is None:
+            return None
+        return registry.mimetype.caption(self.mime_type)
followthemoney/dataset/util.py
ADDED
@@ -0,0 +1,55 @@
+from datetime import datetime
+from normality import slugify
+from typing import Annotated, Any
+from rigour.time import datetime_iso
+from pydantic import AfterValidator, BeforeValidator, HttpUrl, PlainSerializer
+
+from followthemoney.types import registry
+
+
+def dataset_name_check(value: str) -> str:
+    """Check that the given value is a valid dataset name. This doesn't convert
+    or clean invalid names, but raises an error if they are not compliant to
+    force the user to fix an invalid name"""
+    if slugify(value, sep="_") != value:
+        raise ValueError("Invalid %s: %r" % ("dataset name", value))
+    return value
+
+
+def type_check_date(value: Any) -> str:
+    """Check that the given value is a valid date string."""
+    cleaned = registry.date.clean(value)
+    if cleaned is None:
+        raise ValueError("Invalid date: %r" % value)
+    return cleaned
+
+
+PartialDate = Annotated[str, BeforeValidator(type_check_date)]
+
+
+def type_check_country(value: Any) -> str:
+    """Check that the given value is a valid country code."""
+    cleaned = registry.country.clean(value)
+    if cleaned is None:
+        raise ValueError("Invalid country code: %r" % value)
+    return cleaned
+
+
+CountryCode = Annotated[str, BeforeValidator(type_check_country)]
+
+
+def type_check_http_url(v: str) -> str:
+    url = HttpUrl(v)
+    return str(url)
+
+
+Url = Annotated[str, AfterValidator(type_check_http_url)]
+
+
+def serialize_dt(dt: datetime) -> str:
+    text = datetime_iso(dt)
+    assert text is not None, "Invalid datetime: %r" % dt
+    return text
+
+
+DateTimeISO = Annotated[datetime, PlainSerializer(serialize_dt)]
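These Annotated aliases let any pydantic model validate fields through followthemoney's type registry: the BeforeValidator runs the clean()-based check before pydantic handles the value, and DateTimeISO serializes back to an ISO string on dump. A sketch in a standalone model (Event is hypothetical, not part of the package):

from pydantic import BaseModel

from followthemoney.dataset.util import CountryCode, PartialDate

class Event(BaseModel):  # hypothetical example model
    country: CountryCode
    date: PartialDate

evt = Event(country="de", date="2024-01")
print(evt.country, evt.date)  # values as cleaned by the registry
# Event(country="no-such-country", date="2024-01") raises a ValidationError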