followthemoney 3.8.5__py3-none-any.whl → 4.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. followthemoney/__init__.py +30 -10
  2. followthemoney/cli/cli.py +1 -1
  3. followthemoney/cli/exports.py +6 -2
  4. followthemoney/cli/statement.py +62 -0
  5. followthemoney/cli/util.py +2 -3
  6. followthemoney/compare.py +26 -16
  7. followthemoney/dataset/__init__.py +17 -0
  8. followthemoney/dataset/catalog.py +77 -0
  9. followthemoney/dataset/coverage.py +29 -0
  10. followthemoney/dataset/dataset.py +146 -0
  11. followthemoney/dataset/publisher.py +25 -0
  12. followthemoney/dataset/resource.py +30 -0
  13. followthemoney/dataset/util.py +55 -0
  14. followthemoney/entity.py +73 -0
  15. followthemoney/exc.py +6 -0
  16. followthemoney/export/rdf.py +57 -5
  17. followthemoney/graph.py +1 -2
  18. followthemoney/model.py +38 -11
  19. followthemoney/names.py +33 -0
  20. followthemoney/ontology.py +18 -16
  21. followthemoney/property.py +12 -15
  22. followthemoney/proxy.py +43 -64
  23. followthemoney/schema/Analyzable.yaml +2 -3
  24. followthemoney/schema/BankAccount.yaml +2 -3
  25. followthemoney/schema/Company.yaml +0 -6
  26. followthemoney/schema/Contract.yaml +0 -1
  27. followthemoney/schema/CryptoWallet.yaml +1 -1
  28. followthemoney/schema/Document.yaml +0 -6
  29. followthemoney/schema/Interval.yaml +7 -0
  30. followthemoney/schema/LegalEntity.yaml +6 -0
  31. followthemoney/schema/License.yaml +2 -0
  32. followthemoney/schema/Page.yaml +0 -1
  33. followthemoney/schema/Person.yaml +0 -5
  34. followthemoney/schema/Sanction.yaml +1 -0
  35. followthemoney/schema/Thing.yaml +0 -2
  36. followthemoney/schema/UserAccount.yaml +6 -3
  37. followthemoney/schema.py +30 -42
  38. followthemoney/statement/__init__.py +19 -0
  39. followthemoney/statement/entity.py +438 -0
  40. followthemoney/statement/serialize.py +251 -0
  41. followthemoney/statement/statement.py +256 -0
  42. followthemoney/statement/util.py +31 -0
  43. followthemoney/types/__init__.py +66 -23
  44. followthemoney/types/address.py +3 -3
  45. followthemoney/types/checksum.py +3 -7
  46. followthemoney/types/common.py +9 -14
  47. followthemoney/types/country.py +3 -7
  48. followthemoney/types/date.py +21 -11
  49. followthemoney/types/email.py +0 -4
  50. followthemoney/types/entity.py +5 -11
  51. followthemoney/types/gender.py +6 -10
  52. followthemoney/types/identifier.py +9 -3
  53. followthemoney/types/ip.py +5 -9
  54. followthemoney/types/json.py +2 -2
  55. followthemoney/types/language.py +3 -7
  56. followthemoney/types/mimetype.py +4 -8
  57. followthemoney/types/name.py +7 -8
  58. followthemoney/types/number.py +88 -6
  59. followthemoney/types/phone.py +4 -11
  60. followthemoney/types/string.py +4 -4
  61. followthemoney/types/topic.py +3 -7
  62. followthemoney/types/url.py +5 -10
  63. followthemoney/util.py +12 -13
  64. followthemoney/value.py +67 -0
  65. {followthemoney-3.8.5.dist-info → followthemoney-4.0.1.dist-info}/METADATA +23 -8
  66. {followthemoney-3.8.5.dist-info → followthemoney-4.0.1.dist-info}/RECORD +69 -59
  67. {followthemoney-3.8.5.dist-info → followthemoney-4.0.1.dist-info}/entry_points.txt +1 -0
  68. followthemoney/offshore.py +0 -48
  69. followthemoney/rdf.py +0 -9
  70. followthemoney/schema/Assessment.yaml +0 -32
  71. followthemoney/schema/Post.yaml +0 -42
  72. followthemoney/types/iban.py +0 -58
  73. followthemoney/types/registry.py +0 -52
  74. {followthemoney-3.8.5.dist-info → followthemoney-4.0.1.dist-info}/WHEEL +0 -0
  75. {followthemoney-3.8.5.dist-info → followthemoney-4.0.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,16 +1,36 @@
1
- import os
2
-
1
+ from followthemoney.entity import ValueEntity, VE
3
2
  from followthemoney.model import Model
3
+ from followthemoney.schema import Schema
4
+ from followthemoney.property import Property
5
+ from followthemoney.types import registry
6
+ from followthemoney.value import Value, Values
7
+ from followthemoney.proxy import EntityProxy, E
8
+ from followthemoney.statement import Statement, StatementEntity, SE
9
+ from followthemoney.dataset import Dataset, DefaultDataset, DS
4
10
  from followthemoney.util import set_model_locale
5
11
 
6
- __version__ = "3.8.5"
7
-
8
-
9
- model_path = os.path.dirname(__file__)
10
- model_path = os.path.join(model_path, "schema")
11
- model_path = os.environ.get("FTM_MODEL_PATH", model_path)
12
+ __version__ = "4.0.1"
12
13
 
13
14
  # Data model singleton
14
- model = Model(model_path)
15
+ model = Model.instance()
15
16
 
16
- __all__ = ["model", "set_model_locale"]
17
+ __all__ = [
18
+ "model",
19
+ "set_model_locale",
20
+ "Model",
21
+ "Schema",
22
+ "Property",
23
+ "Value",
24
+ "Values",
25
+ "EntityProxy",
26
+ "E",
27
+ "registry",
28
+ "Dataset",
29
+ "DefaultDataset",
30
+ "DS",
31
+ "Statement",
32
+ "StatementEntity",
33
+ "SE",
34
+ "ValueEntity",
35
+ "VE",
36
+ ]
followthemoney/cli/cli.py CHANGED
@@ -56,7 +56,7 @@ def import_vis(infile: Path, outfile: Path) -> None:
56
56
  else:
57
57
  raise click.ClickException("No entities found in VIS file")
58
58
  for entity_data in ensure_list(entities):
59
- entity = EntityProxy.from_dict(model, entity_data)
59
+ entity = EntityProxy.from_dict(entity_data)
60
60
  write_entity(outfh, entity)
61
61
 
62
62
 
@@ -6,8 +6,6 @@ from contextlib import contextmanager
6
6
  from followthemoney.cli.cli import cli
7
7
  from followthemoney.cli.util import InPath, OutPath, export_stream
8
8
  from followthemoney.export.csv import CSVExporter
9
- from followthemoney.export.rdf import RDFExporter
10
- from followthemoney.export.excel import ExcelExporter
11
9
  from followthemoney.export.graph import edge_types, DEFAULT_EDGE_TYPES
12
10
  from followthemoney.export.graph import NXGraphExporter
13
11
  from followthemoney.export.neo4j import Neo4JCSVExporter
@@ -46,6 +44,9 @@ def export_csv(infile: Path, outdir: Path) -> None:
46
44
  required=True,
47
45
  )
48
46
  def export_excel(infile: Path, outfile: Path) -> None:
47
+ # Lazy load openpyxl
48
+ from followthemoney.export.excel import ExcelExporter
49
+
49
50
  exporter = ExcelExporter(outfile)
50
51
  export_stream(exporter, infile)
51
52
 
@@ -60,6 +61,9 @@ def export_excel(infile: Path, outfile: Path) -> None:
60
61
  help="Generate full predicates",
61
62
  )
62
63
  def export_rdf(infile: Path, outfile: Path, qualified: bool = True) -> None:
64
+ # Lazy load rdflib
65
+ from followthemoney.export.rdf import RDFExporter
66
+
63
67
  with text_out(outfile) as fh:
64
68
  exporter = RDFExporter(fh, qualified=qualified)
65
69
  export_stream(exporter, infile)
@@ -0,0 +1,62 @@
1
+ import click
2
+ from pathlib import Path
3
+ from typing import Generator, List
4
+
5
+
6
+ from followthemoney.cli.cli import cli
7
+ from followthemoney.cli.util import InPath, OutPath
8
+ from followthemoney.cli.util import path_entities, write_entity, path_writer
9
+ from followthemoney.dataset import Dataset, DefaultDataset
10
+ from followthemoney.statement import Statement, StatementEntity
11
+ from followthemoney.statement import FORMATS, CSV
12
+ from followthemoney.statement import write_statements
13
+ from followthemoney.statement import read_path_statements
14
+
15
+
16
+ @cli.command("statements", help="Export entities to statements")
17
+ @click.argument("path", type=InPath)
18
+ @click.option("-o", "--outpath", type=OutPath, default="-")
19
+ @click.option("-d", "--dataset", type=str, required=True)
20
+ @click.option("-f", "--format", type=click.Choice(FORMATS), default=CSV)
21
+ def entity_statements(path: Path, outpath: Path, dataset: str, format: str) -> None:
22
+ def make_statements() -> Generator[Statement, None, None]:
23
+ for entity in path_entities(path, StatementEntity):
24
+ yield from Statement.from_entity(entity, dataset=dataset)
25
+
26
+ with path_writer(outpath) as outfh:
27
+ write_statements(outfh, format, make_statements())
28
+
29
+
30
+ @cli.command("format-statements", help="Convert entity data formats")
31
+ @click.option("-i", "--infile", type=InPath, default="-")
32
+ @click.option("-o", "--outpath", type=OutPath, default="-")
33
+ @click.option("-f", "--in-format", type=click.Choice(FORMATS), default=CSV)
34
+ @click.option("-x", "--out-format", type=click.Choice(FORMATS), default=CSV)
35
+ def format_statements(
36
+ infile: Path, outpath: Path, in_format: str, out_format: str
37
+ ) -> None:
38
+ statements = read_path_statements(infile, format=in_format)
39
+ with path_writer(outpath) as outfh:
40
+ write_statements(outfh, out_format, statements)
41
+
42
+
43
+ @cli.command("aggregate-statements", help="Roll up statements into entities")
44
+ @click.option("-i", "--infile", type=InPath, default="-")
45
+ @click.option("-o", "--outpath", type=OutPath, default="-")
46
+ @click.option("-d", "--dataset", type=str, default=DefaultDataset.name)
47
+ @click.option("-f", "--format", type=click.Choice(FORMATS), default=CSV)
48
+ def statements_aggregate(
49
+ infile: Path, outpath: Path, dataset: str, format: str
50
+ ) -> None:
51
+ dataset_ = Dataset.make({"name": dataset, "title": dataset})
52
+ with path_writer(outpath) as outfh:
53
+ statements: List[Statement] = []
54
+ for stmt in read_path_statements(infile, format=format):
55
+ if len(statements) and statements[0].canonical_id != stmt.canonical_id:
56
+ entity = StatementEntity.from_statements(dataset_, statements)
57
+ write_entity(outfh, entity)
58
+ statements = []
59
+ statements.append(stmt)
60
+ if len(statements):
61
+ entity = StatementEntity.from_statements(dataset_, statements)
62
+ write_entity(outfh, entity)
@@ -9,7 +9,6 @@ from warnings import warn
9
9
  from typing import Any, BinaryIO, Generator, Optional, TextIO, Type
10
10
  from banal import is_mapping, is_listish, ensure_list
11
11
 
12
- from followthemoney import model
13
12
  from followthemoney.export.common import Exporter
14
13
  from followthemoney.proxy import E, EntityProxy
15
14
  from followthemoney.util import MEGABYTE, PathLike
@@ -39,7 +38,7 @@ def write_entity(fh: BinaryIO, entity: E) -> None:
39
38
 
40
39
  def _read_one(data: Any, cleaned: bool = True) -> Generator[EntityProxy, None, None]:
41
40
  if is_mapping(data) and "schema" in data:
42
- yield model.get_proxy(data, cleaned=cleaned)
41
+ yield EntityProxy.from_dict(data, cleaned=cleaned)
43
42
 
44
43
 
45
44
  def read_entities(
@@ -79,7 +78,7 @@ def binary_entities(
79
78
  ) -> Generator[E, None, None]:
80
79
  while line := fh.readline(max_line):
81
80
  data = orjson.loads(line)
82
- yield entity_type.from_dict(model, data, cleaned=cleaned)
81
+ yield entity_type.from_dict(data, cleaned=cleaned)
83
82
 
84
83
 
85
84
  def path_entities(
followthemoney/compare.py CHANGED
@@ -1,10 +1,11 @@
1
1
  import math
2
- import itertools
2
+ from itertools import islice, product
3
3
  from typing import Dict, Generator, Iterable, List, Optional
4
- import fingerprints
5
4
  from normality import normalize
5
+ from rigour.names import tokenize_name, remove_person_prefixes
6
+ from rigour.names import replace_org_types_compare
6
7
  from followthemoney.exc import InvalidData
7
- from followthemoney.model import Model
8
+ from followthemoney.schema import Schema
8
9
  from followthemoney.types import registry
9
10
  from followthemoney.proxy import EntityProxy
10
11
  from followthemoney.types.common import PropertyType
@@ -21,16 +22,15 @@ COMPARE_WEIGHTS: Weights = {
21
22
  registry.address: 6.456137299747168,
22
23
  registry.phone: 3.538892687331418,
23
24
  registry.email: 14.115925628770384,
24
- registry.iban: 0.019140301711998726,
25
25
  registry.url: 3.211995327345834,
26
26
  None: -11.91521189545115,
27
27
  }
28
28
 
29
29
 
30
- def compare_scores(model: Model, left: EntityProxy, right: EntityProxy) -> Scores:
30
+ def compare_scores(left: EntityProxy, right: EntityProxy) -> Scores:
31
31
  """Compare two entities and return a match score for each property."""
32
32
  try:
33
- model.common_schema(left.schema, right.schema)
33
+ common = left.schema.model.common_schema(left.schema, right.schema)
34
34
  except InvalidData:
35
35
  return {}
36
36
  scores: Scores = {}
@@ -42,7 +42,7 @@ def compare_scores(model: Model, left: EntityProxy, right: EntityProxy) -> Score
42
42
  group = registry.groups[group_name]
43
43
  try:
44
44
  if group == registry.name:
45
- score = compare_names(left, right)
45
+ score = compare_names(common, left, right)
46
46
  elif group == registry.country:
47
47
  score = compare_countries(left, right)
48
48
  else:
@@ -71,28 +71,38 @@ def _compare(scores: Scores, weights: Weights, n_std: int = 1) -> float:
71
71
 
72
72
 
73
73
  def compare(
74
- model: Model,
75
74
  left: EntityProxy,
76
75
  right: EntityProxy,
77
76
  weights: Weights = COMPARE_WEIGHTS,
78
77
  ) -> float:
79
78
  """Compare two entities and return a match score."""
80
- scores = compare_scores(model, left, right)
79
+ scores = compare_scores(left, right)
81
80
  return _compare(scores, weights)
82
81
 
83
82
 
84
- def _normalize_names(names: Iterable[str]) -> Generator[str, None, None]:
83
+ def _normalize_names(
84
+ schema: Schema, names: Iterable[str]
85
+ ) -> Generator[str, None, None]:
85
86
  """Generate a sequence of comparable names for an entity. This also
86
- generates a `fingerprint`, i.e. a version of the name where all tokens
87
+ generates a fingerprint, i.e. a version of the name where all tokens
87
88
  are sorted alphabetically, and some parts, such as company suffixes,
88
89
  have been removed."""
89
90
  seen = set()
91
+ can_person = schema.is_a("LegalEntity") and not schema.is_a("Organization")
92
+ can_org = schema.is_a("LegalEntity") and not schema.is_a("Person")
90
93
  for name in names:
91
94
  plain = normalize(name, ascii=True)
92
95
  if plain is not None and plain not in seen:
93
96
  seen.add(plain)
94
97
  yield plain
95
- fp = fingerprints.generate(name)
98
+ if not can_org and not can_person:
99
+ continue
100
+ if can_person:
101
+ name = remove_person_prefixes(name)
102
+ if can_org:
103
+ name = replace_org_types_compare(name)
104
+ tokens = tokenize_name(name.lower())
105
+ fp = " ".join(sorted(tokens))
96
106
  if fp is not None and len(fp) > 6 and fp not in seen:
97
107
  seen.add(fp)
98
108
  yield fp
@@ -109,16 +119,16 @@ def compare_group(
109
119
 
110
120
 
111
121
  def compare_names(
112
- left: EntityProxy, right: EntityProxy, max_names: int = 200
122
+ common: Schema, left: EntityProxy, right: EntityProxy, max_names: int = 200
113
123
  ) -> Optional[float]:
114
124
  result = 0.0
115
- left_list = list(itertools.islice(_normalize_names(left.names), max_names))
116
- right_list = list(itertools.islice(_normalize_names(right.names), max_names))
125
+ left_list = list(islice(_normalize_names(common, left.names), max_names))
126
+ right_list = list(islice(_normalize_names(common, right.names), max_names))
117
127
  if not left_list and not right_list:
118
128
  raise ValueError("At least one proxy must have name properties")
119
129
  elif not left_list or not right_list:
120
130
  return None
121
- for (left_val, right_val) in itertools.product(left_list, right_list):
131
+ for left_val, right_val in product(left_list, right_list):
122
132
  similarity = registry.name.compare(left_val, right_val)
123
133
  result = max(result, similarity)
124
134
  if result == 1.0:
@@ -0,0 +1,17 @@
1
+ from followthemoney.dataset.dataset import Dataset, DS
2
+ from followthemoney.dataset.catalog import DataCatalog
3
+ from followthemoney.dataset.resource import DataResource
4
+ from followthemoney.dataset.publisher import DataPublisher
5
+ from followthemoney.dataset.coverage import DataCoverage
6
+
7
+ DefaultDataset = Dataset.make({"name": "default"})
8
+
9
+ __all__ = [
10
+ "Dataset",
11
+ "DefaultDataset",
12
+ "DataCatalog",
13
+ "DataResource",
14
+ "DataPublisher",
15
+ "DataCoverage",
16
+ "DS",
17
+ ]
@@ -0,0 +1,77 @@
1
+ import yaml
2
+ from typing import Optional, Dict, Any, Generic, Set, Type, List
3
+
4
+ from followthemoney.types import registry
5
+ from followthemoney.dataset.dataset import DS
6
+ from followthemoney.exc import MetadataException
7
+ from followthemoney.util import PathLike
8
+
9
+
10
+ class DataCatalog(Generic[DS]):
11
+ """A data catalog is a collection of datasets. It provides methods for retrieving or
12
+ creating datasets, and for checking if a dataset exists in the catalog."""
13
+
14
+ def __init__(self, dataset_type: Type[DS], data: Dict[str, Any]) -> None:
15
+ self.dataset_type = dataset_type
16
+ self.datasets: List[DS] = []
17
+ for ddata in data.get("datasets", []):
18
+ self.make_dataset(ddata)
19
+ self.updated_at: Optional[str] = None
20
+ if "updated_at" in data:
21
+ raw = data.get("updated_at")
22
+ self.updated_at = registry.date.clean(raw)
23
+ if self.updated_at is None:
24
+ raise MetadataException("Invalid update date: %r" % raw)
25
+
26
+ def add(self, dataset: "DS") -> None:
27
+ """Add a dataset to the catalog. If the dataset already exists, it will be updated."""
28
+ for existing in self.datasets:
29
+ if existing.name in dataset.model.children:
30
+ dataset.children.add(existing)
31
+ if dataset.name in existing.model.children:
32
+ existing.children.add(dataset)
33
+ self.datasets.append(dataset)
34
+
35
+ def make_dataset(self, data: Dict[str, Any]) -> "DS":
36
+ """Create a new dataset from the given data. If a dataset with the same name already
37
+ exists, it will be updated."""
38
+ dataset = self.dataset_type(data)
39
+ self.add(dataset)
40
+ return dataset
41
+
42
+ def get(self, name: str) -> Optional["DS"]:
43
+ """Get a dataset by name. Returns None if the dataset does not exist."""
44
+ for ds in self.datasets:
45
+ if ds.name == name:
46
+ return ds
47
+ return None
48
+
49
+ def require(self, name: str) -> "DS":
50
+ """Get a dataset by name. Raises MetadataException if the dataset does not exist."""
51
+ dataset = self.get(name)
52
+ if dataset is None:
53
+ raise MetadataException("No such dataset: %s" % name)
54
+ return dataset
55
+
56
+ def has(self, name: str) -> bool:
57
+ """Check if a dataset exists in the catalog."""
58
+ return name in self.names
59
+
60
+ @property
61
+ def names(self) -> Set[str]:
62
+ """Get the names of all datasets in the catalog."""
63
+ return {d.name for d in self.datasets}
64
+
65
+ def __repr__(self) -> str: # pragma: no cover
66
+ return f"<DataCatalog[{self.dataset_type.__name__}]({self.names!r})>"
67
+
68
+ def to_dict(self) -> Dict[str, Any]:
69
+ return {
70
+ "datasets": [d.to_dict() for d in self.datasets],
71
+ "updated_at": self.updated_at,
72
+ }
73
+
74
+ @classmethod
75
+ def from_path(cls, dataset_type: Type[DS], path: PathLike) -> "DataCatalog[DS]":
76
+ with open(path, "r") as fh:
77
+ return cls(dataset_type, yaml.safe_load(fh))
@@ -0,0 +1,29 @@
1
+ from typing import List, Literal, Optional, TypeAlias
2
+ from pydantic import BaseModel
3
+
4
+ from followthemoney.dataset.util import CountryCode, PartialDate
5
+
6
+
7
+ # Derived from Aleph
8
+ FREQUENCY_TYPE: TypeAlias = Literal[
9
+ "unknown",
10
+ "never",
11
+ "hourly",
12
+ "daily",
13
+ "weekly",
14
+ "monthly",
15
+ "annually",
16
+ ]
17
+
18
+
19
+ class DataCoverage(BaseModel):
20
+ """Details on the temporal and geographic scope of a dataset."""
21
+
22
+ start: Optional[PartialDate] = None
23
+ end: Optional[PartialDate] = None
24
+ countries: List[CountryCode] = []
25
+ frequency: FREQUENCY_TYPE = "unknown"
26
+ schedule: Optional[str] = None
27
+
28
+ def __repr__(self) -> str:
29
+ return f"<DataCoverage({self.start!r}, {self.end!r}, {self.countries!r})>"
@@ -0,0 +1,146 @@
1
+ import yaml
2
+ import logging
3
+ from functools import cached_property
4
+ from typing import TYPE_CHECKING
5
+ from typing_extensions import Self
6
+ from typing import Any, Dict, List, Optional, Set, Type, TypeVar
7
+ from pydantic import BaseModel, field_validator, model_validator
8
+
9
+ from followthemoney.dataset.coverage import DataCoverage
10
+ from followthemoney.dataset.publisher import DataPublisher
11
+ from followthemoney.dataset.resource import DataResource
12
+ from followthemoney.dataset.util import Url, DateTimeISO, dataset_name_check
13
+ from followthemoney.util import PathLike
14
+
15
+ if TYPE_CHECKING:
16
+ from followthemoney.dataset.catalog import DataCatalog
17
+
18
+ DS = TypeVar("DS", bound="Dataset")
19
+
20
+ log = logging.getLogger(__name__)
21
+
22
+
23
+ class DatasetModel(BaseModel):
24
+ name: str
25
+ title: str
26
+ license: Optional[Url] = None
27
+ summary: Optional[str] = None
28
+ description: Optional[str] = None
29
+ url: Optional[Url] = None
30
+ updated_at: Optional[DateTimeISO] = None
31
+ last_export: Optional[DateTimeISO] = None
32
+ entity_count: Optional[int] = None
33
+ thing_count: Optional[int] = None
34
+ version: Optional[str] = None
35
+ category: Optional[str] = None
36
+ tags: List[str] = []
37
+ publisher: DataPublisher | None = None
38
+ coverage: DataCoverage | None = None
39
+ resources: List[DataResource] = []
40
+ children: Set[str] = set()
41
+
42
+ @field_validator("name", mode="after")
43
+ @classmethod
44
+ def check_name(cls, value: str) -> str:
45
+ return dataset_name_check(value)
46
+
47
+ @model_validator(mode="before")
48
+ @classmethod
49
+ def ensure_data(cls, data: Any) -> Any:
50
+ if isinstance(data, dict):
51
+ if "name" not in data:
52
+ raise ValueError("Missing dataset name")
53
+ data["title"] = data.get("title", data["name"])
54
+ children = set(data.get("children", []))
55
+ children.update(data.get("datasets", []))
56
+ children.update(data.get("scopes", []))
57
+ data["children"] = children
58
+ return data
59
+
60
+ def get_resource(self, name: str) -> DataResource:
61
+ for res in self.resources:
62
+ if res.name == name:
63
+ return res
64
+ raise ValueError("No resource named %r!" % name)
65
+
66
+
67
+ class Dataset:
68
+ """A container for entities, often from one source or related to one topic.
69
+ A dataset is a set of data, sez W3C."""
70
+
71
+ Model = DatasetModel
72
+
73
+ def __init__(self: Self, data: Dict[str, Any]) -> None:
74
+ self.model = self.Model.model_validate(data)
75
+ self.name = self.model.name
76
+ self.children: Set[Self] = set()
77
+
78
+ @cached_property
79
+ def is_collection(self: Self) -> bool:
80
+ return len(self.model.children) > 0
81
+
82
+ @property
83
+ def datasets(self: Self) -> Set[Self]:
84
+ current: Set[Self] = set([self])
85
+ for child in self.children:
86
+ current.update(child.datasets)
87
+ return current
88
+
89
+ @property
90
+ def dataset_names(self: Self) -> List[str]:
91
+ return [d.name for d in self.datasets]
92
+
93
+ @property
94
+ def leaves(self: Self) -> Set[Self]:
95
+ """All contained datasets which are not collections (can be 'self')."""
96
+ return set([d for d in self.datasets if not d.is_collection])
97
+
98
+ @property
99
+ def leaf_names(self: Self) -> Set[str]:
100
+ return {d.name for d in self.leaves}
101
+
102
+ def __hash__(self) -> int:
103
+ return hash(repr(self))
104
+
105
+ def __repr__(self) -> str:
106
+ if not hasattr(self, "name"):
107
+ return "<Dataset>"
108
+ return f"<Dataset({self.name})>" # pragma: no cover
109
+
110
+ def get_resource(self, name: str) -> DataResource:
111
+ for res in self.model.resources:
112
+ if res.name == name:
113
+ return res
114
+ raise ValueError("No resource named %r!" % name)
115
+
116
+ def to_dict(self) -> Dict[str, Any]:
117
+ """Convert the dataset to a dictionary representation."""
118
+ return self.model.model_dump(mode="json", exclude_none=True)
119
+
120
+ @classmethod
121
+ def from_path(
122
+ cls: Type[DS], path: PathLike, catalog: Optional["DataCatalog[DS]"] = None
123
+ ) -> DS:
124
+ from followthemoney.dataset.catalog import DataCatalog
125
+
126
+ with open(path, "r") as fh:
127
+ data = yaml.safe_load(fh)
128
+ if catalog is None:
129
+ catalog = DataCatalog(cls, {})
130
+ return catalog.make_dataset(data)
131
+
132
+ @classmethod
133
+ def make(cls: Type[DS], data: Dict[str, Any]) -> DS:
134
+ from followthemoney.dataset.catalog import DataCatalog
135
+
136
+ catalog = DataCatalog(cls, {})
137
+ return catalog.make_dataset(data)
138
+
139
+ def __eq__(self, other: Any) -> bool:
140
+ try:
141
+ return not not self.name == other.name
142
+ except AttributeError:
143
+ return False
144
+
145
+ def __lt__(self, other: Any) -> bool:
146
+ return self.name.__lt__(other.name)
@@ -0,0 +1,25 @@
1
+ from typing import Optional
2
+
3
+ from pydantic import BaseModel
4
+
5
+ from followthemoney.dataset.util import CountryCode, Url
6
+ from followthemoney.types import registry
7
+
8
+
9
+ class DataPublisher(BaseModel):
10
+ """Publisher information, eg. the government authority."""
11
+
12
+ name: str
13
+ url: Optional[Url] = None
14
+ name_en: Optional[str] = None
15
+ acronym: Optional[str] = None
16
+ description: Optional[str] = None
17
+ country: Optional[CountryCode] = None
18
+ official: Optional[bool] = False
19
+ logo_url: Optional[Url] = None
20
+
21
+ @property
22
+ def country_label(self) -> Optional[str]:
23
+ if self.country is None:
24
+ return None
25
+ return registry.country.caption(self.country)
@@ -0,0 +1,30 @@
1
+ from typing import Optional
2
+ from pydantic import BaseModel, field_validator
3
+
4
+ from followthemoney.dataset.util import Url, DateTimeISO
5
+ from followthemoney.types import registry
6
+
7
+
8
+ class DataResource(BaseModel):
9
+ """A downloadable resource that is part of a dataset."""
10
+
11
+ name: str
12
+ url: Optional[Url] = None
13
+ checksum: Optional[str] = None
14
+ timestamp: Optional[DateTimeISO] = None
15
+ mime_type: Optional[str] = None
16
+ title: Optional[str] = None
17
+ size: Optional[int] = None
18
+
19
+ @field_validator("mime_type", mode="after")
20
+ @classmethod
21
+ def ensure_mime_type(cls, value: str) -> Optional[str]:
22
+ if not registry.mimetype.validate(value):
23
+ raise ValueError(f"Invalid MIME type: {value!r}")
24
+ return value
25
+
26
+ @property
27
+ def mime_type_label(self) -> Optional[str]:
28
+ if self.mime_type is None:
29
+ return None
30
+ return registry.mimetype.caption(self.mime_type)
@@ -0,0 +1,55 @@
1
+ from datetime import datetime
2
+ from normality import slugify
3
+ from typing import Annotated, Any
4
+ from rigour.time import datetime_iso
5
+ from pydantic import AfterValidator, BeforeValidator, HttpUrl, PlainSerializer
6
+
7
+ from followthemoney.types import registry
8
+
9
+
10
+ def dataset_name_check(value: str) -> str:
11
+ """Check that the given value is a valid dataset name. This doesn't convert
12
+ or clean invalid names, but raises an error if they are not compliant to
13
+ force the user to fix an invalid name"""
14
+ if slugify(value, sep="_") != value:
15
+ raise ValueError("Invalid %s: %r" % ("dataset name", value))
16
+ return value
17
+
18
+
19
+ def type_check_date(value: Any) -> str:
20
+ """Check that the given value is a valid date string."""
21
+ cleaned = registry.date.clean(value)
22
+ if cleaned is None:
23
+ raise ValueError("Invalid date: %r" % value)
24
+ return cleaned
25
+
26
+
27
+ PartialDate = Annotated[str, BeforeValidator(type_check_date)]
28
+
29
+
30
+ def type_check_country(value: Any) -> str:
31
+ """Check that the given value is a valid country code."""
32
+ cleaned = registry.country.clean(value)
33
+ if cleaned is None:
34
+ raise ValueError("Invalid country code: %r" % value)
35
+ return cleaned
36
+
37
+
38
+ CountryCode = Annotated[str, BeforeValidator(type_check_country)]
39
+
40
+
41
+ def type_check_http_url(v: str) -> str:
42
+ url = HttpUrl(v)
43
+ return str(url)
44
+
45
+
46
+ Url = Annotated[str, AfterValidator(type_check_http_url)]
47
+
48
+
49
+ def serialize_dt(dt: datetime) -> str:
50
+ text = datetime_iso(dt)
51
+ assert text is not None, "Invalid datetime: %r" % dt
52
+ return text
53
+
54
+
55
+ DateTimeISO = Annotated[datetime, PlainSerializer(serialize_dt)]