PyPI - followthemoney - Versions diffs - 4.3.4__py3-none-any.whl → 4.5.1__py3-none-any.whl - Mend

followthemoney 4.3.4__py3-none-any.whl → 4.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

followthemoney/__init__.py +5 -4
followthemoney/cli/statement.py +13 -7
followthemoney/cli/util.py +3 -3
followthemoney/compare.py +6 -19
followthemoney/dataset/__init__.py +2 -2
followthemoney/dataset/dataset.py +20 -0
followthemoney/entity.py +14 -0
followthemoney/mapping/csv.py +3 -1
followthemoney/model.py +4 -5
followthemoney/proxy.py +27 -3
followthemoney/schema/Company.yaml +1 -0
followthemoney/schema/CryptoWallet.yaml +4 -0
followthemoney/schema/Image.yaml +7 -0
followthemoney/schema/LegalEntity.yaml +7 -0
followthemoney/schema/Organization.yaml +1 -0
followthemoney/schema/Person.yaml +2 -1
followthemoney/schema/PublicBody.yaml +1 -0
followthemoney/settings.py +19 -0
followthemoney/statement/entity.py +39 -10
followthemoney/statement/serialize.py +23 -14
followthemoney/statement/statement.py +151 -42
followthemoney/statement/util.py +21 -0
followthemoney/types/country.py +16 -1
followthemoney/types/date.py +10 -0
followthemoney/types/language.py +1 -1
followthemoney/util.py +6 -14
{followthemoney-4.3.4.dist-info → followthemoney-4.5.1.dist-info}/METADATA +3 -3
{followthemoney-4.3.4.dist-info → followthemoney-4.5.1.dist-info}/RECORD +31 -30
{followthemoney-4.3.4.dist-info → followthemoney-4.5.1.dist-info}/WHEEL +0 -0
{followthemoney-4.3.4.dist-info → followthemoney-4.5.1.dist-info}/entry_points.txt +0 -0
{followthemoney-4.3.4.dist-info → followthemoney-4.5.1.dist-info}/licenses/LICENSE +0 -0

followthemoney/__init__.py CHANGED Viewed

@@ -2,14 +2,14 @@ from followthemoney.entity import ValueEntity, VE
 from followthemoney.model import Model
 from followthemoney.schema import Schema
 from followthemoney.property import Property
-from followthemoney.types import registry
+from followthemoney.types import registry, PropertyType
 from followthemoney.value import Value, Values
 from followthemoney.proxy import EntityProxy, E
 from followthemoney.statement import Statement, StatementEntity, SE
-from followthemoney.dataset import Dataset, DefaultDataset, DS
+from followthemoney.dataset import Dataset, UndefinedDataset, DS
 from followthemoney.util import set_model_locale
-__version__ = "4.3.4"
+__version__ = "4.5.1"
 # Data model singleton
 model = Model.instance()
@@ -20,13 +20,14 @@ __all__ = [
     "Model",
     "Schema",
     "Property",
+    "PropertyType",
     "Value",
     "Values",
     "EntityProxy",
     "E",
     "registry",
     "Dataset",
-    "DefaultDataset",
+    "UndefinedDataset",
     "DS",
     "Statement",
     "StatementEntity",

followthemoney/cli/statement.py CHANGED Viewed

@@ -1,12 +1,12 @@
 import click
 from pathlib import Path
-from typing import Generator, List
+from typing import Generator, List, Optional
 from followthemoney.cli.cli import cli
 from followthemoney.cli.util import InPath, OutPath
 from followthemoney.cli.util import path_entities, write_entity, path_writer
-from followthemoney.dataset import Dataset, DefaultDataset
+from followthemoney.dataset import Dataset, UndefinedDataset
 from followthemoney.statement import Statement, StatementEntity
 from followthemoney.statement import FORMATS, CSV
 from followthemoney.statement import write_statements
@@ -16,12 +16,18 @@ from followthemoney.statement import read_path_statements
 @cli.command("statements", help="Export entities to statements")
 @click.argument("path", type=InPath)
 @click.option("-o", "--outpath", type=OutPath, default="-")
-@click.option("-d", "--dataset", type=str, required=True)
+@click.option("-d", "--dataset", type=str)
 @click.option("-f", "--format", type=click.Choice(FORMATS), default=CSV)
-def entity_statements(path: Path, outpath: Path, dataset: str, format: str) -> None:
+def entity_statements(
+    path: Path, outpath: Path, dataset: Optional[str], format: str
+) -> None:
     def make_statements() -> Generator[Statement, None, None]:
+        dataset_ = dataset or Dataset.UNDEFINED
         for entity in path_entities(path, StatementEntity):
-            yield from Statement.from_entity(entity, dataset=dataset)
+            for stmt in Statement.from_entity(entity, dataset=dataset_):
+                if dataset is not None:
+                    stmt = stmt.clone(dataset=dataset)
+                yield stmt
     with path_writer(outpath) as outfh:
         write_statements(outfh, format, make_statements())
@@ -43,12 +49,12 @@ def format_statements(
 @cli.command("aggregate-statements", help="Roll up statements into entities")
 @click.option("-i", "--infile", type=InPath, default="-")
 @click.option("-o", "--outpath", type=OutPath, default="-")
-@click.option("-d", "--dataset", type=str, default=DefaultDataset.name)
+@click.option("-d", "--dataset", type=str, default=UndefinedDataset.name)
 @click.option("-f", "--format", type=click.Choice(FORMATS), default=CSV)
 def statements_aggregate(
     infile: Path, outpath: Path, dataset: str, format: str
 ) -> None:
-    dataset_ = Dataset.make({"name": dataset, "title": dataset})
+    dataset_ = Dataset.make({"name": dataset})
     with path_writer(outpath) as outfh:
         statements: List[Statement] = []
         for stmt in read_path_statements(infile, format=format):

followthemoney/cli/util.py CHANGED Viewed

@@ -6,7 +6,7 @@ import click
 import orjson
 from pathlib import Path
 from warnings import warn
-from typing import Any, BinaryIO, Generator, Optional, TextIO, Type
+from typing import Any, BinaryIO, Generator, List, Optional, TextIO, Type
 from banal import is_mapping, is_listish, ensure_list
 from followthemoney.export.common import Exporter
@@ -26,7 +26,7 @@ def write_object(stream: TextIO, obj: Any) -> None:
     stream.write(data + "\n")
-def write_entity(fh: BinaryIO, entity: E) -> None:
+def write_entity(fh: BinaryIO, entity: EntityProxy) -> None:
     data = entity.to_dict()
     entity_id = data.pop("id")
     assert entity_id is not None, data
@@ -131,7 +131,7 @@ def resolve_includes(file_path: PathLike, data: Any) -> Any:
     if is_listish(data):
         return [resolve_includes(file_path, i) for i in data]
     if is_mapping(data):
-        include_paths = ensure_list(data.pop("include", []))
+        include_paths: List[str] = ensure_list(data.pop("include", []))
         for include_path in include_paths:
             dir_prefix = os.path.dirname(file_path)
             include_path = os.path.join(dir_prefix, include_path)

followthemoney/compare.py CHANGED Viewed

@@ -71,31 +71,18 @@ def _compare(scores: Scores, weights: Weights, n_std: int = 1) -> float:
     return 1.0 / (1.0 + math.exp(-prob))
-def entity_is_same(left: EntityProxy, right: EntityProxy) -> bool:
-    """Check if two entities are the same apart from their ID."""
-    if left.schema != right.schema:
-        return False
-    props = set(left.properties.keys()).union(right.properties.keys())
-    if 0 == len(props):
-        return False
-    for prop in props:
-        left_vals = sorted(left.get(prop))
-        right_vals = sorted(right.get(prop))
-        if left_vals != right_vals:
-            return False
-    return True
 def compare(
     left: EntityProxy,
     right: EntityProxy,
     weights: Weights = COMPARE_WEIGHTS,
 ) -> float:
     """Compare two entities and return a match score."""
-    if entity_is_same(left, right):
-        return 1.0
+    if left.checksum == right.checksum:
+        # Check if there is any data at all (ie any basis for making a decision),
+        # if so, return a perfect match. This avoids marking two empty entities
+        # as matching. Bit ambiguous, but practical.
+        if len(left.properties) > 0 and len(right.properties) > 0:
+            return 1.0
     scores = compare_scores(left, right)
     return _compare(scores, weights)

followthemoney/dataset/__init__.py CHANGED Viewed

@@ -4,11 +4,11 @@ from followthemoney.dataset.resource import DataResource
 from followthemoney.dataset.publisher import DataPublisher
 from followthemoney.dataset.coverage import DataCoverage
-DefaultDataset = Dataset.make({"name": "default"})
+UndefinedDataset = Dataset.make({"name": Dataset.UNDEFINED})
 __all__ = [
     "Dataset",
-    "DefaultDataset",
+    "UndefinedDataset",
     "DataCatalog",
     "DataResource",
     "DataPublisher",

followthemoney/dataset/dataset.py CHANGED Viewed

@@ -1,3 +1,4 @@
+from pathlib import Path
 import yaml
 import logging
 from functools import cached_property
@@ -38,6 +39,8 @@ class DatasetModel(BaseModel):
     coverage: DataCoverage | None = None
     resources: List[DataResource] = []
     children: Set[str] = set()
+    deprecation: Optional[str] = None
+    deprecated: bool = False
     @field_validator("name", mode="after")
     @classmethod
@@ -57,6 +60,18 @@ class DatasetModel(BaseModel):
             data["children"] = children
         return data
+    @model_validator(mode="after")
+    def evaluate_data(self) -> "DatasetModel":
+        # derive deprecated from deprecation notice:
+        if self.deprecation is not None:
+            self.deprecation = self.deprecation.strip()
+            if not len(self.deprecation):
+                self.deprecation = None
+        self.deprecated = self.deprecation is not None or self.deprecated
+        if self.deprecated and (self.coverage is None or self.coverage.end is None):
+            raise ValueError("Deprecated dataset coverage must have an end date.")
+        return self
     def get_resource(self, name: str) -> DataResource:
         for res in self.resources:
             if res.name == name:
@@ -68,6 +83,8 @@ class Dataset:
     """A container for entities, often from one source or related to one topic.
     A dataset is a set of data, sez W3C."""
+    UNDEFINED = "undefined"
     def __init__(self: Self, data: Dict[str, Any]) -> None:
         self.model = DatasetModel.model_validate(data)
         self.name = self.model.name
@@ -121,10 +138,13 @@ class Dataset:
     ) -> DS:
         from followthemoney.dataset.catalog import DataCatalog
+        path = Path(path)
         with open(path, "r") as fh:
             data = yaml.safe_load(fh)
             if catalog is None:
                 catalog = DataCatalog(cls, {})
+            if "name" not in data:
+                data["name"] = path.stem
             return catalog.make_dataset(data)
     @classmethod

followthemoney/entity.py CHANGED Viewed

@@ -5,6 +5,7 @@ from rigour.names import pick_name
 from followthemoney.proxy import EntityProxy
 from followthemoney.schema import Schema
 from followthemoney.statement import BASE_ID, Statement
+from followthemoney.util import HASH_ENCODING
 VE = TypeVar("VE", bound="ValueEntity")
@@ -81,6 +82,19 @@ class ValueEntity(EntityProxy):
             merged.last_change = max(changed, default=None)
         return merged
+    @property
+    def checksum(self) -> str:
+        digest = self._checksum_digest()
+        for dataset in sorted(self.datasets):
+            digest.update(dataset.encode(HASH_ENCODING))
+            digest.update(b"\x1e")
+        for referent in sorted(self.referents):
+            digest.update(referent.encode(HASH_ENCODING))
+            digest.update(b"\x1e")
+        if self.last_change is not None:
+            digest.update(self.last_change.encode(HASH_ENCODING))
+        return digest.hexdigest()
     def to_dict(self) -> Dict[str, Any]:
         data = super().to_dict()
         data["referents"] = list(self.referents)

followthemoney/mapping/csv.py CHANGED Viewed

@@ -9,6 +9,7 @@ from typing import TYPE_CHECKING, cast
 from typing import Any, Dict, Generator, ItemsView, Iterable, List, Optional, Set, Tuple
 from followthemoney.mapping.source import Record, Source
+from followthemoney.settings import USER_AGENT
 from followthemoney.util import sanitize_text
 from followthemoney.exc import InvalidMapping
@@ -64,7 +65,8 @@ class CSVSource(Source):
         parsed_url = urlparse(url)
         log.info("Loading: %s", url)
         if parsed_url.scheme in ["http", "https"]:
-            res = requests.get(url, stream=True)
+            headers = {"User-Agent": USER_AGENT}
+            res = requests.get(url, stream=True, headers=headers)
             if not res.ok:
                 raise InvalidMapping("Failed to open CSV: %s" % url)
             # if res.encoding is None:

followthemoney/model.py CHANGED Viewed

@@ -3,12 +3,14 @@ import yaml
 from functools import cache
 from typing import TYPE_CHECKING, Any
 from typing import Dict, Generator, Iterator, Optional, Set, TypedDict, Union
+from rigour.env import ENCODING
 from followthemoney.types import registry
 from followthemoney.types.common import PropertyType, PropertyTypeToDict
 from followthemoney.schema import Schema, SchemaToDict
 from followthemoney.property import Property
 from followthemoney.exc import InvalidModel, InvalidData
+from followthemoney.settings import MODEL_PATH
 from followthemoney.util import const
 if TYPE_CHECKING:
@@ -47,10 +49,7 @@ class Model(object):
     @classmethod
     def instance(cls) -> "Model":
         if cls._instance is None:
-            model_path = os.path.dirname(__file__)
-            model_path = os.path.join(model_path, "schema")
-            model_path = os.environ.get("FTM_MODEL_PATH", model_path)
-            cls._instance = cls(model_path)
+            cls._instance = cls(MODEL_PATH)
         return cls._instance
     def generate(self) -> None:
@@ -68,7 +67,7 @@ class Model(object):
                     schema.properties[prop.name] = prop
     def _load(self, filepath: str) -> None:
-        with open(filepath, "r", encoding="utf-8") as fh:
+        with open(filepath, "r", encoding=ENCODING) as fh:
             data = yaml.safe_load(fh)
             if not isinstance(data, dict):
                 raise InvalidModel("Model file is not a mapping: %s" % filepath)

followthemoney/proxy.py CHANGED Viewed

@@ -1,3 +1,4 @@
+import hashlib
 import logging
 from typing import TYPE_CHECKING, cast, Any
 from typing import Dict, Generator, List, Optional, Set, Tuple, Union, Type, TypeVar
@@ -10,13 +11,14 @@ from followthemoney.types import registry
 from followthemoney.types.common import PropertyType
 from followthemoney.property import Property
 from followthemoney.value import string_list, Values
-from followthemoney.util import sanitize_text, gettext
+from followthemoney.util import HASH_ENCODING, sanitize_text, gettext
 from followthemoney.util import merge_context, make_entity_id
 from followthemoney.model import Model
 from followthemoney.schema import Schema
 if TYPE_CHECKING:
     from followthemoney.model import Model
+    from hashlib import _Hash
 log = logging.getLogger(__name__)
 P = Union[Property, str]
@@ -437,6 +439,28 @@ class EntityProxy(object):
             self.add(prop, values, cleaned=True, quiet=True)
         return self
+    def _checksum_digest(self) -> "_Hash":
+        """Create a SHA1 digest of the entity's ID, schema and properties for
+        change detection. This is returned as a hashlib digest object so that
+        it can be subclassed."""
+        digest = hashlib.sha1()
+        if self.id is not None:
+            digest.update(self.id.encode(HASH_ENCODING))
+        digest.update(self.schema.name.encode(HASH_ENCODING))
+        for prop in sorted(self._properties.keys()):
+            digest.update(prop.encode(HASH_ENCODING))
+            for value in sorted(self._properties[prop]):
+                digest.update(value.encode(HASH_ENCODING))
+                digest.update(b"\x1e")
+            digest.update(b"\x1f")
+        return digest
+    @property
+    def checksum(self) -> str:
+        """A SHA1 checksum hexdigest representing the current state of the
+        entity proxy. This can be used for change detection."""
+        return self._checksum_digest().hexdigest()
     def __getstate__(self) -> Dict[str, Any]:
         data = {slot: getattr(self, slot) for slot in self.__slots__}
         data["schema"] = self.schema.name
@@ -460,13 +484,13 @@ class EntityProxy(object):
     def __hash__(self) -> int:
         if self.id is None:
-            raise RuntimeError("Cannot hash entity without an ID")
+            raise RuntimeError("Unhashable entity proxy without ID.")
         return hash(self.id)
     def __eq__(self, other: Any) -> bool:
         try:
             if self.id is None or other.id is None:
-                raise RuntimeError("Cannot compare entities without IDs.")
+                raise RuntimeError("Cannot compare entity proxies without IDs.")
             return bool(self.id == other.id)
         except AttributeError:
             return False

followthemoney/schema/Company.yaml CHANGED Viewed

@@ -19,6 +19,7 @@ Company:
   caption:
     - name
     - alias
+    - abbreviation
     - weakAlias
     - previousName
     - registrationNumber

followthemoney/schema/CryptoWallet.yaml CHANGED Viewed

@@ -26,6 +26,10 @@ CryptoWallet:
       maxLength: 128
     privateKey:
       label: Private key
+    accountId:
+      label: Account ID
+      description: Platform-specific user/account identifier
+      type: identifier
     creationDate:
       label: Creation date
       type: date

followthemoney/schema/Image.yaml CHANGED Viewed

@@ -1,4 +1,7 @@
 Image:
+  # This schema defines an image file entity within the FollowTheMoney data model.
+  # If a `checksum` property is present, consider loading it from an Aleph archive
+  # or FtM data lake. Otherwise, use `sourceUrl` to fetch the image directly.
   extends:
     - Document
   label: Image
@@ -23,3 +26,7 @@ Image:
         label: "Images"
       type: entity
       range: Person
+    credit:
+      label: "Credit"
+      description: "The credit or attribution for the image."
+      type: string

followthemoney/schema/LegalEntity.yaml CHANGED Viewed

@@ -18,6 +18,7 @@ LegalEntity:
   caption:
     - name
     - alias
+    - abbreviation
     - weakAlias
     - previousName
     - email
@@ -29,6 +30,12 @@ LegalEntity:
     end:
       - dissolutionDate
   properties:
+    abbreviation:
+      label: Abbreviation
+      type: name
+      description: "Abbreviated name or acronym"
+      # TODO: is un-matchable wise? The idea is to handle it like `weakAlias` rather than `alias`.
+      matchable: false
     email:
       label: E-Mail
       type: email

followthemoney/schema/Organization.yaml CHANGED Viewed

@@ -18,6 +18,7 @@ Organization:
   caption:
     - name
     - alias
+    - abbreviation
     - weakAlias
     - previousName
     - registrationNumber

followthemoney/schema/Person.yaml CHANGED Viewed

@@ -15,8 +15,9 @@ Person:
   caption:
     - name
     - alias
-    - weakAlias
     - previousName
+    - weakAlias
+    - abbreviation
     - lastName
     - email
     - phone

followthemoney/schema/PublicBody.yaml CHANGED Viewed

@@ -14,6 +14,7 @@ PublicBody:
   caption:
     - name
     - alias
+    - abbreviation
     - weakAlias
     - previousName
   required:

followthemoney/settings.py ADDED Viewed

@@ -0,0 +1,19 @@
+import os
+import requests
+from typing import List
+from rigour.env import env_opt, env_str
+def get_env_list(name: str, default: List[str] = []) -> List[str]:
+    value = env_opt(name)
+    if value is not None:
+        values = value.split(":")
+        if len(values):
+            return values
+    return default
+MODEL_PATH = os.path.join(os.path.dirname(__file__), "schema")
+MODEL_PATH = env_str("FTM_MODEL_PATH", MODEL_PATH)
+USER_AGENT = env_str("FTM_USER_AGENT", requests.utils.default_user_agent())

followthemoney/statement/entity.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from hashlib import sha1
 from collections.abc import Mapping
-from typing import Any, Dict, List, Optional, Set, Type
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Type
 from typing import Generator, Iterable, Tuple, TypeVar
 from rigour.langs import LangStr
 from rigour.names.pick import pick_lang_name
@@ -10,17 +10,20 @@ from followthemoney.exc import InvalidData
 from followthemoney.schema import Schema
 from followthemoney.types.common import PropertyType
 from followthemoney.property import Property
-from followthemoney.util import gettext
+from followthemoney.util import HASH_ENCODING, gettext
 from followthemoney.proxy import P
 from followthemoney.types import registry
 from followthemoney.value import string_list, Values
 from followthemoney.proxy import EntityProxy
-from followthemoney.dataset import Dataset, DefaultDataset
+from followthemoney.dataset import Dataset, UndefinedDataset
 from followthemoney.statement.statement import Statement
 from followthemoney.statement.util import BASE_ID
 SE = TypeVar("SE", bound="StatementEntity")
+if TYPE_CHECKING:
+    from hashlib import _Hash
 class StatementEntity(EntityProxy):
     """An entity object that can link to a set of datasets that it is sourced from."""
@@ -35,7 +38,12 @@ class StatementEntity(EntityProxy):
         "_statements",
     )
-    def __init__(self, dataset: Dataset, data: Dict[str, Any], cleaned: bool = True):
+    def __init__(
+        self,
+        dataset: Dataset,
+        data: Dict[str, Any],
+        cleaned: bool = True,
+    ) -> None:
         data = dict(data or {})
         schema = Model.instance().get(data.pop("schema", None))
         if schema is None:
@@ -76,8 +84,7 @@ class StatementEntity(EntityProxy):
         for stmts in self._statements.values():
             for stmt in stmts:
                 if stmt.entity_id is None and self.id is not None:
-                    stmt.entity_id = self.id
-                    stmt.id = stmt.generate_key()
+                    stmt = stmt.clone(entity_id=self.id)
                 if stmt.id is None:
                     stmt.id = stmt.generate_key()
                 yield stmt
@@ -97,9 +104,9 @@ class StatementEntity(EntityProxy):
             if stmt.first_seen is not None:
                 first_seen.add(stmt.first_seen)
         if self.id is not None:
-            digest = sha1(self.schema.name.encode("utf-8"))
+            digest = sha1(self.schema.name.encode(HASH_ENCODING))
             for id in sorted(ids):
-                digest.update(id.encode("utf-8"))
+                digest.update(id.encode(HASH_ENCODING))
             checksum = digest.hexdigest()
             # This is to make the last_change value stable across
             # serialisation:
@@ -183,6 +190,11 @@ class StatementEntity(EntityProxy):
             return []
         return list(self._statements[prop_name])
+    @property
+    def has_statements(self) -> bool:
+        """Return whether the entity has any statements."""
+        return len(self._statements) > 0
     def set(
         self,
         prop: P,
@@ -426,7 +438,7 @@ class StatementEntity(EntityProxy):
                     origins.add(stmt.origin)
         data["referents"] = list(referents)
-        data["datasets"] = list(datasets)
+        data["datasets"] = [d for d in datasets if d != Dataset.UNDEFINED]
         if origins:
             data["origin"] = list(origins)
@@ -449,6 +461,23 @@ class StatementEntity(EntityProxy):
         data["statements"] = [stmt.to_dict() for stmt in self.statements]
         return data
+    def _checksum_digest(self) -> "_Hash":
+        """Create a SHA1 digest of the entity's ID, schema and properties for
+        change detection. This is returned as a hashlib digest object so that
+        it can be subclassed."""
+        digest = sha1()
+        if self.id is not None:
+            digest.update(self.id.encode(HASH_ENCODING))
+        statement_ids: List[str] = []
+        for stmts in self._statements.values():
+            for stmt in stmts:
+                if stmt.id is not None:
+                    statement_ids.append(stmt.id)
+        for stmt_id in sorted(statement_ids):
+            digest.update(stmt_id.encode(HASH_ENCODING))
+            digest.update(b"\x1e")
+        return digest
     def __len__(self) -> int:
         return len(list(self._iter_stmt())) + 1
@@ -460,7 +489,7 @@ class StatementEntity(EntityProxy):
         default_dataset: Optional[Dataset] = None,
     ) -> SE:
         # Exists only for backwards compatibility.
-        dataset = default_dataset or DefaultDataset
+        dataset = default_dataset or UndefinedDataset
         return cls(dataset, data, cleaned=cleaned)
     @classmethod

followthemoney 4.3.4__py3-none-any.whl → 4.5.1__py3-none-any.whl

followthemoney 4.3.4__py3-none-any.whl → 4.5.1__py3-none-any.whl