PyPI - followthemoney - Versions diffs - 4.3.4__py3-none-any.whl → 4.5.1__py3-none-any.whl - Mend

followthemoney 4.3.4-py3-none-any.whl → 4.5.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31)

followthemoney/__init__.py +5 -4
followthemoney/cli/statement.py +13 -7
followthemoney/cli/util.py +3 -3
followthemoney/compare.py +6 -19
followthemoney/dataset/__init__.py +2 -2
followthemoney/dataset/dataset.py +20 -0
followthemoney/entity.py +14 -0
followthemoney/mapping/csv.py +3 -1
followthemoney/model.py +4 -5
followthemoney/proxy.py +27 -3
followthemoney/schema/Company.yaml +1 -0
followthemoney/schema/CryptoWallet.yaml +4 -0
followthemoney/schema/Image.yaml +7 -0
followthemoney/schema/LegalEntity.yaml +7 -0
followthemoney/schema/Organization.yaml +1 -0
followthemoney/schema/Person.yaml +2 -1
followthemoney/schema/PublicBody.yaml +1 -0
followthemoney/settings.py +19 -0
followthemoney/statement/entity.py +39 -10
followthemoney/statement/serialize.py +23 -14
followthemoney/statement/statement.py +151 -42
followthemoney/statement/util.py +21 -0
followthemoney/types/country.py +16 -1
followthemoney/types/date.py +10 -0
followthemoney/types/language.py +1 -1
followthemoney/util.py +6 -14
{followthemoney-4.3.4.dist-info → followthemoney-4.5.1.dist-info}/METADATA +3 -3
{followthemoney-4.3.4.dist-info → followthemoney-4.5.1.dist-info}/RECORD +31 -30
{followthemoney-4.3.4.dist-info → followthemoney-4.5.1.dist-info}/WHEEL +0 -0
{followthemoney-4.3.4.dist-info → followthemoney-4.5.1.dist-info}/entry_points.txt +0 -0
{followthemoney-4.3.4.dist-info → followthemoney-4.5.1.dist-info}/licenses/LICENSE +0 -0

followthemoney/statement/serialize.py CHANGED Viewed

@@ -1,13 +1,15 @@
 import csv
+import sys
 import click
 import orjson
 import logging
 from io import TextIOWrapper
 from pathlib import Path
 from types import TracebackType
-from typing import cast
+from typing import Dict, Tuple, cast
 from typing import BinaryIO, Generator, Iterable, List, Optional, TextIO, Type
 from rigour.boolean import text_bool
+from rigour.env import ENCODING
 from followthemoney.statement.statement import Statement, StatementDict
 from followthemoney.statement.util import unpack_prop
@@ -48,6 +50,7 @@ LEGACY_PACK_COLUMNS = [
     "first_seen",
     "last_seen",
 ]
+csv.field_size_limit(sys.maxsize)
 def read_json_statements(
@@ -60,7 +63,7 @@ def read_json_statements(
 def read_csv_statements(fh: BinaryIO) -> Generator[Statement, None, None]:
-    wrapped = TextIOWrapper(fh, encoding="utf-8")
+    wrapped = TextIOWrapper(fh, encoding=ENCODING)
     for row in csv.DictReader(wrapped, dialect=csv.unix_dialect):
         data = cast(StatementDict, row)
         data["external"] = text_bool(row.get("external")) or False
@@ -68,11 +71,13 @@ def read_csv_statements(fh: BinaryIO) -> Generator[Statement, None, None]:
             data["lang"] = None
         if row.get("original_value") == "":
             data["original_value"] = None
+        if row.get("origin") == "":
+            data["origin"] = None
         yield Statement.from_dict(data)
 def read_pack_statements(fh: BinaryIO) -> Generator[Statement, None, None]:
-    wrapped = TextIOWrapper(fh, encoding="utf-8")
+    wrapped = TextIOWrapper(fh, encoding=ENCODING)
     yield from read_pack_statements_decoded(wrapped)
@@ -100,7 +105,7 @@ def read_pack_statements_decoded(fh: TextIO) -> Generator[Statement, None, None]
             dataset=data["dataset"],
             lang=data["lang"] or None,
             original_value=data["original_value"] or None,
-            origin=data.get("origin"),
+            origin=data.get("origin") or None,
             first_seen=data["first_seen"],
             external=data["external"] == "t",
             canonical_id=data["entity_id"],
@@ -129,10 +134,10 @@ def read_path_statements(path: Path, format: str) -> Generator[Statement, None,
 def get_statement_writer(fh: BinaryIO, format: str) -> "StatementWriter":
     if format == CSV:
-        wrapped = TextIOWrapper(fh, encoding="utf-8")
+        wrapped = TextIOWrapper(fh, encoding=ENCODING)
         return CSVStatementWriter(wrapped)
     elif format == PACK:
-        wrapped = TextIOWrapper(fh, encoding="utf-8")
+        wrapped = TextIOWrapper(fh, encoding=ENCODING)
         return PackStatementWriter(wrapped)
     elif format == JSON:
         return JSONStatementWriter(fh)
@@ -222,12 +227,14 @@ class PackStatementWriter(StatementWriter):
             "id",
         ]
         self.writer.writerow(columns)
-        self._batch: List[List[Optional[str]]] = []
+        self._batch: Dict[str, Tuple[Optional[str], ...]] = {}
     def write(self, stmt: Statement) -> None:
         # HACK: This is very similar to the CSV writer, but at the very inner
         # loop of the application, so we're duplicating code here.
-        row = [
+        if stmt.id is None:
+            raise RuntimeError("Cannot write pack statement without ID")
+        row = (
             stmt.entity_id,
             f"{stmt.schema}:{stmt.prop}",
             stmt.value,
@@ -239,13 +246,15 @@ class PackStatementWriter(StatementWriter):
             stmt.first_seen,
             stmt.last_seen,
             stmt.id,
-        ]
-        self._batch.append(row)
+        )
+        self._batch[stmt.id] = row
         if len(self._batch) >= CSV_BATCH:
-            self.writer.writerows(self._batch)
-            self._batch.clear()
+            self.flush()
+    def flush(self) -> None:
+        self.writer.writerows(self._batch.values())
+        self._batch.clear()
     def close(self) -> None:
-        if len(self._batch) > 0:
-            self.writer.writerows(self._batch)
+        self.flush()
         self.fh.close()

followthemoney/statement/statement.py CHANGED Viewed

@@ -1,14 +1,22 @@
 import hashlib
 import warnings
 from sqlalchemy.engine import Row
-from typing import cast
-from typing import Any, Dict, Generator, Optional
+from typing import Union, cast
+from typing import Any, Dict, Generator, Optional, TypeGuard
 from typing_extensions import TypedDict, Self
 from rigour.time import datetime_iso, iso_datetime
 from rigour.boolean import bool_text
 from followthemoney.proxy import EntityProxy
-from followthemoney.statement.util import get_prop_type, BASE_ID
+from followthemoney.statement.util import get_prop_type, BASE_ID, NON_LANG_TYPE_NAMES
+from followthemoney.util import HASH_ENCODING
+UNSET = object()
+def is_not_unset(value: str | None | object) -> TypeGuard[str | None]:
+    return value is not UNSET
 class StatementDict(TypedDict):
@@ -42,15 +50,16 @@ class Statement(object):
     __slots__ = [
         "id",
-        "entity_id",
+        "_entity_id",
         "canonical_id",
-        "prop",
-        "schema",
-        "value",
-        "dataset",
-        "lang",
+        "_prop",
+        "_schema",
+        "_value",
+        "_dataset",
+        "_lang",
+        "prop_type",
         "original_value",
-        "external",
+        "_external",
         "first_seen",
         "last_seen",
         "origin",
@@ -72,55 +81,95 @@ class Statement(object):
         last_seen: Optional[str] = None,
         origin: Optional[str] = None,
     ):
-        self.entity_id = entity_id
+        self._entity_id = entity_id
         self.canonical_id = canonical_id or entity_id
-        self.prop = prop
-        self.schema = schema
-        self.value = value
-        self.dataset = dataset
-        self.lang = lang
+        self._prop = prop
+        self._schema = schema
+        self.prop_type = get_prop_type(schema, prop)
+        self._value = value
+        self._dataset = dataset
+        # Remove lang for non-linguistic property types. The goal here is to avoid
+        # duplicate statements because of language tags, but the language metadata
+        # may be relevant as context for how the original_value was parsed so it's
+        # a bit of information loss.
+        if lang is not None:
+            if self.prop_type in NON_LANG_TYPE_NAMES:
+                lang = None
+        self._lang = lang
         self.original_value = original_value
         self.first_seen = first_seen
         self.last_seen = last_seen or first_seen
-        self.external = external
+        self._external = external
         self.origin = origin
         if id is None:
             id = self.generate_key()
         self.id = id
     @property
-    def prop_type(self) -> str:
-        """The type of the property, e.g. 'string', 'number', 'url'."""
-        return get_prop_type(self.schema, self.prop)
+    def entity_id(self) -> str:
+        """The (original) ID of the entity this statement is about."""
+        return self._entity_id
+    @property
+    def dataset(self) -> str:
+        """The dataset this statement was observed in."""
+        return self._dataset
+    @property
+    def prop(self) -> str:
+        """The property name this statement is about."""
+        return self._prop
+    @property
+    def schema(self) -> str:
+        """The schema of the entity this statement is about."""
+        return self._schema
+    @property
+    def value(self) -> str:
+        """The value of the property captured by this statement."""
+        return self._value
+    @property
+    def lang(self) -> Optional[str]:
+        """The language of the property value, if applicable."""
+        return self._lang
+    @property
+    def external(self) -> bool:
+        """Whether this statement was observed in an external dataset."""
+        return self._external
     def to_dict(self) -> StatementDict:
         return {
             "canonical_id": self.canonical_id,
-            "entity_id": self.entity_id,
-            "prop": self.prop,
-            "schema": self.schema,
-            "value": self.value,
-            "dataset": self.dataset,
-            "lang": self.lang,
+            "entity_id": self._entity_id,
+            "prop": self._prop,
+            "schema": self._schema,
+            "value": self._value,
+            "dataset": self._dataset,
+            "lang": self._lang,
             "original_value": self.original_value,
             "first_seen": self.first_seen,
             "last_seen": self.last_seen,
-            "external": self.external,
+            "external": self._external,
             "origin": self.origin,
             "id": self.id,
         }
     def to_csv_row(self) -> Dict[str, Optional[str]]:
         data = cast(Dict[str, Optional[str]], self.to_dict())
-        data["external"] = bool_text(self.external)
-        data["prop_type"] = get_prop_type(self.schema, self.prop)
+        data["external"] = bool_text(self._external)
+        data["prop_type"] = self.prop_type
         return data
     def to_db_row(self) -> Dict[str, Any]:
         data = cast(Dict[str, Any], self.to_dict())
         data["first_seen"] = iso_datetime(self.first_seen)
         data["last_seen"] = iso_datetime(self.last_seen)
-        data["prop_type"] = get_prop_type(self.schema, self.prop)
+        data["prop_type"] = self.prop_type
         return data
     def __hash__(self) -> int:
@@ -132,27 +181,83 @@ class Statement(object):
         return hash(self.id)
     def __repr__(self) -> str:
-        return "<Statement(%r, %r, %r)>" % (self.entity_id, self.prop, self.value)
+        return "<Statement(%r, %r, %r)>" % (self._entity_id, self._prop, self._value)
     def __eq__(self, other: Any) -> bool:
         return not self.id != other.id
     def __lt__(self, other: Any) -> bool:
-        self_key = (self.prop != BASE_ID, self.id or "")
-        other_key = (other.prop != BASE_ID, other.id or "")
+        self_key = (self._prop != BASE_ID, self.id or "")
+        other_key = (other._prop != BASE_ID, other.id or "")
         return self_key < other_key
-    def clone(self: Self) -> "Statement":
+    def clone(
+        self: Self,
+        *,
+        entity_id: Optional[str] = None,
+        prop: Optional[str] = None,
+        schema: Optional[str] = None,
+        value: Optional[str] = None,
+        dataset: Optional[str] = None,
+        lang: Union[str, None, object] = UNSET,
+        original_value: Union[str, None, object] = UNSET,
+        first_seen: Union[str, None, object] = UNSET,
+        external: Optional[bool] = None,
+        canonical_id: Optional[str] = None,
+        last_seen: Union[str, None, object] = UNSET,
+        origin: Union[str, None, object] = UNSET,
+    ) -> "Statement":
         """Make a deep copy of the given statement."""
-        return Statement.from_dict(self.to_dict())
+        lang = lang if is_not_unset(lang) else self._lang
+        ov = original_value if is_not_unset(original_value) else self.original_value
+        first_seen = first_seen if is_not_unset(first_seen) else self.first_seen
+        last_seen = last_seen if is_not_unset(last_seen) else self.last_seen
+        origin = origin if is_not_unset(origin) else self.origin
+        if external is None:
+            external = self._external
+        if canonical_id is None and self._entity_id != self.canonical_id:
+            canonical_id = self.canonical_id
+        # Decide if the statement ID can be kept the same:
+        stmt_id = self.id
+        if entity_id is not None and entity_id != self.entity_id:
+            stmt_id = None
+        if prop is not None and prop != self._prop:
+            stmt_id = None
+        if schema is not None and schema != self._schema:
+            stmt_id = None
+        if value is not None and value != self._value:
+            stmt_id = None
+        if dataset is not None and dataset != self._dataset:
+            stmt_id = None
+        if external != self._external:
+            stmt_id = None
+        if lang != self._lang:
+            stmt_id = None
+        return Statement(
+            id=stmt_id,
+            entity_id=entity_id or self._entity_id,
+            prop=prop or self._prop,
+            schema=schema or self._schema,
+            value=value or self._value,
+            dataset=dataset or self._dataset,
+            lang=lang,
+            original_value=ov,
+            first_seen=first_seen,
+            external=external,
+            canonical_id=canonical_id,
+            last_seen=last_seen,
+            origin=origin,
+        )
     def generate_key(self) -> Optional[str]:
         return self.make_key(
-            self.dataset,
-            self.entity_id,
-            self.prop,
-            self.value,
-            self.external,
+            self._dataset,
+            self._entity_id,
+            self._prop,
+            self._value,
+            self._external,
+            lang=self._lang,
         )
     @classmethod
@@ -163,17 +268,21 @@ class Statement(object):
         prop: str,
         value: str,
         external: Optional[bool],
+        lang: Optional[str] = None,
     ) -> Optional[str]:
         """Hash the key properties of a statement record to make a unique ID."""
         if prop is None or value is None:
             return None
-        key = f"{dataset}.{entity_id}.{prop}.{value}"
+        if lang is None:
+            key = f"{dataset}.{entity_id}.{prop}.{value}"
+        else:
+            key = f"{dataset}.{entity_id}.{prop}.{value}@{lang}"
         if external:
             # We consider the external flag in key composition to avoid race conditions
             # where a certain entity might be emitted as external while it is already
             # linked in to the graph via another route.
             key = f"{key}.ext"
-        return hashlib.sha1(key.encode("utf-8")).hexdigest()
+        return hashlib.sha1(key.encode(HASH_ENCODING)).hexdigest()
     @classmethod
     def from_dict(cls, data: StatementDict) -> "Statement":

followthemoney/statement/util.py CHANGED Viewed

@@ -2,10 +2,31 @@ from functools import cache
 from typing import Tuple
 from followthemoney.model import Model
+from followthemoney.types import registry
 from followthemoney.util import const
 BASE_ID = "id"
+# Some property types should not set the `lang` attribute on statements.
+# These are typically non-linguistic types, although there's an argument
+# that language metadata could be useful for dates and countries, where
+# text parsing is likely to have taken place.
+NON_LANG_TYPE_NAMES = {
+    registry.entity.name,
+    registry.date.name,
+    registry.checksum.name,
+    registry.email.name,
+    registry.phone.name,
+    registry.gender.name,
+    registry.mimetype.name,
+    registry.topic.name,
+    registry.url.name,
+    registry.country.name,
+    registry.language.name,
+    registry.ip.name,
+    BASE_ID,
+}
 def pack_prop(schema: str, prop: str) -> str:
     return f"{schema}:{prop}"

followthemoney/types/country.py CHANGED Viewed

@@ -1,6 +1,7 @@
-from typing import Optional, TYPE_CHECKING
+from typing import Callable, Optional, TYPE_CHECKING, Sequence
 from babel.core import Locale
 from rigour.territories import get_ftm_countries, lookup_territory
+from rigour.territories import territories_intersect
 from followthemoney.types.common import EnumType, EnumValues
 from followthemoney.util import defer as _
@@ -25,6 +26,20 @@ class CountryType(EnumType):
     def _locale_names(self, locale: Locale) -> EnumValues:
         return {t.code: t.name for t in get_ftm_countries()}
+    def compare(self, left: str, right: str) -> float:
+        overlap = territories_intersect([left], [right])
+        return 1.0 if len(overlap) else 0.0
+    def compare_sets(
+        self,
+        left: Sequence[str],
+        right: Sequence[str],
+        func: Callable[[Sequence[float]], float] = max,
+    ) -> float:
+        """Compare two sets of values and select the highest-scored result."""
+        overlap = territories_intersect(left, right)
+        return 1.0 if len(overlap) else 0.0
     def clean_text(
         self,
         text: str,

followthemoney/types/date.py CHANGED Viewed

@@ -27,6 +27,16 @@ class DateType(PropertyType):
     matchable = True
     max_length = 32
+    HISTORIC = "1001-01-01"
+    """A sentinel date value representing a very old date, used to indicate historic (and often imprecise) dates
+    that can be assumed to be long in the past."""
+    RELEVANCE_MIN = "1900-01-01"
+    """A cutoff date value representing the minimum relevant date for modern fincrime applications."""
+    RELEVANCE_MAX = "2100-12-31"
+    """A cutoff date value representing the maximum relevant date for modern fincrime applications."""
     def validate(
         self, value: str, fuzzy: bool = False, format: Optional[str] = None
     ) -> bool:

followthemoney/types/language.py CHANGED Viewed

@@ -4,7 +4,7 @@ from rigour.langs import iso_639_alpha3
 from followthemoney.types.common import EnumType, EnumValues
 from followthemoney.util import defer as _, gettext
-from followthemoney.util import get_env_list
+from followthemoney.settings import get_env_list
 if TYPE_CHECKING:
     from followthemoney.proxy import EntityProxy

followthemoney/util.py CHANGED Viewed

@@ -10,10 +10,11 @@ from threading import local
 from typing import cast, Dict, Any, List, Optional, TypeVar, Union
 from normality import stringify
 from normality.cleaning import remove_unsafe_chars
-from normality.encoding import DEFAULT_ENCODING
+from rigour.env import ENCODING
 from banal import is_mapping, unique_list, ensure_list
 MEGABYTE = 1024 * 1024
+HASH_ENCODING = "utf-8"
 DEFAULT_LOCALE = "en"
 ENTITY_ID_LEN = 200
@@ -55,16 +56,7 @@ def get_locale() -> Locale:
     return Locale.parse(state.locale)
-def get_env_list(name: str, default: List[str] = []) -> List[str]:
-    value = stringify(os.environ.get(name))
-    if value is not None:
-        values = value.split(":")
-        if len(values):
-            return values
-    return default
-def sanitize_text(value: Any, encoding: str = DEFAULT_ENCODING) -> Optional[str]:
+def sanitize_text(value: Any, encoding: str = ENCODING) -> Optional[str]:
     text = stringify(value, encoding_default=encoding)
     if text is None:
         return None
@@ -74,8 +66,8 @@ def sanitize_text(value: Any, encoding: str = DEFAULT_ENCODING) -> Optional[str]
         log.warning("Cannot NFC text: %s", ex)
         return None
     text = remove_unsafe_chars(text)
-    byte_text = text.encode(DEFAULT_ENCODING, "replace")
-    text = byte_text.decode(DEFAULT_ENCODING, "replace")
+    byte_text = text.encode("utf-8", "replace")
+    text = byte_text.decode("utf-8", "replace")
     if len(text) == 0:
         return None
     return text
@@ -88,7 +80,7 @@ def key_bytes(key: Any) -> bytes:
     text = stringify(key)
     if text is None:
         return b""
-    return text.encode("utf-8")
+    return text.encode(ENCODING)
 def join_text(*parts: Any, sep: str = " ") -> Optional[str]:

{followthemoney-4.3.4.dist-info → followthemoney-4.5.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: followthemoney
-Version: 4.3.4
+Version: 4.5.1
 Summary: A data model for anti corruption data modeling and analysis.
 Project-URL: Documentation, https://followthemoney.tech/
 Project-URL: Repository, https://github.com/opensanctions/followthemoney.git
@@ -48,9 +48,9 @@ Requires-Dist: prefixdate<1.0.0,>=0.5.0
 Requires-Dist: pydantic<3.0.0,>=2.11.0
 Requires-Dist: pytz>=2021.1
 Requires-Dist: pyyaml<7.0.0,>=5.0.0
-Requires-Dist: rdflib<7.5.0,>=6.2.0
+Requires-Dist: rdflib<7.6.0,>=6.2.0
 Requires-Dist: requests<3.0.0,>=2.21.0
-Requires-Dist: rigour<2.0.0,>=1.4.0
+Requires-Dist: rigour<2.0.0,>=1.6.0
 Requires-Dist: sqlalchemy[mypy]<3.0.0,>=2.0.0
 Provides-Extra: dev
 Requires-Dist: build; extra == 'dev'

followthemoney 4.3.4__py3-none-any.whl → 4.5.1__py3-none-any.whl

followthemoney 4.3.4-py3-none-any.whl → 4.5.1-py3-none-any.whl