ftmq 4.3.2__tar.gz → 4.5.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. {ftmq-4.3.2 → ftmq-4.5.2}/PKG-INFO +10 -11
  2. {ftmq-4.3.2 → ftmq-4.5.2}/ftmq/__init__.py +1 -1
  3. {ftmq-4.3.2 → ftmq-4.5.2}/ftmq/cli.py +1 -1
  4. {ftmq-4.3.2 → ftmq-4.5.2}/ftmq/io.py +3 -4
  5. {ftmq-4.3.2 → ftmq-4.5.2}/ftmq/model/dataset.py +13 -5
  6. {ftmq-4.3.2 → ftmq-4.5.2}/ftmq/model/mixins.py +1 -1
  7. {ftmq-4.3.2 → ftmq-4.5.2}/ftmq/model/stats.py +2 -1
  8. ftmq-4.5.2/ftmq/py.typed +0 -0
  9. {ftmq-4.3.2 → ftmq-4.5.2}/ftmq/store/base.py +13 -6
  10. {ftmq-4.3.2 → ftmq-4.5.2}/ftmq/store/lake.py +110 -49
  11. {ftmq-4.3.2 → ftmq-4.5.2}/ftmq/store/sql.py +7 -5
  12. {ftmq-4.3.2 → ftmq-4.5.2}/ftmq/util.py +19 -24
  13. {ftmq-4.3.2 → ftmq-4.5.2}/pyproject.toml +14 -14
  14. ftmq-4.3.2/ftmq/logging.py +0 -105
  15. {ftmq-4.3.2 → ftmq-4.5.2}/LICENSE +0 -0
  16. {ftmq-4.3.2 → ftmq-4.5.2}/NOTICE +0 -0
  17. {ftmq-4.3.2 → ftmq-4.5.2}/README.md +0 -0
  18. {ftmq-4.3.2 → ftmq-4.5.2}/ftmq/aggregate.py +0 -0
  19. {ftmq-4.3.2 → ftmq-4.5.2}/ftmq/aggregations.py +0 -0
  20. {ftmq-4.3.2 → ftmq-4.5.2}/ftmq/enums.py +0 -0
  21. {ftmq-4.3.2 → ftmq-4.5.2}/ftmq/filters.py +0 -0
  22. {ftmq-4.3.2 → ftmq-4.5.2}/ftmq/model/__init__.py +0 -0
  23. {ftmq-4.3.2 → ftmq-4.5.2}/ftmq/model/entity.py +0 -0
  24. {ftmq-4.3.2 → ftmq-4.5.2}/ftmq/query.py +0 -0
  25. {ftmq-4.3.2 → ftmq-4.5.2}/ftmq/similar.py +0 -0
  26. {ftmq-4.3.2 → ftmq-4.5.2}/ftmq/sql.py +0 -0
  27. {ftmq-4.3.2 → ftmq-4.5.2}/ftmq/store/__init__.py +0 -0
  28. {ftmq-4.3.2 → ftmq-4.5.2}/ftmq/store/aleph.py +0 -0
  29. {ftmq-4.3.2 → ftmq-4.5.2}/ftmq/store/fragments/__init__.py +0 -0
  30. {ftmq-4.3.2 → ftmq-4.5.2}/ftmq/store/fragments/dataset.py +0 -0
  31. {ftmq-4.3.2 → ftmq-4.5.2}/ftmq/store/fragments/loader.py +0 -0
  32. {ftmq-4.3.2 → ftmq-4.5.2}/ftmq/store/fragments/settings.py +0 -0
  33. {ftmq-4.3.2 → ftmq-4.5.2}/ftmq/store/fragments/store.py +0 -0
  34. {ftmq-4.3.2 → ftmq-4.5.2}/ftmq/store/fragments/utils.py +0 -0
  35. {ftmq-4.3.2 → ftmq-4.5.2}/ftmq/store/level.py +0 -0
  36. {ftmq-4.3.2 → ftmq-4.5.2}/ftmq/store/memory.py +0 -0
  37. {ftmq-4.3.2 → ftmq-4.5.2}/ftmq/store/redis.py +0 -0
  38. {ftmq-4.3.2 → ftmq-4.5.2}/ftmq/types.py +0 -0
{ftmq-4.3.2 → ftmq-4.5.2}/PKG-INFO

@@ -1,13 +1,13 @@
 Metadata-Version: 2.4
 Name: ftmq
-Version: 4.3.2
+Version: 4.5.2
 Summary: followthemoney query dsl and io helpers
 License: AGPLv3+
 License-File: LICENSE
 License-File: NOTICE
 Author: Simon Wörpel
 Author-email: simon.woerpel@pm.me
-Requires-Python: >=3.11,<4
+Requires-Python: >=3.11,<3.14
 Classifier: Intended Audience :: Developers
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python
@@ -21,25 +21,24 @@ Provides-Extra: postgres
 Provides-Extra: redis
 Provides-Extra: sql
 Requires-Dist: alephclient (>=2.6.0,<3.0.0) ; extra == "aleph"
-Requires-Dist: anystore (>=0.4.0,<0.5.0)
+Requires-Dist: anystore (>=1.0.0,<2.0.0)
 Requires-Dist: click (>=8.2.1,<9.0.0)
 Requires-Dist: click-default-group (>=1.2.4,<2.0.0)
-Requires-Dist: deltalake (>=1.2.1,<2.0.0) ; extra == "lake"
-Requires-Dist: duckdb (>=1.4.1,<2.0.0) ; extra == "lake"
+Requires-Dist: deltalake (>=1.4.1,<2.0.0) ; extra == "lake"
+Requires-Dist: duckdb (>=1.4.4,<2.0.0) ; extra == "lake"
 Requires-Dist: fakeredis (>=2.26.2,<3.0.0) ; extra == "redis"
-Requires-Dist: followthemoney (>=4.3.2,<5.0.0)
+Requires-Dist: followthemoney (>=4.5.1,<5.0.0)
 Requires-Dist: furl (>=2.1.4,<3.0.0) ; extra == "aleph"
-Requires-Dist: nomenklatura (>=4.1.10,<5.0.0)
+Requires-Dist: nomenklatura (>=4.6.1,<5.0.0)
 Requires-Dist: orjson (>=3.10.18,<4.0.0)
-Requires-Dist: pandas (>=2.3.3,<3.0.0) ; extra == "lake"
+Requires-Dist: pandas (>=3.0.0,<4.0.0) ; extra == "lake"
 Requires-Dist: plyvel (>=1.5.1,<2.0.0) ; extra == "level"
 Requires-Dist: psycopg[pool] (>=3.2.9,<4.0.0) ; extra == "postgres"
-Requires-Dist: pyarrow (>=21.0.0,<22.0.0) ; extra == "lake"
-Requires-Dist: pycountry (>=24.6.1,<25.0.0)
+Requires-Dist: pyarrow (>=23.0.0,<24.0.0) ; extra == "lake"
 Requires-Dist: pydantic (>=2.11.3,<3.0.0)
 Requires-Dist: pyicu (>=2.15.2,<3.0.0)
 Requires-Dist: redis (>=5.2.1,<6.0.0) ; extra == "redis"
-Requires-Dist: rigour (>=1.4.1,<2.0.0)
+Requires-Dist: rigour (>=1.6.2,<2.0.0)
 Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0) ; extra == "postgres"
 Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0) ; extra == "sql"
 Project-URL: Documentation, https://docs.investigraph.dev/lib/ftmq

{ftmq-4.3.2 → ftmq-4.5.2}/ftmq/__init__.py

@@ -2,7 +2,7 @@ from ftmq.io import smart_read_proxies, smart_write_proxies
 from ftmq.query import Query
 from ftmq.util import make_entity
 
-__version__ = "4.3.2"
+__version__ = "4.5.2"
 __all__ = [
     "smart_read_proxies",
     "smart_write_proxies",

{ftmq-4.3.2 → ftmq-4.5.2}/ftmq/cli.py

@@ -2,13 +2,13 @@ from datetime import datetime
 
 import click
 from anystore.io import smart_write, smart_write_json, smart_write_model
+from anystore.logging import configure_logging, get_logger
 from click_default_group import DefaultGroup
 from followthemoney import ValueEntity
 from nomenklatura import settings
 
 from ftmq.aggregate import aggregate
 from ftmq.io import smart_read_proxies, smart_write_proxies
-from ftmq.logging import configure_logging, get_logger
 from ftmq.model.dataset import Catalog, Dataset
 from ftmq.model.stats import Collector
 from ftmq.query import Query

{ftmq-4.3.2 → ftmq-4.5.2}/ftmq/io.py

@@ -1,11 +1,12 @@
 from typing import Any, Iterable, Type
 
 import orjson
-from anystore.io import Uri, smart_open, smart_stream
+from anystore.io import smart_open, smart_stream
+from anystore.logging import get_logger
+from anystore.types import Uri
 from banal import is_listish
 from followthemoney import E, StatementEntity, ValueEntity
 
-from ftmq.logging import get_logger
 from ftmq.query import Query
 from ftmq.store import Store, get_store
 from ftmq.types import Entities, Entity
@@ -13,8 +14,6 @@ from ftmq.util import ensure_entity, make_entity
 
 log = get_logger(__name__)
 
-DEFAULT_MODE = "rb"
-
 
 def smart_get_store(uri: Uri, **kwargs) -> Store | None:
     try:

{ftmq-4.3.2 → ftmq-4.5.2}/ftmq/model/dataset.py

@@ -1,19 +1,22 @@
 from datetime import datetime
-from typing import Literal
+from typing import Literal, TypeVar
 
 from anystore.io import logged_items
-from anystore.types import SDict
+from anystore.types import HttpUrlStr, SDict
 from followthemoney.dataset import DataPublisher
 from followthemoney.dataset.dataset import DatasetModel as _DatasetModel
-from pydantic import AnyUrl, HttpUrl
+from pydantic import AnyUrl
 from rigour.mime.types import FTM
 
 from ftmq.model.mixins import BaseModel
 from ftmq.model.stats import DatasetStats
 from ftmq.types import Entities
+from ftmq.util import DEFAULT_DATASET
 
 ContentType = Literal["documents", "structured", "mixed"]
 
+D = TypeVar("D", bound="Dataset")
+
 
 class Dataset(BaseModel, _DatasetModel):
     prefix: str | None = None
@@ -57,9 +60,9 @@ class Catalog(BaseModel):
     description: str | None = None
     maintainer: DataPublisher | None = None
     publisher: DataPublisher | None = None
-    url: HttpUrl | None = None
+    url: HttpUrlStr | None = None
     uri: str | None = None
-    logo_url: HttpUrl | None = None
+    logo_url: HttpUrlStr | None = None
     git_repo: AnyUrl | None = None
 
     def iterate(self) -> Entities:
@@ -70,3 +73,8 @@ class Catalog(BaseModel):
     def names(self) -> set[str]:
         """Get the names of all datasets in the catalog."""
         return {d.name for d in self.datasets}
+
+
+def make_dataset(name: str = DEFAULT_DATASET, cls: type[D] = Dataset, **kwargs) -> D:
+    kwargs["title"] = kwargs.pop("title", name)
+    return cls(name=name, **kwargs)

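The new make_dataset helper defaults a dataset's title to its name. A minimal usage sketch (the dataset names below are made up; the behaviour follows the function body shown in this hunk):

from ftmq.model.dataset import Dataset, make_dataset

ds = make_dataset("my_dataset")                     # title falls back to the name
custom = make_dataset("leaks", title="ACME Leaks")  # explicit title wins
# ds.title == "my_dataset"; both are Dataset instances unless cls= is overridden
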
{ftmq-4.3.2 → ftmq-4.5.2}/ftmq/model/mixins.py

@@ -1,4 +1,4 @@
-from anystore.mixins import BaseModel as _BaseModel
+from anystore.model.base import BaseModel as _BaseModel
 from pydantic import Field
 
 

{ftmq-4.3.2 → ftmq-4.5.2}/ftmq/model/stats.py

@@ -2,8 +2,9 @@ from collections import Counter
 from datetime import datetime
 from typing import Any
 
+from anystore.model import BaseModel
 from followthemoney import model
-from pydantic import BaseModel, model_validator
+from pydantic import model_validator
 
 from ftmq.types import Entities, Entity
 from ftmq.util import get_country_name, get_year_from_iso

ftmq-4.5.2/ftmq/py.typed (new empty marker file, no content to diff)

{ftmq-4.3.2 → ftmq-4.5.2}/ftmq/store/base.py

@@ -1,25 +1,26 @@
-from typing import Generator, Iterable
+from typing import Generator, Generic, Iterable, TypeVar
 from urllib.parse import urlparse
 
 from anystore.functools import weakref_cache as cache
-from followthemoney import DefaultDataset
+from anystore.logging import get_logger
 from followthemoney.dataset.dataset import Dataset
 from nomenklatura import store as nk
 from nomenklatura.db import get_engine
 from nomenklatura.resolver import Resolver
 
 from ftmq.aggregations import AggregatorResult
-from ftmq.logging import get_logger
 from ftmq.model.stats import Collector, DatasetStats
 from ftmq.query import Query
 from ftmq.similar import get_similar
 from ftmq.types import StatementEntities, StatementEntity
-from ftmq.util import ensure_dataset
+from ftmq.util import DEFAULT_DATASET, ensure_dataset
 
 log = get_logger(__name__)
 
 DEFAULT_ORIGIN = "default"
 
+V = TypeVar("V", bound="View")
+
 
 @cache
 def get_resolver(uri: str | None = None) -> Resolver[StatementEntity]:
@@ -28,7 +29,7 @@ def get_resolver(uri: str | None = None) -> Resolver[StatementEntity]:
     return Resolver.make_default(get_engine("sqlite:///:memory:"))
 
 
-class Store(nk.Store):
+class Store(nk.Store[Dataset, StatementEntity], Generic[V]):
     """
     Feature add-ons to `nomenklatura.store.Store`
     """
@@ -51,7 +52,7 @@ class Store(nk.Store):
         linker = linker or get_resolver(kwargs.get("uri"))
         super().__init__(dataset=dataset, linker=linker, **kwargs)
         # implicit set all datasets as default store scope:
-        if dataset == DefaultDataset and not dataset.leaf_names:
+        if dataset.name == DEFAULT_DATASET and not dataset.leaf_names:
             self.dataset = self.get_scope()
 
     def get_scope(self) -> Dataset:
@@ -60,6 +61,12 @@ class Store(nk.Store):
         """
         raise NotImplementedError
 
+    def view(self, scope: Dataset | None = None, external: bool = False) -> V:
+        raise NotImplementedError
+
+    def default_view(self, external: bool = False) -> V:
+        return self.view(self.dataset, external)
+
     def iterate(self, dataset: str | Dataset | None = None) -> StatementEntities:
         """
         Iterate all the entities, optional filter for a dataset.

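The base Store is now generic over its view class and grows view() / default_view() stubs. A rough sketch of the intended call pattern (the memory:/// uri and the Query filter are illustrative assumptions, not taken from this diff):

from ftmq import Query
from ftmq.store import get_store

store = get_store("memory:///")   # any concrete Store backend (uri assumed)
view = store.default_view()       # shorthand for store.view(store.dataset)
for proxy in view.query(Query().where(schema="Person")):
    print(proxy.id)
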
{ftmq-4.3.2 → ftmq-4.5.2}/ftmq/store/lake.py

@@ -19,16 +19,15 @@ Layout:
 """
 
 from pathlib import Path
-from typing import Any, Generator, Iterable
+from typing import Any, Generator
 from urllib.parse import urlparse
 
 import duckdb
-import numpy as np
-import pandas as pd
+import pyarrow as pa
 from anystore.functools import weakref_cache as cache
-from anystore.lock import Lock
+from anystore.interface.lock import Lock
 from anystore.logging import get_logger
-from anystore.store.fs import Store as FSStore
+from anystore.store import Store as FSStore
 from anystore.types import SDict
 from anystore.util import clean_dict
 from deltalake import (
@@ -39,6 +38,7 @@ from deltalake import (
     write_deltalake,
 )
 from deltalake._internal import TableNotFoundError
+from deltalake.table import FilterConjunctionType
 from followthemoney import EntityProxy, StatementEntity, model
 from followthemoney.dataset.dataset import Dataset
 from followthemoney.statement import Statement
@@ -51,17 +51,19 @@ from sqlalchemy import Boolean, DateTime, column, select, table
 from sqlalchemy.sql import Select
 
 from ftmq.query import Query
-from ftmq.store.base import Store
+from ftmq.store.base import DEFAULT_ORIGIN, Store
 from ftmq.store.sql import SQLQueryView, SQLStore
 from ftmq.types import StatementEntities
 from ftmq.util import apply_dataset, ensure_entity, get_scope_dataset
 
 log = get_logger(__name__)
 
-Z_ORDER = ["canonical_id", "entity_id", "schema", "prop"]
+Z_ORDER = ["canonical_id", "prop"]  # don't add more columns here
 TARGET_SIZE = 50 * 10_485_760  # 500 MB
 PARTITION_BY = ["dataset", "bucket", "origin"]
-DEFAULT_ORIGIN = "default"
+BUCKET_MENTION = "mention"
+BUCKET_PAGE = "page"
+BUCKET_PAGES = "pages"
 BUCKET_DOCUMENT = "document"
 BUCKET_INTERVAL = "interval"
 BUCKET_THING = "thing"
@@ -82,9 +84,15 @@ WRITER = WriterProperties(
         "schema": STATISTICS,
         "prop": STATISTICS_BLOOM,
         "value": STATISTICS_BLOOM,
+        "last_seen": ColumnProperties(statistics_enabled="CHUNK"),
     },
 )
 
+SA_TO_ARROW: dict[type, pa.DataType] = {
+    Boolean: pa.bool_(),
+    DateTime: pa.timestamp("us"),
+}
+
 TABLE = table(
     nks.STATEMENT_TABLE,
     column("id"),
@@ -93,6 +101,7 @@ TABLE = table(
     column("dataset"),
     column("bucket"),
     column("origin"),
+    column("source"),
     column("schema"),
     column("prop"),
     column("prop_type"),
@@ -104,6 +113,10 @@ TABLE = table(
     column("last_seen", DateTime),
 )
 
+ARROW_SCHEMA = pa.schema(
+    [(col.name, SA_TO_ARROW.get(type(col.type), pa.string())) for col in TABLE.columns]
+)
+
 
 class StorageSettings(BaseSettings):
     model_config = SettingsConfigDict(env_file=".env", extra="ignore")
@@ -148,22 +161,26 @@ def storage_options() -> SDict:
 @cache
 def setup_duckdb_storage() -> None:
     if storage_settings.secret:
-        duckdb.query(
-            f"""CREATE OR REPLACE SECRET secret (
+        duckdb.query(f"""CREATE OR REPLACE SECRET secret (
             TYPE s3,
             PROVIDER config,
             KEY_ID '{storage_settings.key}',
             SECRET '{storage_settings.secret}',
-            ENDPOINT '{storage_settings.endpoint}',
+            ENDPOINT '{storage_settings.duckdb_endpoint}',
             URL_STYLE 'path',
             USE_SSL '{not storage_settings.allow_http}'
-        );"""
-        )
+        );""")
 
 
 @cache
 def get_schema_bucket(schema_name: str) -> str:
     s = model[schema_name]
+    if s.is_a("Mention"):
+        return BUCKET_MENTION
+    if s.is_a("Page"):
+        return BUCKET_PAGE
+    if s.is_a("Pages"):
+        return BUCKET_PAGES
     if s.is_a("Document"):
         return BUCKET_DOCUMENT
     if s.is_a("Interval"):
@@ -171,19 +188,13 @@ def get_schema_bucket(schema_name: str) -> str:
     return BUCKET_THING
 
 
-def pack_statement(stmt: Statement) -> SDict:
+def pack_statement(stmt: Statement, source: str | None = None) -> SDict:
     data = stmt.to_db_row()
     data["bucket"] = get_schema_bucket(data["schema"])
+    data["source"] = source
     return data
 
 
-def pack_statements(statements: Iterable[Statement]) -> pd.DataFrame:
-    df = pd.DataFrame(map(pack_statement, statements))
-    df = df.drop_duplicates()  # .sort_values(Z_ORDER)
-    df = df.fillna(np.nan)
-    return df
-
-
 def compile_query(q: Select) -> str:
     table = nks.STATEMENT_TABLE
     sql = str(q.compile(compile_kwargs={"literal_binds": True}))
@@ -237,15 +248,12 @@ class LakeQueryView(SQLQueryView):
         yield from super().query(query)
 
 
-class LakeStore(SQLStore):
+class LakeStore(SQLStore[LakeQueryView]):
     def __init__(self, *args, **kwargs) -> None:
-        self._backend: FSStore = FSStore(uri=kwargs.pop("uri"))
+        self._backend = FSStore(uri=kwargs.pop("uri"))
         self._partition_by = kwargs.pop("partition_by", PARTITION_BY)
         self._lock: Lock = kwargs.pop("lock", Lock(self._backend))
         self._enforce_dataset = kwargs.pop("enforce_dataset", False)
-        assert isinstance(
-            self._backend, FSStore
-        ), f"Invalid store backend: `{self._backend.__class__}"
        kwargs["uri"] = "sqlite:///:memory:"  # fake it till you make it
         get_metadata.cache_clear()
         super().__init__(*args, **kwargs)
@@ -275,12 +283,14 @@ class LakeStore(SQLStore):
 
     def view(
         self, scope: Dataset | None = None, external: bool = False
-    ) -> SQLQueryView:
+    ) -> LakeQueryView:
         scope = scope or self.dataset
         return LakeQueryView(self, scope, external)
 
-    def writer(self, origin: str | None = DEFAULT_ORIGIN) -> "LakeWriter":
-        return LakeWriter(self, origin=origin or DEFAULT_ORIGIN)
+    def writer(
+        self, origin: str | None = DEFAULT_ORIGIN, source: str | None = None
+    ) -> "LakeWriter":
+        return LakeWriter(self, origin=origin or DEFAULT_ORIGIN, source=source)
 
     def get_origins(self) -> set[str]:
         q = select(self.table.c.origin).distinct()
@@ -291,32 +301,55 @@ class LakeWriter(nk.Writer):
     store: LakeStore
     BATCH_STATEMENTS = 1_000_000
 
-    def __init__(self, store: Store, origin: str | None = DEFAULT_ORIGIN):
+    def __init__(
+        self,
+        store: Store,
+        origin: str | None = DEFAULT_ORIGIN,
+        source: str | None = None,
+    ):
         super().__init__(store)
-        self.batch: set[Statement] = set()
+        self.batch: dict[str, tuple[Statement, str | None]] = {}
         self.origin = origin or DEFAULT_ORIGIN
+        self.source = source
 
-    def add_statement(self, stmt: Statement) -> None:
+    def add_statement(self, stmt: Statement, source: str | None = None) -> None:
         if stmt.entity_id is None:
             return
         stmt.origin = stmt.origin or self.origin
         canonical_id = self.store.linker.get_canonical(stmt.entity_id)
         stmt.canonical_id = canonical_id
-        self.batch.add(stmt)
-
-    def add_entity(self, entity: EntityProxy, origin: str | None = None) -> None:
+        key = f"{canonical_id}\t{stmt.id}"
+        self.batch[key] = (stmt, source or self.source)
+
+    def add_entity(
+        self,
+        entity: EntityProxy,
+        origin: str | None = None,
+        source: str | None = None,
+    ) -> None:
         e = ensure_entity(entity, StatementEntity, self.store.dataset)
         if self.store._enforce_dataset:
             e = apply_dataset(e, self.store.dataset, replace=True)
         for stmt in e.statements:
             if origin:
                 stmt.origin = origin
-            self.add_statement(stmt)
+            self.add_statement(stmt, source=source)
         # we check here instead of in `add_statement` as this will keep entities
-        # together in the same parquet files`
+        # together in the same parquet files
        if len(self.batch) >= self.BATCH_STATEMENTS:
             self.flush()
 
+    def _pack_batches(self) -> Generator[pa.RecordBatch, None, None]:
+        batch: list[SDict] = []
+        for key in sorted(self.batch):
+            stmt, source = self.batch[key]
+            batch.append(pack_statement(stmt, source))
+            if len(batch) >= 100_000:
+                yield pa.RecordBatch.from_pylist(batch, schema=ARROW_SCHEMA)
+                batch = []
+        if batch:
+            yield pa.RecordBatch.from_pylist(batch, schema=ARROW_SCHEMA)
+
     def flush(self) -> None:
         if self.batch:
             log.info(
@@ -324,18 +357,22 @@ class LakeWriter(nk.Writer):
                 uri=self.store.uri,
             )
             with self.store._lock:
+                reader = pa.RecordBatchReader.from_batches(
+                    ARROW_SCHEMA, self._pack_batches()
+                )
                 write_deltalake(
                     str(self.store.uri),
-                    pack_statements(self.batch),
+                    reader,
                     partition_by=self.store._partition_by,
                     mode="append",
                     schema_mode="merge",
                     writer_properties=WRITER,
                     target_file_size=TARGET_SIZE,
                     storage_options=storage_options(),
+                    configuration={"delta.enableChangeDataFeed": "true"},
                 )
 
-        self.batch = set()
+        self.batch = {}
 
     def pop(self, entity_id: str) -> list[Statement]:
         q = select(TABLE)
@@ -348,18 +385,42 @@ class LakeWriter(nk.Writer):
         return statements
 
     def optimize(
-        self, vacuum: bool | None = False, vacuum_keep_hours: int | None = 0
+        self,
+        vacuum: bool | None = False,
+        vacuum_keep_hours: int | None = 0,
+        dataset: str | None = None,
+        bucket: str | None = None,
+        origin: str | None = None,
     ) -> None:
         """
         Optimize the storage: Z-Ordering and compacting
+
+        Args:
+            vacuum: Run vacuum after optimization
+            vacuum_keep_hours: Retention hours for vacuum
+            dataset: Filter optimization to specific dataset partition
+            bucket: Filter optimization to specific bucket partition
+            origin: Filter optimization to specific origin partition
         """
-        self.store.deltatable.optimize.z_order(
-            Z_ORDER, writer_properties=WRITER, target_size=TARGET_SIZE
-        )
-        if vacuum:
-            self.store.deltatable.vacuum(
-                retention_hours=vacuum_keep_hours,
-                enforce_retention_duration=False,
-                dry_run=False,
-                full=True,
+        filters: FilterConjunctionType = []
+        if dataset is not None:
+            filters.append(("dataset", "=", dataset))
+        if bucket is not None:
+            filters.append(("bucket", "=", bucket))
+        if origin is not None:
+            filters.append(("origin", "=", origin))
+
+        with self.store._lock:
+            self.store.deltatable.optimize.z_order(
+                Z_ORDER,
+                writer_properties=WRITER,
+                target_size=TARGET_SIZE,
+                partition_filters=filters or None,
             )
+            if vacuum:
+                self.store.deltatable.vacuum(
+                    retention_hours=vacuum_keep_hours,
+                    enforce_retention_duration=False,
+                    dry_run=False,
+                    full=True,
+                )

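A rough usage sketch of the reworked lake writer (source tagging, dict-based statement de-duplication, partition-scoped optimize). The uri, dataset, origin and source values are made up, and constructor arguments beyond uri are assumed defaults; only the writer(...), add_entity(...), flush() and optimize(...) signatures come from this diff:

from followthemoney import model
from ftmq.store.lake import LakeStore

store = LakeStore(uri="./data/lake")  # local Delta table location (assumed)

entity = model.make_entity("Person")
entity.id = "p1"
entity.add("name", "Jane Doe")

writer = store.writer(origin="crawl-2025", source="page-1.html")
writer.add_entity(entity)  # every statement is written with origin + source columns
writer.flush()             # packs sorted arrow record batches and appends to the lake

# compact + z-order a single dataset/bucket partition, then vacuum old files
writer.optimize(dataset="my_dataset", bucket="thing", vacuum=True, vacuum_keep_hours=24)
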
{ftmq-4.3.2 → ftmq-4.5.2}/ftmq/store/sql.py

@@ -1,12 +1,14 @@
 import os
 from collections import defaultdict
 from decimal import Decimal
+from typing import Generic
 
 from anystore.util import clean_dict
 from followthemoney.dataset.dataset import Dataset
 from nomenklatura.db import get_metadata
 from nomenklatura.store import sql as nk
 from sqlalchemy import select
+from typing_extensions import TypeVar
 
 from ftmq.aggregations import AggregatorResult
 from ftmq.enums import Fields
@@ -16,6 +18,8 @@ from ftmq.store.base import Store, View
 from ftmq.types import StatementEntities
 from ftmq.util import get_scope_dataset
 
+V = TypeVar("V", bound=View, default="SQLQueryView")
+
 MAX_SQL_AGG_GROUPS = int(os.environ.get("MAX_SQL_AGG_GROUPS", 10))
 
 
@@ -125,7 +129,7 @@ class SQLQueryView(View, nk.SQLView):
         return res
 
 
-class SQLStore(Store, nk.SQLStore):
+class SQLStore(Store[V], nk.SQLStore, Generic[V]):
     def __init__(self, *args, **kwargs) -> None:
         get_metadata.cache_clear()  # FIXME
         super().__init__(*args, **kwargs)
@@ -137,8 +141,6 @@ class SQLStore(Store, nk.SQLStore):
             names.add(row[0])
         return get_scope_dataset(*names)
 
-    def view(
-        self, scope: Dataset | None = None, external: bool = False
-    ) -> SQLQueryView:
+    def view(self, scope: Dataset | None = None, external: bool = False) -> V:
         scope = scope or self.dataset
-        return SQLQueryView(self, scope, external=external)
+        return SQLQueryView(self, scope, external=external)  # type: ignore[return-value]

{ftmq-4.3.2 → ftmq-4.5.2}/ftmq/util.py

@@ -1,7 +1,5 @@
-from functools import lru_cache
 from typing import Any, Generator, Type
 
-import pycountry
 from anystore.functools import weakref_cache as cache
 from anystore.types import SDict, StrGenerator
 from banal import ensure_list, is_listish
@@ -16,6 +14,7 @@ from followthemoney.util import make_entity_id, sanitize_text
 from normality import latinize_text, slugify, squash_spaces
 from rigour.names import Name, Symbol, tag_org_name, tag_person_name
 from rigour.names.tokenize import normalize_name
+from rigour.territories import lookup_territory
 from rigour.text.scripts import can_latinize
 
 from ftmq.enums import Comparators
@@ -149,7 +148,7 @@ def apply_dataset(entity: E, dataset: str | Dataset, replace: bool | None = Fals
 def get_country_name(code: str) -> str:
     """
     Get the (english) country name for the given 2-letter iso code via
-    [pycountry](https://pypi.org/project/pycountry/)
+    [rigour.territories](https://rigour.followthemoney.tech/territories/)
 
     Examples:
         >>> get_country_name("de")
@@ -165,22 +164,17 @@ def get_country_name(code: str) -> str:
     Returns:
         Either the country name for a valid code or the code as fallback.
     """
-    code_clean = get_country_code(code)
-    if code_clean is None:
-        code_clean = code.lower()
-    try:
-        country = pycountry.countries.get(alpha_2=code_clean)
-        if country is not None:
-            return country.name
-    except (LookupError, AttributeError):
-        return code
-    return code_clean
+    territory = lookup_territory(code)
+    if territory is not None:
+        return territory.name
+    return code
 
 
-@lru_cache(1024)
+@cache
 def get_country_code(value: Any, splitter: str | None = ",") -> str | None:
     """
-    Get the 2-letter iso country code for an arbitrary country name
+    Get the 2-letter iso country code for an arbitrary country name via
+    [rigour.territories](https://rigour.followthemoney.tech/territories/)
 
     Examples:
         >>> get_country_code("Germany")
@@ -201,15 +195,16 @@ def get_country_code(value: Any, splitter: str | None = ",") -> str | None:
     """
     value = clean_string(value)
     if not value:
-        return
-    code = registry.country.clean_text(value)
-    if code:
-        return code
-    for token in value.split(splitter):
-        code = registry.country.clean_text(token)
-        if code:
-            return code
-    return
+        return None
+    territory = lookup_territory(value)
+    if territory is not None:
+        return territory.ftm_country
+    if splitter:
+        for token in value.split(splitter):
+            territory = lookup_territory(token.strip())
+            if territory is not None:
+                return territory.ftm_country
+    return None
 
 
 def join_slug(

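The docstring examples above, spelled out; the expected outputs assume rigour's territory data resolves these inputs the same way the removed pycountry path did:

from ftmq.util import get_country_code, get_country_name

get_country_name("de")               # -> "Germany"
get_country_name("xx")               # -> "xx" (unknown codes fall through unchanged)
get_country_code("Germany")          # -> "de"
get_country_code("Berlin, Germany")  # -> "de" (falls back to splitting on the "," splitter)
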
{ftmq-4.3.2 → ftmq-4.5.2}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "ftmq"
-version = "4.3.2"
+version = "4.5.2"
 description = "followthemoney query dsl and io helpers"
 authors = [{ name = "Simon Wörpel", email = "simon.woerpel@pm.me" }]
 license = "AGPLv3+"
@@ -12,19 +12,19 @@ classifiers = [
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
     "Programming Language :: Python :: 3.13",
+    # "Programming Language :: Python :: 3.14",
 ]
-requires-python = ">=3.11,<4"
+requires-python = ">=3.11,<3.14"
 dependencies = [
-    "anystore (>=0.4.0,<0.5.0)",
-    "followthemoney (>=4.3.2,<5.0.0)",
-    "nomenklatura (>=4.1.10,<5.0.0)",
-    "rigour (>=1.4.1,<2.0.0)",
+    "anystore (>=1.0.0,<2.0.0)",
+    "followthemoney (>=4.5.1,<5.0.0)",
+    "nomenklatura (>=4.6.1,<5.0.0)",
+    "rigour (>=1.6.2,<2.0.0)",
     "click (>=8.2.1,<9.0.0)",
     "click-default-group (>=1.2.4,<2.0.0)",
     "orjson (>=3.10.18,<4.0.0)",
     "pyicu (>=2.15.2,<3.0.0)",
     "pydantic (>=2.11.3,<3.0.0)",
-    "pycountry (>=24.6.1,<25.0.0)",
 ]
 
 [project.optional-dependencies]
@@ -33,10 +33,10 @@ sql = ["sqlalchemy (>=2.0.36,<3.0.0)"]
 postgres = ["sqlalchemy (>=2.0.36,<3.0.0)", "psycopg[pool] (>=3.2.9,<4.0.0)"]
 redis = ["redis (>=5.2.1,<6.0.0)", "fakeredis (>=2.26.2,<3.0.0)"]
 lake = [
-    "duckdb (>=1.4.1,<2.0.0)",
-    "pandas (>=2.3.3,<3.0.0)",
-    "deltalake (>=1.2.1,<2.0.0)",
-    "pyarrow (>=21.0.0,<22.0.0)",
+    "duckdb (>=1.4.4,<2.0.0)",
+    "deltalake (>=1.4.1,<2.0.0)",
+    "pyarrow (>=23.0.0,<24.0.0)",
+    "pandas (>=3.0.0,<4.0.0)",
 ]
 aleph = ["furl (>=2.1.4,<3.0.0)", "alephclient (>=2.6.0,<3.0.0)"]
 
@@ -50,10 +50,10 @@ Repository = "https://github.com/dataresearchcenter/ftmq"
 Issues = "https://github.com/dataresearchcenter/ftmq/issues"
 
 [tool.poetry.group.dev.dependencies]
-pytest = ">=7.4.3,<9.0.0"
+pytest = ">=7.4.3,<10.0.0"
 pytest-cov = ">=4.1,<8.0"
 pytest-env = "^1.1.1"
-black = ">=23.11,<26.0"
+black = ">=23.11,<27.0"
 isort = "^7.0.0"
 mypy = "^1.17.1"
 pre-commit = "^4.0.1"
@@ -62,7 +62,7 @@ ipdb = "^0.13.13"
 bump2version = "^1.0.1"
 mkdocs = "^1.6.1"
 mkdocs-autorefs = "^1.4.3"
-mkdocstrings-python = "^1.18.2"
+mkdocstrings-python = "^2.0.0"
 mkdocs-material = "^9.6.18"
 mkdocs-click = "^0.9.0"
 

ftmq-4.3.2/ftmq/logging.py (removed)

@@ -1,105 +0,0 @@
-import logging
-import os
-import sys
-from logging import Filter, LogRecord
-from typing import Any, Dict, List
-
-import structlog
-from banal import as_bool
-from structlog.contextvars import merge_contextvars
-from structlog.dev import ConsoleRenderer, set_exc_info
-from structlog.processors import (
-    JSONRenderer,
-    TimeStamper,
-    UnicodeDecoder,
-    add_log_level,
-    format_exc_info,
-)
-from structlog.stdlib import (
-    BoundLogger,
-    LoggerFactory,
-    ProcessorFormatter,
-    add_logger_name,
-)
-from structlog.stdlib import get_logger as get_raw_logger
-
-LOG_JSON = as_bool(os.environ.get("LOG_JSON"))
-LOG_LEVEL = os.environ.get("LOG_LEVEL", "info").upper()
-
-
-def get_logger(name: str, *args, **kwargs) -> BoundLogger:
-    return get_raw_logger(name, *args, **kwargs)
-
-
-def configure_logging(level: int = logging.INFO) -> None:
-    """Configure log levels and structured logging"""
-    shared_processors: List[Any] = [
-        add_log_level,
-        add_logger_name,
-        # structlog.stdlib.PositionalArgumentsFormatter(),
-        # structlog.processors.StackInfoRenderer(),
-        merge_contextvars,
-        set_exc_info,
-        TimeStamper(fmt="iso"),
-        # format_exc_info,
-        UnicodeDecoder(),
-    ]
-
-    if LOG_JSON:
-        shared_processors.append(format_exc_info)
-        shared_processors.append(format_json)
-        formatter = ProcessorFormatter(
-            foreign_pre_chain=shared_processors,
-            processor=JSONRenderer(),
-        )
-    else:
-        formatter = ProcessorFormatter(
-            foreign_pre_chain=shared_processors,
-            processor=ConsoleRenderer(
-                exception_formatter=structlog.dev.plain_traceback
-            ),
-        )
-
-    processors = shared_processors + [
-        ProcessorFormatter.wrap_for_formatter,
-    ]
-
-    # configuration for structlog based loggers
-    structlog.configure(
-        cache_logger_on_first_use=True,
-        # wrapper_class=AsyncBoundLogger,
-        wrapper_class=BoundLogger,
-        processors=processors,
-        context_class=dict,
-        logger_factory=LoggerFactory(),
-    )
-
-    # handler for low level logs that should be sent to STDERR
-    out_handler = logging.StreamHandler(sys.stderr)
-    out_handler.setLevel(level)
-    out_handler.addFilter(_MaxLevelFilter(logging.WARNING))
-    out_handler.setFormatter(formatter)
-    # handler for high level logs that should be sent to STDERR
-    error_handler = logging.StreamHandler(sys.stderr)
-    error_handler.setLevel(logging.ERROR)
-    error_handler.setFormatter(formatter)
-
-    root_logger = logging.getLogger()
-    root_logger.setLevel(LOG_LEVEL)
-    root_logger.addHandler(out_handler)
-    root_logger.addHandler(error_handler)
-
-
-def format_json(_: Any, __: Any, ed: Dict[str, str]) -> Dict[str, str]:
-    """Stackdriver uses `message` and `severity` keys to display logs"""
-    ed["message"] = ed.pop("event")
-    ed["severity"] = ed.pop("level", "info").upper()
-    return ed
-
-
-class _MaxLevelFilter(Filter):
-    def __init__(self, highest_log_level: int) -> None:
-        self._highest_log_level = highest_log_level
-
-    def filter(self, log_record: LogRecord) -> bool:
-        return log_record.levelno <= self._highest_log_level

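ftmq.logging is gone; the cli.py hunk above now imports its equivalents from anystore. A minimal migration sketch (assuming anystore's configure_logging keeps the same call-with-defaults behaviour as the removed module):

from anystore.logging import configure_logging, get_logger

configure_logging()          # replaces ftmq.logging.configure_logging
log = get_logger(__name__)   # same structlog-style bound logger
log.info("store initialized", uri="./data/lake")
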