ftmq 4.2.4.tar.gz → 4.3.0.tar.gz

This diff shows the changes between the two package versions as they were published to their public registry.


Files changed (37)
  1. {ftmq-4.2.4 → ftmq-4.3.0}/PKG-INFO +8 -8
  2. {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/__init__.py +1 -1
  3. {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/cli.py +58 -0
  4. {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/store/fragments/dataset.py +67 -6
  5. {ftmq-4.2.4 → ftmq-4.3.0}/pyproject.toml +11 -11
  6. {ftmq-4.2.4 → ftmq-4.3.0}/LICENSE +0 -0
  7. {ftmq-4.2.4 → ftmq-4.3.0}/NOTICE +0 -0
  8. {ftmq-4.2.4 → ftmq-4.3.0}/README.md +0 -0
  9. {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/aggregate.py +0 -0
  10. {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/aggregations.py +0 -0
  11. {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/enums.py +0 -0
  12. {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/filters.py +0 -0
  13. {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/io.py +0 -0
  14. {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/logging.py +0 -0
  15. {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/model/__init__.py +0 -0
  16. {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/model/dataset.py +0 -0
  17. {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/model/entity.py +0 -0
  18. {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/model/mixins.py +0 -0
  19. {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/model/stats.py +0 -0
  20. {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/query.py +0 -0
  21. {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/similar.py +0 -0
  22. {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/sql.py +0 -0
  23. {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/store/__init__.py +0 -0
  24. {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/store/aleph.py +0 -0
  25. {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/store/base.py +0 -0
  26. {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/store/fragments/__init__.py +0 -0
  27. {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/store/fragments/loader.py +0 -0
  28. {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/store/fragments/settings.py +0 -0
  29. {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/store/fragments/store.py +0 -0
  30. {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/store/fragments/utils.py +0 -0
  31. {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/store/lake.py +0 -0
  32. {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/store/level.py +0 -0
  33. {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/store/memory.py +0 -0
  34. {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/store/redis.py +0 -0
  35. {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/store/sql.py +0 -0
  36. {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/types.py +0 -0
  37. {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/util.py +0 -0
--- ftmq-4.2.4/PKG-INFO
+++ ftmq-4.3.0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: ftmq
-Version: 4.2.4
+Version: 4.3.0
 Summary: followthemoney query dsl and io helpers
 License: AGPLv3+
 Author: Simon Wörpel
@@ -19,17 +19,17 @@ Provides-Extra: postgres
 Provides-Extra: redis
 Provides-Extra: sql
 Requires-Dist: alephclient (>=2.6.0,<3.0.0) ; extra == "aleph"
-Requires-Dist: anystore (>=0.3.11,<0.4.0)
+Requires-Dist: anystore (>=0.4.0,<0.5.0)
 Requires-Dist: click (>=8.2.1,<9.0.0)
 Requires-Dist: click-default-group (>=1.2.4,<2.0.0)
-Requires-Dist: deltalake (>=1.1.4,<2.0.0) ; extra == "lake"
-Requires-Dist: duckdb (>=1.3.2,<2.0.0) ; extra == "lake"
+Requires-Dist: deltalake (>=1.2.0,<2.0.0) ; extra == "lake"
+Requires-Dist: duckdb (>=1.4.1,<2.0.0) ; extra == "lake"
 Requires-Dist: fakeredis (>=2.26.2,<3.0.0) ; extra == "redis"
-Requires-Dist: followthemoney (>=4.2.0,<5.0.0)
+Requires-Dist: followthemoney (>=4.3.0,<5.0.0)
 Requires-Dist: furl (>=2.1.4,<3.0.0) ; extra == "aleph"
-Requires-Dist: nomenklatura (>=4.1.2,<5.0.0)
+Requires-Dist: nomenklatura (>=4.1.9,<5.0.0)
 Requires-Dist: orjson (>=3.10.18,<4.0.0)
-Requires-Dist: pandas (>=2.3.2,<3.0.0) ; extra == "lake"
+Requires-Dist: pandas (>=2.3.3,<3.0.0) ; extra == "lake"
 Requires-Dist: plyvel (>=1.5.1,<2.0.0) ; extra == "level"
 Requires-Dist: psycopg[pool] (>=3.2.9,<4.0.0) ; extra == "postgres"
 Requires-Dist: pyarrow (>=21.0.0,<22.0.0) ; extra == "lake"
@@ -37,7 +37,7 @@ Requires-Dist: pycountry (>=24.6.1,<25.0.0)
 Requires-Dist: pydantic (>=2.11.3,<3.0.0)
 Requires-Dist: pyicu (>=2.15.2,<3.0.0)
 Requires-Dist: redis (>=5.2.1,<6.0.0) ; extra == "redis"
-Requires-Dist: rigour (>=1.3.0,<2.0.0)
+Requires-Dist: rigour (>=1.3.13,<2.0.0)
 Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0) ; extra == "postgres"
 Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0) ; extra == "sql"
 Project-URL: Documentation, https://docs.investigraph.dev/lib/ftmq
--- ftmq-4.2.4/ftmq/__init__.py
+++ ftmq-4.3.0/ftmq/__init__.py
@@ -2,7 +2,7 @@ from ftmq.io import smart_read_proxies, smart_write_proxies
 from ftmq.query import Query
 from ftmq.util import make_entity

-__version__ = "4.2.4"
+__version__ = "4.3.0"
 __all__ = [
     "smart_read_proxies",
     "smart_write_proxies",
--- ftmq-4.2.4/ftmq/cli.py
+++ ftmq-4.3.0/ftmq/cli.py
@@ -11,6 +11,9 @@ from ftmq.model.dataset import Catalog, Dataset
 from ftmq.model.stats import Collector
 from ftmq.query import Query
 from ftmq.store import get_store
+from ftmq.store.fragments import get_fragments
+from ftmq.store.fragments import get_store as get_fragments_store
+from ftmq.store.fragments.settings import Settings as FragmentsSettings
 from ftmq.util import apply_dataset, parse_unknown_filters

 log = get_logger(__name__)
@@ -311,6 +314,61 @@ def store_iterate(
     smart_write_proxies(output_uri, store.iterate())


+@cli.group()
+def fragments():
+    pass
+
+
+fragments_settings = FragmentsSettings()
+
+
+@fragments.command("list-datasets")
+@click.option(
+    "-i",
+    "--input-uri",
+    default=fragments_settings.database_uri,
+    show_default=True,
+    help="input file or uri",
+)
+@click.option(
+    "-o", "--output-uri", default="-", show_default=True, help="output file or uri"
+)
+def fragments_list_datasets(
+    input_uri: str = fragments_settings.database_uri,
+    output_uri: str = "-",
+):
+    """
+    List datasets within a fragments store
+    """
+    store = get_fragments_store(input_uri)
+    datasets = [ds.name for ds in store.all()]
+    smart_write(output_uri, "\n".join(datasets).encode() + b"\n")
+
+
+@fragments.command("iterate")
+@click.option(
+    "-i",
+    "--input-uri",
+    default=fragments_settings.database_uri,
+    show_default=True,
+    help="fragments store input uri",
+)
+@click.option(
+    "-o", "--output-uri", default="-", show_default=True, help="output file or uri"
+)
+@click.option("-d", "--dataset", required=True, help="Dataset name to iterate")
+def fragments_iterate(
+    input_uri: str = fragments_settings.database_uri,
+    output_uri: str = "-",
+    dataset: str = None,
+):
+    """
+    Iterate all entities from a fragments dataset
+    """
+    fragments = get_fragments(dataset, database_uri=input_uri)
+    smart_write_proxies(output_uri, fragments.iterate())
+
+
 @cli.command("aggregate")
 @click.option(
     "-i", "--input-uri", default="-", show_default=True, help="input file or uri"
--- ftmq-4.2.4/ftmq/store/fragments/dataset.py
+++ ftmq-4.3.0/ftmq/store/fragments/dataset.py
@@ -1,4 +1,5 @@
 import logging
+from contextlib import contextmanager
 from datetime import datetime
 from typing import Generator, Iterable, TypeAlias

@@ -9,6 +10,7 @@ from normality import slugify
 from sqlalchemy import (
     JSON,
     Column,
+    Connection,
     DateTime,
     String,
     Table,
@@ -43,6 +45,27 @@ except ImportError:
 EntityFragments: TypeAlias = Generator[EntityProxy, None, None]


+@contextmanager
+def disable_timeout(conn: Connection, store):
+    # for long running iterations (e.g. re-index in OpenAleph), for postgres we
+    # don't want to get cancelled if an idle_in_transaction_timeout is configured
+    # on the server
+    if store.is_postgres:
+        raw_conn = conn.connection.driver_connection
+        with raw_conn.cursor() as cursor:
+            cursor.execute("SET idle_in_transaction_session_timeout = 0")
+    try:
+        yield conn
+    finally:
+        if store.is_postgres:
+            try:
+                raw_conn = conn.connection.driver_connection
+                with raw_conn.cursor() as cursor:
+                    cursor.execute("SET idle_in_transaction_session_timeout = DEFAULT")
+            except Exception:
+                pass  # Connection might be closed
+
+
 class Fragments(object):
     def __init__(self, store, name, origin=NULL_ORIGIN):
         self.store = store
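The new `disable_timeout` helper guards long streaming reads against PostgreSQL's `idle_in_transaction_session_timeout`, which can terminate a session whose transaction sits idle while the consumer of a generator does slow per-row work. The underlying pattern, as a standalone sketch (not ftmq API; the DSN and table name are hypothetical):

    from sqlalchemy import create_engine, text

    engine = create_engine("postgresql+psycopg://localhost/demo")  # hypothetical DSN

    with engine.connect() as conn:
        # Relax the timeout for this session only; other sessions keep the
        # server-configured value.
        conn.execute(text("SET idle_in_transaction_session_timeout = 0"))
        try:
            result = conn.execution_options(stream_results=True).execute(
                text("SELECT id FROM fragments ORDER BY id")  # hypothetical table
            )
            for row in result:
                ...  # slow per-row work that may leave the transaction idle
        finally:
            # Restore the server default before the connection is reused.
            conn.execute(text("SET idle_in_transaction_session_timeout = DEFAULT"))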
@@ -123,12 +146,13 @@ class Fragments(object):
         # stmt = stmt.order_by(self.table.c.fragment)
         conn = self.store.engine.connect()
         try:
-            conn = conn.execution_options(stream_results=True)
-            for ent in conn.execute(stmt):
-                data = {"id": ent.id, "datasets": [self.name], **ent.entity}
-                if ent.origin != NULL_ORIGIN:
-                    data["origin"] = ent.origin
-                yield data
+            with disable_timeout(conn, self.store) as conn:
+                conn = conn.execution_options(stream_results=True)
+                for ent in conn.execute(stmt):
+                    data = {"id": ent.id, "datasets": [self.name], **ent.entity}
+                    if ent.origin != NULL_ORIGIN:
+                        data["origin"] = ent.origin
+                    yield data
         except Exception:
             self.reset()
             raise
@@ -146,6 +170,10 @@ class Fragments(object):
             raise

     def iterate(self, entity_id=None, skip_errors=False) -> EntityFragments:
+        if entity_id is None:
+            log.info("Using batched iteration for complete dataset.")
+            yield from self.iterate_batched()
+            return
         entity = None
         invalid = None
         fragments = 1
@@ -181,6 +209,39 @@ class Fragments(object):
         if entity is not None:
             yield entity

+    def iterate_batched(self, skip_errors=False, batch_size=10_000) -> EntityFragments:
+        """
+        For large datasets an overall sort is not feasible, so we iterate in
+        sorted batched IDs.
+        """
+        for entity_ids in self.get_sorted_id_batches(batch_size):
+            yield from self.iterate(entity_id=entity_ids, skip_errors=skip_errors)
+
+    def get_sorted_id_batches(
+        self, batch_size=10_000
+    ) -> Generator[list[str], None, None]:
+        """
+        Get sorted ID batches to speed up iteration and useful to parallelize
+        processing of iterator Entities
+        """
+        last_id = None
+        with self.store.engine.connect() as conn:
+            while True:
+                stmt = select(self.table.c.id).distinct()
+                if last_id is not None:
+                    stmt = stmt.where(self.table.c.id > last_id)
+                stmt = stmt.order_by(self.table.c.id).limit(batch_size)
+                try:
+                    res = conn.execute(stmt)
+                    entity_ids = [r.id for r in res.fetchall()]
+                    if not entity_ids:
+                        return
+                    yield entity_ids
+                    last_id = entity_ids[-1]
+                except Exception:
+                    self.reset()
+                    raise
+
     def statements(
         self,
         entity_ids: Iterable[str] | None = None,
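`get_sorted_id_batches` implements keyset pagination: instead of sorting the whole table once, it repeatedly selects the next `batch_size` distinct IDs greater than the last ID seen, keeping each query cheap on large tables. Since every batch is independent, batches can also be fanned out to workers, as the docstring suggests. A sketch under the assumption that `get_fragments` falls back to the configured default database URI when called with only a dataset name; the dataset name and per-entity work are hypothetical:

    from concurrent.futures import ProcessPoolExecutor

    from ftmq.store.fragments import get_fragments


    def process_batch(entity_ids: list[str]) -> int:
        # Each worker process opens its own store connection.
        fragments = get_fragments("my_dataset")
        return sum(1 for _ in fragments.iterate(entity_id=entity_ids))


    if __name__ == "__main__":
        fragments = get_fragments("my_dataset")
        with ProcessPoolExecutor() as pool:
            counts = pool.map(process_batch, fragments.get_sorted_id_batches(1_000))
            print("total entities:", sum(counts))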
--- ftmq-4.2.4/pyproject.toml
+++ ftmq-4.3.0/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "ftmq"
-version = "4.2.4"
+version = "4.3.0"
 description = "followthemoney query dsl and io helpers"
 authors = [{ name = "Simon Wörpel", email = "simon.woerpel@pm.me" }]
 license = "AGPLv3+"
@@ -15,10 +15,10 @@ classifiers = [
 ]
 requires-python = ">=3.11,<4"
 dependencies = [
-    "anystore (>=0.3.11,<0.4.0)",
-    "followthemoney (>=4.2.0,<5.0.0)",
-    "nomenklatura (>=4.1.2,<5.0.0)",
-    "rigour (>=1.3.0,<2.0.0)",
+    "anystore (>=0.4.0,<0.5.0)",
+    "followthemoney (>=4.3.0,<5.0.0)",
+    "nomenklatura (>=4.1.9,<5.0.0)",
+    "rigour (>=1.3.13,<2.0.0)",
     "click (>=8.2.1,<9.0.0)",
     "click-default-group (>=1.2.4,<2.0.0)",
     "orjson (>=3.10.18,<4.0.0)",
@@ -33,9 +33,9 @@ sql = ["sqlalchemy (>=2.0.36,<3.0.0)"]
 postgres = ["sqlalchemy (>=2.0.36,<3.0.0)", "psycopg[pool] (>=3.2.9,<4.0.0)"]
 redis = ["redis (>=5.2.1,<6.0.0)", "fakeredis (>=2.26.2,<3.0.0)"]
 lake = [
-    "duckdb (>=1.3.2,<2.0.0)",
-    "pandas (>=2.3.2,<3.0.0)",
-    "deltalake (>=1.1.4,<2.0.0)",
+    "duckdb (>=1.4.1,<2.0.0)",
+    "pandas (>=2.3.3,<3.0.0)",
+    "deltalake (>=1.2.0,<2.0.0)",
     "pyarrow (>=21.0.0,<22.0.0)",
 ]
 aleph = ["furl (>=2.1.4,<3.0.0)", "alephclient (>=2.6.0,<3.0.0)"]
@@ -51,10 +51,10 @@ Issues = "https://github.com/dataresearchcenter/ftmq/issues"

 [tool.poetry.group.dev.dependencies]
 pytest = ">=7.4.3,<9.0.0"
-pytest-cov = ">=4.1,<7.0"
+pytest-cov = ">=4.1,<8.0"
 pytest-env = "^1.1.1"
 black = ">=23.11,<26.0"
-isort = "^6.0.1"
+isort = "^6.1.0"
 mypy = "^1.17.1"
 pre-commit = "^4.0.1"
 flake8 = ">=6.1,<8.0"
@@ -62,7 +62,7 @@ ipdb = "^0.13.13"
 bump2version = "^1.0.1"
 mkdocs = "^1.6.1"
 mkdocs-autorefs = "^1.4.3"
-mkdocstrings-python = "^1.18.0"
+mkdocstrings-python = "^1.18.2"
 mkdocs-material = "^9.6.18"
 mkdocs-click = "^0.9.0"