ftmq 4.2.5__py3-none-any.whl → 4.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ftmq/__init__.py CHANGED
@@ -2,7 +2,7 @@ from ftmq.io import smart_read_proxies, smart_write_proxies
2
2
  from ftmq.query import Query
3
3
  from ftmq.util import make_entity
4
4
 
5
- __version__ = "4.2.5"
5
+ __version__ = "4.3.0"
6
6
  __all__ = [
7
7
  "smart_read_proxies",
8
8
  "smart_write_proxies",
ftmq/cli.py CHANGED
@@ -11,6 +11,9 @@ from ftmq.model.dataset import Catalog, Dataset
11
11
  from ftmq.model.stats import Collector
12
12
  from ftmq.query import Query
13
13
  from ftmq.store import get_store
14
+ from ftmq.store.fragments import get_fragments
15
+ from ftmq.store.fragments import get_store as get_fragments_store
16
+ from ftmq.store.fragments.settings import Settings as FragmentsSettings
14
17
  from ftmq.util import apply_dataset, parse_unknown_filters
15
18
 
16
19
  log = get_logger(__name__)
@@ -311,6 +314,61 @@ def store_iterate(
311
314
  smart_write_proxies(output_uri, store.iterate())
312
315
 
313
316
 
317
+ @cli.group()
318
+ def fragments():
319
+ pass
320
+
321
+
322
+ fragments_settings = FragmentsSettings()
323
+
324
+
325
+ @fragments.command("list-datasets")
326
+ @click.option(
327
+ "-i",
328
+ "--input-uri",
329
+ default=fragments_settings.database_uri,
330
+ show_default=True,
331
+ help="input file or uri",
332
+ )
333
+ @click.option(
334
+ "-o", "--output-uri", default="-", show_default=True, help="output file or uri"
335
+ )
336
+ def fragments_list_datasets(
337
+ input_uri: str = fragments_settings.database_uri,
338
+ output_uri: str = "-",
339
+ ):
340
+ """
341
+ List datasets within a fragments store
342
+ """
343
+ store = get_fragments_store(input_uri)
344
+ datasets = [ds.name for ds in store.all()]
345
+ smart_write(output_uri, "\n".join(datasets).encode() + b"\n")
346
+
347
+
348
+ @fragments.command("iterate")
349
+ @click.option(
350
+ "-i",
351
+ "--input-uri",
352
+ default=fragments_settings.database_uri,
353
+ show_default=True,
354
+ help="fragments store input uri",
355
+ )
356
+ @click.option(
357
+ "-o", "--output-uri", default="-", show_default=True, help="output file or uri"
358
+ )
359
+ @click.option("-d", "--dataset", required=True, help="Dataset name to iterate")
360
+ def fragments_iterate(
361
+ input_uri: str = fragments_settings.database_uri,
362
+ output_uri: str = "-",
363
+ dataset: str = None,
364
+ ):
365
+ """
366
+ Iterate all entities from a fragments dataset
367
+ """
368
+ fragments = get_fragments(dataset, database_uri=input_uri)
369
+ smart_write_proxies(output_uri, fragments.iterate())
370
+
371
+
314
372
  @cli.command("aggregate")
315
373
  @click.option(
316
374
  "-i", "--input-uri", default="-", show_default=True, help="input file or uri"
@@ -170,6 +170,10 @@ class Fragments(object):
170
170
  raise
171
171
 
172
172
  def iterate(self, entity_id=None, skip_errors=False) -> EntityFragments:
173
+ if entity_id is None:
174
+ log.info("Using batched iteration for complete dataset.")
175
+ yield from self.iterate_batched()
176
+ return
173
177
  entity = None
174
178
  invalid = None
175
179
  fragments = 1
@@ -205,6 +209,39 @@ class Fragments(object):
205
209
  if entity is not None:
206
210
  yield entity
207
211
 
212
+ def iterate_batched(self, skip_errors=False, batch_size=10_000) -> EntityFragments:
213
+ """
214
+ For large datasets an overall sort is not feasible, so we iterate in
215
+ sorted batched IDs.
216
+ """
217
+ for entity_ids in self.get_sorted_id_batches(batch_size):
218
+ yield from self.iterate(entity_id=entity_ids, skip_errors=skip_errors)
219
+
220
+ def get_sorted_id_batches(
221
+ self, batch_size=10_000
222
+ ) -> Generator[list[str], None, None]:
223
+ """
224
+ Get sorted ID batches to speed up iteration and useful to parallelize
225
+ processing of iterator Entities
226
+ """
227
+ last_id = None
228
+ with self.store.engine.connect() as conn:
229
+ while True:
230
+ stmt = select(self.table.c.id).distinct()
231
+ if last_id is not None:
232
+ stmt = stmt.where(self.table.c.id > last_id)
233
+ stmt = stmt.order_by(self.table.c.id).limit(batch_size)
234
+ try:
235
+ res = conn.execute(stmt)
236
+ entity_ids = [r.id for r in res.fetchall()]
237
+ if not entity_ids:
238
+ return
239
+ yield entity_ids
240
+ last_id = entity_ids[-1]
241
+ except Exception:
242
+ self.reset()
243
+ raise
244
+
208
245
  def statements(
209
246
  self,
210
247
  entity_ids: Iterable[str] | None = None,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: ftmq
3
- Version: 4.2.5
3
+ Version: 4.3.0
4
4
  Summary: followthemoney query dsl and io helpers
5
5
  License: AGPLv3+
6
6
  Author: Simon Wörpel
@@ -22,14 +22,14 @@ Requires-Dist: alephclient (>=2.6.0,<3.0.0) ; extra == "aleph"
22
22
  Requires-Dist: anystore (>=0.4.0,<0.5.0)
23
23
  Requires-Dist: click (>=8.2.1,<9.0.0)
24
24
  Requires-Dist: click-default-group (>=1.2.4,<2.0.0)
25
- Requires-Dist: deltalake (>=1.1.4,<2.0.0) ; extra == "lake"
26
- Requires-Dist: duckdb (>=1.4.0,<2.0.0) ; extra == "lake"
25
+ Requires-Dist: deltalake (>=1.2.0,<2.0.0) ; extra == "lake"
26
+ Requires-Dist: duckdb (>=1.4.1,<2.0.0) ; extra == "lake"
27
27
  Requires-Dist: fakeredis (>=2.26.2,<3.0.0) ; extra == "redis"
28
- Requires-Dist: followthemoney (>=4.2.2,<5.0.0)
28
+ Requires-Dist: followthemoney (>=4.3.0,<5.0.0)
29
29
  Requires-Dist: furl (>=2.1.4,<3.0.0) ; extra == "aleph"
30
30
  Requires-Dist: nomenklatura (>=4.1.9,<5.0.0)
31
31
  Requires-Dist: orjson (>=3.10.18,<4.0.0)
32
- Requires-Dist: pandas (>=2.3.2,<3.0.0) ; extra == "lake"
32
+ Requires-Dist: pandas (>=2.3.3,<3.0.0) ; extra == "lake"
33
33
  Requires-Dist: plyvel (>=1.5.1,<2.0.0) ; extra == "level"
34
34
  Requires-Dist: psycopg[pool] (>=3.2.9,<4.0.0) ; extra == "postgres"
35
35
  Requires-Dist: pyarrow (>=21.0.0,<22.0.0) ; extra == "lake"
@@ -37,7 +37,7 @@ Requires-Dist: pycountry (>=24.6.1,<25.0.0)
37
37
  Requires-Dist: pydantic (>=2.11.3,<3.0.0)
38
38
  Requires-Dist: pyicu (>=2.15.2,<3.0.0)
39
39
  Requires-Dist: redis (>=5.2.1,<6.0.0) ; extra == "redis"
40
- Requires-Dist: rigour (>=1.3.10,<2.0.0)
40
+ Requires-Dist: rigour (>=1.3.13,<2.0.0)
41
41
  Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0) ; extra == "postgres"
42
42
  Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0) ; extra == "sql"
43
43
  Project-URL: Documentation, https://docs.investigraph.dev/lib/ftmq
@@ -1,7 +1,7 @@
1
- ftmq/__init__.py,sha256=uJMQgEub6cttDVImesGqkC2HkgxIkYMHTloeCMrl2nQ,245
1
+ ftmq/__init__.py,sha256=yG0oVaBCNY1c6a2EIhTGutZEjNyiMgcn10v_8khS6sQ,245
2
2
  ftmq/aggregate.py,sha256=nyAI5w6jKG1T4Jf2yy1ruhPh0vY6p7JWOEYh0SLBdZY,2163
3
3
  ftmq/aggregations.py,sha256=YLu1WF3WgcI3tadWKqsoZk3f_3bYUJetIREy2N1u_EM,4794
4
- ftmq/cli.py,sha256=GScVXHnRlnZ-A22iMR5hCX4H3sHLrqyXnXMtKK-2b04,9837
4
+ ftmq/cli.py,sha256=roKQ1k-PUm15-mxD0fvjOwOhmpCBpuelYoECycfVjHw,11400
5
5
  ftmq/enums.py,sha256=4AJ6Ii8Bnbfz0BSznyK0IUopy-XMquuAuvud8ZcrD54,2521
6
6
  ftmq/filters.py,sha256=TKs454wbSvA5tPj8WbpIrMojctE2jGTLnrrVQKM9PE4,7908
7
7
  ftmq/io.py,sha256=gUbqoZuYXWwxoiJ505H7FhmXLYr5MEUiCEvqIiOkbgo,3789
@@ -18,7 +18,7 @@ ftmq/store/__init__.py,sha256=HH30KAHqo1kPr4qbjo6oKxdvgpF_XrqPgk8hSCkPbAY,3152
18
18
  ftmq/store/aleph.py,sha256=vENWpMgV1rViMwcef9wAwOGyrciCnSPRN-dSI103478,3977
19
19
  ftmq/store/base.py,sha256=1IgX4haeNUA9NTBCi-hBFVe0u44ra1nGdHAjSTOUEAs,4762
20
20
  ftmq/store/fragments/__init__.py,sha256=jHXHejqXe6sBNllAt-BuU24Ou8m5evmsD1UPac5J2GE,750
21
- ftmq/store/fragments/dataset.py,sha256=qVTkAl3erpNunbXb35cjwCB3nYRW8qjfLGuyRICLmLc,9135
21
+ ftmq/store/fragments/dataset.py,sha256=8mHJo7CW_9kyYgnXapvvZamaG33DXBy8lx8qmNbuD3s,10623
22
22
  ftmq/store/fragments/loader.py,sha256=iVh8F22IApe9MRY_Z2fOLvT80fCYstFyxu410l4pPQY,4066
23
23
  ftmq/store/fragments/settings.py,sha256=4c-BW-blVM9gC_IGPch03eExbZYFZ3V5h9yTfhcHvOI,303
24
24
  ftmq/store/fragments/store.py,sha256=LiSfg95LjEmyq2IUpX4CMtp2tE37SEBLtAw0EWufImM,2534
@@ -30,9 +30,9 @@ ftmq/store/redis.py,sha256=d0hkGF_BezdIfCMUshXWoQwvGmqT8JFblUMcCxzwkDA,433
30
30
  ftmq/store/sql.py,sha256=6h3-gDaTAlD-IkiOONcX-JbaAO9-QfSsMjjMPupclcQ,5216
31
31
  ftmq/types.py,sha256=HgF8eT3ynKnDUxBYFtoDytS-uN_CS7Yr3DHIX2r4tnk,774
32
32
  ftmq/util.py,sha256=CmbZXYAbsKbAjoWn8WxR1Sz4VPXc2gj9CkHwaTqpBG0,15691
33
- ftmq-4.2.5.dist-info/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
34
- ftmq-4.2.5.dist-info/METADATA,sha256=oyc17dvkbMkwDZH4qnNeUknc5toSzPvSYgJagx2Pgi4,5281
35
- ftmq-4.2.5.dist-info/NOTICE,sha256=LNgfzuMbk3kIP_KnyDiXO8rQJmDrLy_PQ7cAY8lCmMM,463
36
- ftmq-4.2.5.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
37
- ftmq-4.2.5.dist-info/entry_points.txt,sha256=YGDCjEiPgAMaQ5MqFKH8m-XIybehSXgarDucSlmeK3E,37
38
- ftmq-4.2.5.dist-info/RECORD,,
33
+ ftmq-4.3.0.dist-info/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
34
+ ftmq-4.3.0.dist-info/METADATA,sha256=uOlBTQdtpSar7Bc4GRiLh3RQd9tlxWhuX0UOUd9pbn8,5281
35
+ ftmq-4.3.0.dist-info/NOTICE,sha256=LNgfzuMbk3kIP_KnyDiXO8rQJmDrLy_PQ7cAY8lCmMM,463
36
+ ftmq-4.3.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
37
+ ftmq-4.3.0.dist-info/entry_points.txt,sha256=YGDCjEiPgAMaQ5MqFKH8m-XIybehSXgarDucSlmeK3E,37
38
+ ftmq-4.3.0.dist-info/RECORD,,
File without changes
File without changes
File without changes