ftmq 4.2.5__tar.gz → 4.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ftmq might be problematic. Click here for more details.
- {ftmq-4.2.5 → ftmq-4.3.0}/PKG-INFO +6 -6
- {ftmq-4.2.5 → ftmq-4.3.0}/ftmq/__init__.py +1 -1
- {ftmq-4.2.5 → ftmq-4.3.0}/ftmq/cli.py +58 -0
- {ftmq-4.2.5 → ftmq-4.3.0}/ftmq/store/fragments/dataset.py +37 -0
- {ftmq-4.2.5 → ftmq-4.3.0}/pyproject.toml +6 -6
- {ftmq-4.2.5 → ftmq-4.3.0}/LICENSE +0 -0
- {ftmq-4.2.5 → ftmq-4.3.0}/NOTICE +0 -0
- {ftmq-4.2.5 → ftmq-4.3.0}/README.md +0 -0
- {ftmq-4.2.5 → ftmq-4.3.0}/ftmq/aggregate.py +0 -0
- {ftmq-4.2.5 → ftmq-4.3.0}/ftmq/aggregations.py +0 -0
- {ftmq-4.2.5 → ftmq-4.3.0}/ftmq/enums.py +0 -0
- {ftmq-4.2.5 → ftmq-4.3.0}/ftmq/filters.py +0 -0
- {ftmq-4.2.5 → ftmq-4.3.0}/ftmq/io.py +0 -0
- {ftmq-4.2.5 → ftmq-4.3.0}/ftmq/logging.py +0 -0
- {ftmq-4.2.5 → ftmq-4.3.0}/ftmq/model/__init__.py +0 -0
- {ftmq-4.2.5 → ftmq-4.3.0}/ftmq/model/dataset.py +0 -0
- {ftmq-4.2.5 → ftmq-4.3.0}/ftmq/model/entity.py +0 -0
- {ftmq-4.2.5 → ftmq-4.3.0}/ftmq/model/mixins.py +0 -0
- {ftmq-4.2.5 → ftmq-4.3.0}/ftmq/model/stats.py +0 -0
- {ftmq-4.2.5 → ftmq-4.3.0}/ftmq/query.py +0 -0
- {ftmq-4.2.5 → ftmq-4.3.0}/ftmq/similar.py +0 -0
- {ftmq-4.2.5 → ftmq-4.3.0}/ftmq/sql.py +0 -0
- {ftmq-4.2.5 → ftmq-4.3.0}/ftmq/store/__init__.py +0 -0
- {ftmq-4.2.5 → ftmq-4.3.0}/ftmq/store/aleph.py +0 -0
- {ftmq-4.2.5 → ftmq-4.3.0}/ftmq/store/base.py +0 -0
- {ftmq-4.2.5 → ftmq-4.3.0}/ftmq/store/fragments/__init__.py +0 -0
- {ftmq-4.2.5 → ftmq-4.3.0}/ftmq/store/fragments/loader.py +0 -0
- {ftmq-4.2.5 → ftmq-4.3.0}/ftmq/store/fragments/settings.py +0 -0
- {ftmq-4.2.5 → ftmq-4.3.0}/ftmq/store/fragments/store.py +0 -0
- {ftmq-4.2.5 → ftmq-4.3.0}/ftmq/store/fragments/utils.py +0 -0
- {ftmq-4.2.5 → ftmq-4.3.0}/ftmq/store/lake.py +0 -0
- {ftmq-4.2.5 → ftmq-4.3.0}/ftmq/store/level.py +0 -0
- {ftmq-4.2.5 → ftmq-4.3.0}/ftmq/store/memory.py +0 -0
- {ftmq-4.2.5 → ftmq-4.3.0}/ftmq/store/redis.py +0 -0
- {ftmq-4.2.5 → ftmq-4.3.0}/ftmq/store/sql.py +0 -0
- {ftmq-4.2.5 → ftmq-4.3.0}/ftmq/types.py +0 -0
- {ftmq-4.2.5 → ftmq-4.3.0}/ftmq/util.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: ftmq
|
|
3
|
-
Version: 4.2.5
|
|
3
|
+
Version: 4.3.0
|
|
4
4
|
Summary: followthemoney query dsl and io helpers
|
|
5
5
|
License: AGPLv3+
|
|
6
6
|
Author: Simon Wörpel
|
|
@@ -22,14 +22,14 @@ Requires-Dist: alephclient (>=2.6.0,<3.0.0) ; extra == "aleph"
|
|
|
22
22
|
Requires-Dist: anystore (>=0.4.0,<0.5.0)
|
|
23
23
|
Requires-Dist: click (>=8.2.1,<9.0.0)
|
|
24
24
|
Requires-Dist: click-default-group (>=1.2.4,<2.0.0)
|
|
25
|
-
Requires-Dist: deltalake (>=1.
|
|
26
|
-
Requires-Dist: duckdb (>=1.4.
|
|
25
|
+
Requires-Dist: deltalake (>=1.2.0,<2.0.0) ; extra == "lake"
|
|
26
|
+
Requires-Dist: duckdb (>=1.4.1,<2.0.0) ; extra == "lake"
|
|
27
27
|
Requires-Dist: fakeredis (>=2.26.2,<3.0.0) ; extra == "redis"
|
|
28
|
-
Requires-Dist: followthemoney (>=4.
|
|
28
|
+
Requires-Dist: followthemoney (>=4.3.0,<5.0.0)
|
|
29
29
|
Requires-Dist: furl (>=2.1.4,<3.0.0) ; extra == "aleph"
|
|
30
30
|
Requires-Dist: nomenklatura (>=4.1.9,<5.0.0)
|
|
31
31
|
Requires-Dist: orjson (>=3.10.18,<4.0.0)
|
|
32
|
-
Requires-Dist: pandas (>=2.3.
|
|
32
|
+
Requires-Dist: pandas (>=2.3.3,<3.0.0) ; extra == "lake"
|
|
33
33
|
Requires-Dist: plyvel (>=1.5.1,<2.0.0) ; extra == "level"
|
|
34
34
|
Requires-Dist: psycopg[pool] (>=3.2.9,<4.0.0) ; extra == "postgres"
|
|
35
35
|
Requires-Dist: pyarrow (>=21.0.0,<22.0.0) ; extra == "lake"
|
|
@@ -37,7 +37,7 @@ Requires-Dist: pycountry (>=24.6.1,<25.0.0)
|
|
|
37
37
|
Requires-Dist: pydantic (>=2.11.3,<3.0.0)
|
|
38
38
|
Requires-Dist: pyicu (>=2.15.2,<3.0.0)
|
|
39
39
|
Requires-Dist: redis (>=5.2.1,<6.0.0) ; extra == "redis"
|
|
40
|
-
Requires-Dist: rigour (>=1.3.
|
|
40
|
+
Requires-Dist: rigour (>=1.3.13,<2.0.0)
|
|
41
41
|
Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0) ; extra == "postgres"
|
|
42
42
|
Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0) ; extra == "sql"
|
|
43
43
|
Project-URL: Documentation, https://docs.investigraph.dev/lib/ftmq
|
|
@@ -11,6 +11,9 @@ from ftmq.model.dataset import Catalog, Dataset
|
|
|
11
11
|
from ftmq.model.stats import Collector
|
|
12
12
|
from ftmq.query import Query
|
|
13
13
|
from ftmq.store import get_store
|
|
14
|
+
from ftmq.store.fragments import get_fragments
|
|
15
|
+
from ftmq.store.fragments import get_store as get_fragments_store
|
|
16
|
+
from ftmq.store.fragments.settings import Settings as FragmentsSettings
|
|
14
17
|
from ftmq.util import apply_dataset, parse_unknown_filters
|
|
15
18
|
|
|
16
19
|
log = get_logger(__name__)
|
|
@@ -311,6 +314,61 @@ def store_iterate(
|
|
|
311
314
|
smart_write_proxies(output_uri, store.iterate())
|
|
312
315
|
|
|
313
316
|
|
|
317
|
+
@cli.group()
|
|
318
|
+
def fragments():
|
|
319
|
+
pass
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
fragments_settings = FragmentsSettings()
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
@fragments.command("list-datasets")
|
|
326
|
+
@click.option(
|
|
327
|
+
"-i",
|
|
328
|
+
"--input-uri",
|
|
329
|
+
default=fragments_settings.database_uri,
|
|
330
|
+
show_default=True,
|
|
331
|
+
help="input file or uri",
|
|
332
|
+
)
|
|
333
|
+
@click.option(
|
|
334
|
+
"-o", "--output-uri", default="-", show_default=True, help="output file or uri"
|
|
335
|
+
)
|
|
336
|
+
def fragments_list_datasets(
|
|
337
|
+
input_uri: str = fragments_settings.database_uri,
|
|
338
|
+
output_uri: str = "-",
|
|
339
|
+
):
|
|
340
|
+
"""
|
|
341
|
+
List datasets within a fragments store
|
|
342
|
+
"""
|
|
343
|
+
store = get_fragments_store(input_uri)
|
|
344
|
+
datasets = [ds.name for ds in store.all()]
|
|
345
|
+
smart_write(output_uri, "\n".join(datasets).encode() + b"\n")
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
@fragments.command("iterate")
|
|
349
|
+
@click.option(
|
|
350
|
+
"-i",
|
|
351
|
+
"--input-uri",
|
|
352
|
+
default=fragments_settings.database_uri,
|
|
353
|
+
show_default=True,
|
|
354
|
+
help="fragments store input uri",
|
|
355
|
+
)
|
|
356
|
+
@click.option(
|
|
357
|
+
"-o", "--output-uri", default="-", show_default=True, help="output file or uri"
|
|
358
|
+
)
|
|
359
|
+
@click.option("-d", "--dataset", required=True, help="Dataset name to iterate")
|
|
360
|
+
def fragments_iterate(
|
|
361
|
+
input_uri: str = fragments_settings.database_uri,
|
|
362
|
+
output_uri: str = "-",
|
|
363
|
+
dataset: str = None,
|
|
364
|
+
):
|
|
365
|
+
"""
|
|
366
|
+
Iterate all entities from a fragments dataset
|
|
367
|
+
"""
|
|
368
|
+
fragments = get_fragments(dataset, database_uri=input_uri)
|
|
369
|
+
smart_write_proxies(output_uri, fragments.iterate())
|
|
370
|
+
|
|
371
|
+
|
|
314
372
|
@cli.command("aggregate")
|
|
315
373
|
@click.option(
|
|
316
374
|
"-i", "--input-uri", default="-", show_default=True, help="input file or uri"
|
|
@@ -170,6 +170,10 @@ class Fragments(object):
|
|
|
170
170
|
raise
|
|
171
171
|
|
|
172
172
|
def iterate(self, entity_id=None, skip_errors=False) -> EntityFragments:
|
|
173
|
+
if entity_id is None:
|
|
174
|
+
log.info("Using batched iteration for complete dataset.")
|
|
175
|
+
yield from self.iterate_batched()
|
|
176
|
+
return
|
|
173
177
|
entity = None
|
|
174
178
|
invalid = None
|
|
175
179
|
fragments = 1
|
|
@@ -205,6 +209,39 @@ class Fragments(object):
|
|
|
205
209
|
if entity is not None:
|
|
206
210
|
yield entity
|
|
207
211
|
|
|
212
|
+
def iterate_batched(self, skip_errors=False, batch_size=10_000) -> EntityFragments:
|
|
213
|
+
"""
|
|
214
|
+
For large datasets an overall sort is not feasible, so we iterate in
|
|
215
|
+
sorted batched IDs.
|
|
216
|
+
"""
|
|
217
|
+
for entity_ids in self.get_sorted_id_batches(batch_size):
|
|
218
|
+
yield from self.iterate(entity_id=entity_ids, skip_errors=skip_errors)
|
|
219
|
+
|
|
220
|
+
def get_sorted_id_batches(
|
|
221
|
+
self, batch_size=10_000
|
|
222
|
+
) -> Generator[list[str], None, None]:
|
|
223
|
+
"""
|
|
224
|
+
Get sorted ID batches to speed up iteration and useful to parallelize
|
|
225
|
+
processing of iterator Entities
|
|
226
|
+
"""
|
|
227
|
+
last_id = None
|
|
228
|
+
with self.store.engine.connect() as conn:
|
|
229
|
+
while True:
|
|
230
|
+
stmt = select(self.table.c.id).distinct()
|
|
231
|
+
if last_id is not None:
|
|
232
|
+
stmt = stmt.where(self.table.c.id > last_id)
|
|
233
|
+
stmt = stmt.order_by(self.table.c.id).limit(batch_size)
|
|
234
|
+
try:
|
|
235
|
+
res = conn.execute(stmt)
|
|
236
|
+
entity_ids = [r.id for r in res.fetchall()]
|
|
237
|
+
if not entity_ids:
|
|
238
|
+
return
|
|
239
|
+
yield entity_ids
|
|
240
|
+
last_id = entity_ids[-1]
|
|
241
|
+
except Exception:
|
|
242
|
+
self.reset()
|
|
243
|
+
raise
|
|
244
|
+
|
|
208
245
|
def statements(
|
|
209
246
|
self,
|
|
210
247
|
entity_ids: Iterable[str] | None = None,
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "ftmq"
|
|
3
|
-
version = "4.2.5"
|
|
3
|
+
version = "4.3.0"
|
|
4
4
|
description = "followthemoney query dsl and io helpers"
|
|
5
5
|
authors = [{ name = "Simon Wörpel", email = "simon.woerpel@pm.me" }]
|
|
6
6
|
license = "AGPLv3+"
|
|
@@ -16,9 +16,9 @@ classifiers = [
|
|
|
16
16
|
requires-python = ">=3.11,<4"
|
|
17
17
|
dependencies = [
|
|
18
18
|
"anystore (>=0.4.0,<0.5.0)",
|
|
19
|
-
"followthemoney (>=4.
|
|
19
|
+
"followthemoney (>=4.3.0,<5.0.0)",
|
|
20
20
|
"nomenklatura (>=4.1.9,<5.0.0)",
|
|
21
|
-
"rigour (>=1.3.
|
|
21
|
+
"rigour (>=1.3.13,<2.0.0)",
|
|
22
22
|
"click (>=8.2.1,<9.0.0)",
|
|
23
23
|
"click-default-group (>=1.2.4,<2.0.0)",
|
|
24
24
|
"orjson (>=3.10.18,<4.0.0)",
|
|
@@ -33,9 +33,9 @@ sql = ["sqlalchemy (>=2.0.36,<3.0.0)"]
|
|
|
33
33
|
postgres = ["sqlalchemy (>=2.0.36,<3.0.0)", "psycopg[pool] (>=3.2.9,<4.0.0)"]
|
|
34
34
|
redis = ["redis (>=5.2.1,<6.0.0)", "fakeredis (>=2.26.2,<3.0.0)"]
|
|
35
35
|
lake = [
|
|
36
|
-
"duckdb (>=1.4.
|
|
37
|
-
"pandas (>=2.3.
|
|
38
|
-
"deltalake (>=1.
|
|
36
|
+
"duckdb (>=1.4.1,<2.0.0)",
|
|
37
|
+
"pandas (>=2.3.3,<3.0.0)",
|
|
38
|
+
"deltalake (>=1.2.0,<2.0.0)",
|
|
39
39
|
"pyarrow (>=21.0.0,<22.0.0)",
|
|
40
40
|
]
|
|
41
41
|
aleph = ["furl (>=2.1.4,<3.0.0)", "alephclient (>=2.6.0,<3.0.0)"]
|
|
File without changes
|
{ftmq-4.2.5 → ftmq-4.3.0}/NOTICE
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|