ftmq 4.2.4.tar.gz → 4.3.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: the registry flags this version of ftmq as possibly problematic.
- {ftmq-4.2.4 → ftmq-4.3.0}/PKG-INFO +8 -8
- {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/__init__.py +1 -1
- {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/cli.py +58 -0
- {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/store/fragments/dataset.py +67 -6
- {ftmq-4.2.4 → ftmq-4.3.0}/pyproject.toml +11 -11
- {ftmq-4.2.4 → ftmq-4.3.0}/LICENSE +0 -0
- {ftmq-4.2.4 → ftmq-4.3.0}/NOTICE +0 -0
- {ftmq-4.2.4 → ftmq-4.3.0}/README.md +0 -0
- {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/aggregate.py +0 -0
- {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/aggregations.py +0 -0
- {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/enums.py +0 -0
- {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/filters.py +0 -0
- {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/io.py +0 -0
- {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/logging.py +0 -0
- {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/model/__init__.py +0 -0
- {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/model/dataset.py +0 -0
- {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/model/entity.py +0 -0
- {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/model/mixins.py +0 -0
- {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/model/stats.py +0 -0
- {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/query.py +0 -0
- {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/similar.py +0 -0
- {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/sql.py +0 -0
- {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/store/__init__.py +0 -0
- {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/store/aleph.py +0 -0
- {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/store/base.py +0 -0
- {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/store/fragments/__init__.py +0 -0
- {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/store/fragments/loader.py +0 -0
- {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/store/fragments/settings.py +0 -0
- {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/store/fragments/store.py +0 -0
- {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/store/fragments/utils.py +0 -0
- {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/store/lake.py +0 -0
- {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/store/level.py +0 -0
- {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/store/memory.py +0 -0
- {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/store/redis.py +0 -0
- {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/store/sql.py +0 -0
- {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/types.py +0 -0
- {ftmq-4.2.4 → ftmq-4.3.0}/ftmq/util.py +0 -0
{ftmq-4.2.4 → ftmq-4.3.0}/PKG-INFO
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: ftmq
-Version: 4.2.4
+Version: 4.3.0
 Summary: followthemoney query dsl and io helpers
 License: AGPLv3+
 Author: Simon Wörpel
@@ -19,17 +19,17 @@ Provides-Extra: postgres
 Provides-Extra: redis
 Provides-Extra: sql
 Requires-Dist: alephclient (>=2.6.0,<3.0.0) ; extra == "aleph"
-Requires-Dist: anystore (>=0.…
+Requires-Dist: anystore (>=0.4.0,<0.5.0)
 Requires-Dist: click (>=8.2.1,<9.0.0)
 Requires-Dist: click-default-group (>=1.2.4,<2.0.0)
-Requires-Dist: deltalake (>=1.…
-Requires-Dist: duckdb (>=1.…
+Requires-Dist: deltalake (>=1.2.0,<2.0.0) ; extra == "lake"
+Requires-Dist: duckdb (>=1.4.1,<2.0.0) ; extra == "lake"
 Requires-Dist: fakeredis (>=2.26.2,<3.0.0) ; extra == "redis"
-Requires-Dist: followthemoney (>=4.…
+Requires-Dist: followthemoney (>=4.3.0,<5.0.0)
 Requires-Dist: furl (>=2.1.4,<3.0.0) ; extra == "aleph"
-Requires-Dist: nomenklatura (>=4.1.…
+Requires-Dist: nomenklatura (>=4.1.9,<5.0.0)
 Requires-Dist: orjson (>=3.10.18,<4.0.0)
-Requires-Dist: pandas (>=2.3.…
+Requires-Dist: pandas (>=2.3.3,<3.0.0) ; extra == "lake"
 Requires-Dist: plyvel (>=1.5.1,<2.0.0) ; extra == "level"
 Requires-Dist: psycopg[pool] (>=3.2.9,<4.0.0) ; extra == "postgres"
 Requires-Dist: pyarrow (>=21.0.0,<22.0.0) ; extra == "lake"
@@ -37,7 +37,7 @@ Requires-Dist: pycountry (>=24.6.1,<25.0.0)
 Requires-Dist: pydantic (>=2.11.3,<3.0.0)
 Requires-Dist: pyicu (>=2.15.2,<3.0.0)
 Requires-Dist: redis (>=5.2.1,<6.0.0) ; extra == "redis"
-Requires-Dist: rigour (>=1.3.…
+Requires-Dist: rigour (>=1.3.13,<2.0.0)
 Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0) ; extra == "postgres"
 Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0) ; extra == "sql"
 Project-URL: Documentation, https://docs.investigraph.dev/lib/ftmq
{ftmq-4.2.4 → ftmq-4.3.0}/ftmq/cli.py
CHANGED

@@ -11,6 +11,9 @@ from ftmq.model.dataset import Catalog, Dataset
 from ftmq.model.stats import Collector
 from ftmq.query import Query
 from ftmq.store import get_store
+from ftmq.store.fragments import get_fragments
+from ftmq.store.fragments import get_store as get_fragments_store
+from ftmq.store.fragments.settings import Settings as FragmentsSettings
 from ftmq.util import apply_dataset, parse_unknown_filters
 
 log = get_logger(__name__)
@@ -311,6 +314,61 @@ def store_iterate(
     smart_write_proxies(output_uri, store.iterate())
 
 
+@cli.group()
+def fragments():
+    pass
+
+
+fragments_settings = FragmentsSettings()
+
+
+@fragments.command("list-datasets")
+@click.option(
+    "-i",
+    "--input-uri",
+    default=fragments_settings.database_uri,
+    show_default=True,
+    help="input file or uri",
+)
+@click.option(
+    "-o", "--output-uri", default="-", show_default=True, help="output file or uri"
+)
+def fragments_list_datasets(
+    input_uri: str = fragments_settings.database_uri,
+    output_uri: str = "-",
+):
+    """
+    List datasets within a fragments store
+    """
+    store = get_fragments_store(input_uri)
+    datasets = [ds.name for ds in store.all()]
+    smart_write(output_uri, "\n".join(datasets).encode() + b"\n")
+
+
+@fragments.command("iterate")
+@click.option(
+    "-i",
+    "--input-uri",
+    default=fragments_settings.database_uri,
+    show_default=True,
+    help="fragments store input uri",
+)
+@click.option(
+    "-o", "--output-uri", default="-", show_default=True, help="output file or uri"
+)
+@click.option("-d", "--dataset", required=True, help="Dataset name to iterate")
+def fragments_iterate(
+    input_uri: str = fragments_settings.database_uri,
+    output_uri: str = "-",
+    dataset: str = None,
+):
+    """
+    Iterate all entities from a fragments dataset
+    """
+    fragments = get_fragments(dataset, database_uri=input_uri)
+    smart_write_proxies(output_uri, fragments.iterate())
+
+
 @cli.command("aggregate")
 @click.option(
     "-i", "--input-uri", default="-", show_default=True, help="input file or uri"
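The new `fragments` command group can be exercised directly from Python via click's built-in test runner. A minimal sketch, not taken from the release: the sqlite URI and the dataset name "my_dataset" are made-up placeholders, and in normal use FragmentsSettings would supply the default store URI.

    from click.testing import CliRunner

    from ftmq.cli import cli

    runner = CliRunner()

    # list the dataset names in a fragments store (URI is a hypothetical example)
    result = runner.invoke(
        cli, ["fragments", "list-datasets", "-i", "sqlite:///fragments.db"]
    )
    print(result.output)

    # stream every entity of one dataset as proxies to stdout
    result = runner.invoke(
        cli, ["fragments", "iterate", "-i", "sqlite:///fragments.db", "-d", "my_dataset"]
    )
    print(result.output)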
{ftmq-4.2.4 → ftmq-4.3.0}/ftmq/store/fragments/dataset.py
CHANGED

@@ -1,4 +1,5 @@
 import logging
+from contextlib import contextmanager
 from datetime import datetime
 from typing import Generator, Iterable, TypeAlias
 
@@ -9,6 +10,7 @@ from normality import slugify
 from sqlalchemy import (
     JSON,
     Column,
+    Connection,
     DateTime,
     String,
     Table,
@@ -43,6 +45,27 @@ except ImportError:
 EntityFragments: TypeAlias = Generator[EntityProxy, None, None]
 
 
+@contextmanager
+def disable_timeout(conn: Connection, store):
+    # for long running iterations (e.g. re-index in OpenAleph), for postgres we
+    # don't want to get cancelled if a idle_in_transaction_timeout is configured
+    # on the server
+    if store.is_postgres:
+        raw_conn = conn.connection.driver_connection
+        with raw_conn.cursor() as cursor:
+            cursor.execute("SET idle_in_transaction_session_timeout = 0")
+    try:
+        yield conn
+    finally:
+        if store.is_postgres:
+            try:
+                raw_conn = conn.connection.driver_connection
+                with raw_conn.cursor() as cursor:
+                    cursor.execute("SET idle_in_transaction_session_timeout = DEFAULT")
+            except Exception:
+                pass  # Connection might be closed
+
+
 class Fragments(object):
     def __init__(self, store, name, origin=NULL_ORIGIN):
         self.store = store
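The context manager above flips a real Postgres setting, idle_in_transaction_session_timeout, to 0 for the duration of the iteration and restores the server default afterwards; ftmq applies it through the raw driver connection behind SQLAlchemy so the statement reaches the actual server session. A minimal sketch of the same toggle with plain psycopg (the DSN is a hypothetical example):

    from contextlib import contextmanager

    import psycopg  # psycopg 3, the driver ftmq's "postgres" extra installs


    @contextmanager
    def no_idle_timeout(conn: psycopg.Connection):
        # lift the per-session limit before a long-running read
        with conn.cursor() as cursor:
            cursor.execute("SET idle_in_transaction_session_timeout = 0")
        try:
            yield conn
        finally:
            # restore whatever the server is configured with
            with conn.cursor() as cursor:
                cursor.execute("SET idle_in_transaction_session_timeout = DEFAULT")


    conn = psycopg.connect("postgresql://localhost/ftm")  # hypothetical DSN
    with no_idle_timeout(conn):
        ...  # long-running iteration goes here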
@@ -123,12 +146,13 @@ class Fragments(object):
         # stmt = stmt.order_by(self.table.c.fragment)
         conn = self.store.engine.connect()
         try:
-            conn = conn.execution_options(stream_results=True)
-            for ent in conn.execute(stmt):
-                data = {"id": ent.id, "datasets": [self.name], **ent.entity}
-                if ent.origin != NULL_ORIGIN:
-                    data["origin"] = ent.origin
-                yield data
+            with disable_timeout(conn, self.store) as conn:
+                conn = conn.execution_options(stream_results=True)
+                for ent in conn.execute(stmt):
+                    data = {"id": ent.id, "datasets": [self.name], **ent.entity}
+                    if ent.origin != NULL_ORIGIN:
+                        data["origin"] = ent.origin
+                    yield data
         except Exception:
             self.reset()
             raise
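Note the `execution_options(stream_results=True)` call inside the wrapper: in SQLAlchemy this requests a server-side cursor where the driver supports one, so rows arrive in chunks instead of being buffered entirely in memory. That is what keeps the transaction open for a long time and makes the timeout handling above necessary. A standalone sketch of the pattern; the engine URI and table name are made-up examples:

    from sqlalchemy import MetaData, Table, create_engine, select

    engine = create_engine("postgresql+psycopg://localhost/ftm")  # hypothetical
    table = Table("fragments_example", MetaData(), autoload_with=engine)

    with engine.connect() as conn:
        # stream_results=True keeps memory flat for very large result sets
        conn = conn.execution_options(stream_results=True)
        for row in conn.execute(select(table)):
            print(row)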
@@ -146,6 +170,10 @@ class Fragments(object):
             raise
 
     def iterate(self, entity_id=None, skip_errors=False) -> EntityFragments:
+        if entity_id is None:
+            log.info("Using batched iteration for complete dataset.")
+            yield from self.iterate_batched()
+            return
         entity = None
         invalid = None
         fragments = 1
@@ -181,6 +209,39 @@ class Fragments(object):
         if entity is not None:
             yield entity
 
+    def iterate_batched(self, skip_errors=False, batch_size=10_000) -> EntityFragments:
+        """
+        For large datasets an overall sort is not feasible, so we iterate in
+        sorted batched IDs.
+        """
+        for entity_ids in self.get_sorted_id_batches(batch_size):
+            yield from self.iterate(entity_id=entity_ids, skip_errors=skip_errors)
+
+    def get_sorted_id_batches(
+        self, batch_size=10_000
+    ) -> Generator[list[str], None, None]:
+        """
+        Get sorted ID batches to speed up iteration and useful to parallelize
+        processing of iterator Entities
+        """
+        last_id = None
+        with self.store.engine.connect() as conn:
+            while True:
+                stmt = select(self.table.c.id).distinct()
+                if last_id is not None:
+                    stmt = stmt.where(self.table.c.id > last_id)
+                stmt = stmt.order_by(self.table.c.id).limit(batch_size)
+                try:
+                    res = conn.execute(stmt)
+                    entity_ids = [r.id for r in res.fetchall()]
+                    if not entity_ids:
+                        return
+                    yield entity_ids
+                    last_id = entity_ids[-1]
+                except Exception:
+                    self.reset()
+                    raise
+
     def statements(
         self,
         entity_ids: Iterable[str] | None = None,
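`get_sorted_id_batches` is keyset pagination: each round trip filters on `id > last_id` and re-sorts, so the database never rescans rows already consumed the way a growing OFFSET would, and only one batch of IDs is held in memory at a time. Because every batch is an independent ID slice, batches also parallelize naturally, as the docstring hints. A hedged sketch of fanning batches out to workers; the store URI, dataset name, and pool size are illustrative assumptions:

    from concurrent.futures import ThreadPoolExecutor

    from ftmq.store.fragments import get_fragments

    fragments = get_fragments("my_dataset", database_uri="sqlite:///fragments.db")


    def count_batch(entity_ids: list[str]) -> int:
        # each worker iterates only its own slice of entity IDs
        return sum(1 for _ in fragments.iterate(entity_id=entity_ids))


    with ThreadPoolExecutor(max_workers=4) as pool:
        counts = pool.map(count_batch, fragments.get_sorted_id_batches(batch_size=1_000))
        print(sum(counts))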
{ftmq-4.2.4 → ftmq-4.3.0}/pyproject.toml
CHANGED

@@ -1,6 +1,6 @@
 [project]
 name = "ftmq"
-version = "4.2.4"
+version = "4.3.0"
 description = "followthemoney query dsl and io helpers"
 authors = [{ name = "Simon Wörpel", email = "simon.woerpel@pm.me" }]
 license = "AGPLv3+"
@@ -15,10 +15,10 @@ classifiers = [
 ]
 requires-python = ">=3.11,<4"
 dependencies = [
-    "anystore (>=0.…
-    "followthemoney (>=4.…
-    "nomenklatura (>=4.1.…
-    "rigour (>=1.3.…
+    "anystore (>=0.4.0,<0.5.0)",
+    "followthemoney (>=4.3.0,<5.0.0)",
+    "nomenklatura (>=4.1.9,<5.0.0)",
+    "rigour (>=1.3.13,<2.0.0)",
     "click (>=8.2.1,<9.0.0)",
     "click-default-group (>=1.2.4,<2.0.0)",
     "orjson (>=3.10.18,<4.0.0)",
@@ -33,9 +33,9 @@ sql = ["sqlalchemy (>=2.0.36,<3.0.0)"]
 postgres = ["sqlalchemy (>=2.0.36,<3.0.0)", "psycopg[pool] (>=3.2.9,<4.0.0)"]
 redis = ["redis (>=5.2.1,<6.0.0)", "fakeredis (>=2.26.2,<3.0.0)"]
 lake = [
-    "duckdb (>=1.…
-    "pandas (>=2.3.…
-    "deltalake (>=1.…
+    "duckdb (>=1.4.1,<2.0.0)",
+    "pandas (>=2.3.3,<3.0.0)",
+    "deltalake (>=1.2.0,<2.0.0)",
     "pyarrow (>=21.0.0,<22.0.0)",
 ]
 aleph = ["furl (>=2.1.4,<3.0.0)", "alephclient (>=2.6.0,<3.0.0)"]
@@ -51,10 +51,10 @@ Issues = "https://github.com/dataresearchcenter/ftmq/issues"
 
 [tool.poetry.group.dev.dependencies]
 pytest = ">=7.4.3,<9.0.0"
-pytest-cov = ">=4.1,<…
+pytest-cov = ">=4.1,<8.0"
 pytest-env = "^1.1.1"
 black = ">=23.11,<26.0"
-isort = "^6.0…
+isort = "^6.1.0"
 mypy = "^1.17.1"
 pre-commit = "^4.0.1"
 flake8 = ">=6.1,<8.0"
@@ -62,7 +62,7 @@ ipdb = "^0.13.13"
 bump2version = "^1.0.1"
 mkdocs = "^1.6.1"
 mkdocs-autorefs = "^1.4.3"
-mkdocstrings-python = "^1.18.…
+mkdocstrings-python = "^1.18.2"
 mkdocs-material = "^9.6.18"
 mkdocs-click = "^0.9.0"
All remaining files listed above are unchanged between ftmq-4.2.4 and ftmq-4.3.0.