ftmq 4.1.1.tar.gz → 4.3.2.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ftmq-4.1.1 → ftmq-4.3.2}/PKG-INFO +12 -9
- {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/__init__.py +1 -1
- {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/cli.py +81 -0
- {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/model/dataset.py +1 -0
- {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/store/__init__.py +1 -1
- {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/store/base.py +1 -1
- {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/store/fragments/__init__.py +1 -1
- {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/store/fragments/dataset.py +143 -15
- {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/store/fragments/loader.py +13 -5
- ftmq-4.3.2/ftmq/store/fragments/store.py +71 -0
- {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/store/lake.py +1 -1
- {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/util.py +80 -12
- {ftmq-4.1.1 → ftmq-4.3.2}/pyproject.toml +14 -13
- ftmq-4.1.1/ftmq/store/fragments/store.py +0 -43
- {ftmq-4.1.1 → ftmq-4.3.2}/LICENSE +0 -0
- {ftmq-4.1.1 → ftmq-4.3.2}/NOTICE +0 -0
- {ftmq-4.1.1 → ftmq-4.3.2}/README.md +0 -0
- {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/aggregate.py +0 -0
- {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/aggregations.py +0 -0
- {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/enums.py +0 -0
- {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/filters.py +0 -0
- {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/io.py +0 -0
- {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/logging.py +0 -0
- {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/model/__init__.py +0 -0
- {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/model/entity.py +0 -0
- {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/model/mixins.py +0 -0
- {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/model/stats.py +0 -0
- {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/query.py +0 -0
- {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/similar.py +0 -0
- {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/sql.py +0 -0
- {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/store/aleph.py +0 -0
- {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/store/fragments/settings.py +0 -0
- {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/store/fragments/utils.py +0 -0
- {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/store/level.py +0 -0
- {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/store/memory.py +0 -0
- {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/store/redis.py +0 -0
- {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/store/sql.py +0 -0
- {ftmq-4.1.1 → ftmq-4.3.2}/ftmq/types.py +0 -0
{ftmq-4.1.1 → ftmq-4.3.2}/PKG-INFO

@@ -1,8 +1,10 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: ftmq
-Version: 4.1.1
+Version: 4.3.2
 Summary: followthemoney query dsl and io helpers
 License: AGPLv3+
+License-File: LICENSE
+License-File: NOTICE
 Author: Simon Wörpel
 Author-email: simon.woerpel@pm.me
 Requires-Python: >=3.11,<4

@@ -19,24 +21,25 @@ Provides-Extra: postgres
 Provides-Extra: redis
 Provides-Extra: sql
 Requires-Dist: alephclient (>=2.6.0,<3.0.0) ; extra == "aleph"
-Requires-Dist: anystore (>=0.
+Requires-Dist: anystore (>=0.4.0,<0.5.0)
 Requires-Dist: click (>=8.2.1,<9.0.0)
 Requires-Dist: click-default-group (>=1.2.4,<2.0.0)
-Requires-Dist: deltalake (>=1.1
-Requires-Dist: duckdb (>=1.
+Requires-Dist: deltalake (>=1.2.1,<2.0.0) ; extra == "lake"
+Requires-Dist: duckdb (>=1.4.1,<2.0.0) ; extra == "lake"
 Requires-Dist: fakeredis (>=2.26.2,<3.0.0) ; extra == "redis"
-Requires-Dist: followthemoney (>=4.
+Requires-Dist: followthemoney (>=4.3.2,<5.0.0)
 Requires-Dist: furl (>=2.1.4,<3.0.0) ; extra == "aleph"
-Requires-Dist: nomenklatura (>=4.1.
+Requires-Dist: nomenklatura (>=4.1.10,<5.0.0)
 Requires-Dist: orjson (>=3.10.18,<4.0.0)
-Requires-Dist: pandas (>=2.3.
+Requires-Dist: pandas (>=2.3.3,<3.0.0) ; extra == "lake"
 Requires-Dist: plyvel (>=1.5.1,<2.0.0) ; extra == "level"
-Requires-Dist:
+Requires-Dist: psycopg[pool] (>=3.2.9,<4.0.0) ; extra == "postgres"
 Requires-Dist: pyarrow (>=21.0.0,<22.0.0) ; extra == "lake"
 Requires-Dist: pycountry (>=24.6.1,<25.0.0)
 Requires-Dist: pydantic (>=2.11.3,<3.0.0)
 Requires-Dist: pyicu (>=2.15.2,<3.0.0)
 Requires-Dist: redis (>=5.2.1,<6.0.0) ; extra == "redis"
+Requires-Dist: rigour (>=1.4.1,<2.0.0)
 Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0) ; extra == "postgres"
 Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0) ; extra == "sql"
 Project-URL: Documentation, https://docs.investigraph.dev/lib/ftmq
{ftmq-4.1.1 → ftmq-4.3.2}/ftmq/cli.py

@@ -1,3 +1,5 @@
+from datetime import datetime
+
 import click
 from anystore.io import smart_write, smart_write_json, smart_write_model
 from click_default_group import DefaultGroup

@@ -11,6 +13,9 @@ from ftmq.model.dataset import Catalog, Dataset
 from ftmq.model.stats import Collector
 from ftmq.query import Query
 from ftmq.store import get_store
+from ftmq.store.fragments import get_fragments
+from ftmq.store.fragments import get_store as get_fragments_store
+from ftmq.store.fragments.settings import Settings as FragmentsSettings
 from ftmq.util import apply_dataset, parse_unknown_filters
 
 log = get_logger(__name__)

@@ -311,6 +316,82 @@ def store_iterate(
     smart_write_proxies(output_uri, store.iterate())
 
 
+@cli.group()
+def fragments():
+    pass
+
+
+fragments_settings = FragmentsSettings()
+
+
+@fragments.command("list-datasets")
+@click.option(
+    "-i",
+    "--input-uri",
+    default=fragments_settings.database_uri,
+    show_default=True,
+    help="input file or uri",
+)
+@click.option(
+    "-o", "--output-uri", default="-", show_default=True, help="output file or uri"
+)
+def fragments_list_datasets(
+    input_uri: str = fragments_settings.database_uri,
+    output_uri: str = "-",
+):
+    """
+    List datasets within a fragments store
+    """
+    store = get_fragments_store(input_uri)
+    datasets = [ds.name for ds in store.all()]
+    smart_write(output_uri, "\n".join(datasets).encode() + b"\n")
+
+
+@fragments.command("iterate")
+@click.option(
+    "-i",
+    "--input-uri",
+    default=fragments_settings.database_uri,
+    show_default=True,
+    help="fragments store input uri",
+)
+@click.option(
+    "-o", "--output-uri", default="-", show_default=True, help="output file or uri"
+)
+@click.option("-d", "--dataset", required=True, help="Dataset name to iterate")
+@click.option("-s", "--schema", default=None, help="Filter by schema")
+@click.option(
+    "--since",
+    default=None,
+    help="Filter by timestamp (since), ISO format: YYYY-MM-DDTHH:MM:SS",
+)
+@click.option(
+    "--until",
+    default=None,
+    help="Filter by timestamp (until), ISO format: YYYY-MM-DDTHH:MM:SS",
+)
+def fragments_iterate(
+    input_uri: str = fragments_settings.database_uri,
+    output_uri: str = "-",
+    dataset: str = None,
+    schema: str | None = None,
+    since: str | None = None,
+    until: str | None = None,
+):
+    """
+    Iterate all entities from a fragments dataset
+    """
+    fragments = get_fragments(dataset, database_uri=input_uri)
+
+    # Parse timestamp strings to datetime objects
+    since_dt = datetime.fromisoformat(since) if since else None
+    until_dt = datetime.fromisoformat(until) if until else None
+
+    smart_write_proxies(
+        output_uri, fragments.iterate(schema=schema, since=since_dt, until=until_dt)
+    )
+
+
 @cli.command("aggregate")
 @click.option(
     "-i", "--input-uri", default="-", show_default=True, help="input file or uri"
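The new `fragments` CLI group above is a thin wrapper around the fragments store API added elsewhere in this release. A minimal library-level sketch of what `ftmq fragments iterate` does, assuming `smart_write_proxies` is the helper from `ftmq.io` that the CLI also uses (the dataset name and database URI are placeholders):

```python
# Sketch only: roughly what `ftmq fragments iterate -d my_dataset --since ...` does.
# "my_dataset" and the sqlite URI are placeholders; smart_write_proxies is assumed
# to be the ftmq.io helper used by the CLI.
from datetime import datetime

from ftmq.io import smart_write_proxies
from ftmq.store.fragments import get_fragments

fragments = get_fragments("my_dataset", database_uri="sqlite:///fragments.db")
since = datetime.fromisoformat("2024-01-01T00:00:00")

# stream aggregated entities (optionally filtered by schema and timestamp) to stdout
smart_write_proxies("-", fragments.iterate(schema="Person", since=since))
```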
{ftmq-4.1.1 → ftmq-4.3.2}/ftmq/store/__init__.py

@@ -1,7 +1,7 @@
-from functools import cache
 from pathlib import Path
 from urllib.parse import urlparse
 
+from anystore.functools import weakref_cache as cache
 from anystore.types import Uri
 from followthemoney.dataset.dataset import Dataset
 from nomenklatura import Resolver, settings
{ftmq-4.1.1 → ftmq-4.3.2}/ftmq/store/base.py

@@ -1,7 +1,7 @@
-from functools import cache
 from typing import Generator, Iterable
 from urllib.parse import urlparse
 
+from anystore.functools import weakref_cache as cache
 from followthemoney import DefaultDataset
 from followthemoney.dataset.dataset import Dataset
 from nomenklatura import store as nk
{ftmq-4.1.1 → ftmq-4.3.2}/ftmq/store/fragments/dataset.py

@@ -1,4 +1,5 @@
 import logging
+from contextlib import contextmanager
 from datetime import datetime
 from typing import Generator, Iterable, TypeAlias
 

@@ -9,6 +10,7 @@ from normality import slugify
 from sqlalchemy import (
     JSON,
     Column,
+    Connection,
     DateTime,
     String,
     Table,

@@ -28,16 +30,42 @@ from ftmq.util import make_dataset
 log = logging.getLogger(__name__)
 UNDEFINED = (OperationalError,)
 try:
-    from psycopg2.errors import UndefinedTable
+    from psycopg.errors import UndefinedTable
 
     UNDEFINED = (UndefinedTable, *UNDEFINED)
 except ImportError:
-    pass
+    try:
+        from psycopg2.errors import UndefinedTable
+
+        UNDEFINED = (UndefinedTable, *UNDEFINED)
+    except ImportError:
+        pass
 
 
 EntityFragments: TypeAlias = Generator[EntityProxy, None, None]
 
 
+@contextmanager
+def disable_timeout(conn: Connection, store):
+    # for long running iterations (e.g. re-index in OpenAleph), for postgres we
+    # don't want to get cancelled if a idle_in_transaction_timeout is configured
+    # on the server
+    if store.is_postgres:
+        raw_conn = conn.connection.driver_connection
+        with raw_conn.cursor() as cursor:
+            cursor.execute("SET idle_in_transaction_session_timeout = 0")
+    try:
+        yield conn
+    finally:
+        if store.is_postgres:
+            try:
+                raw_conn = conn.connection.driver_connection
+                with raw_conn.cursor() as cursor:
+                    cursor.execute("SET idle_in_transaction_session_timeout = DEFAULT")
+            except Exception:
+                pass  # Connection might be closed
+
+
 class Fragments(object):
     def __init__(self, store, name, origin=NULL_ORIGIN):
         self.store = store
@@ -104,7 +132,9 @@
     def bulk(self, size=1000):
         return BulkLoader(self, size)
 
-    def fragments(
+    def fragments(
+        self, entity_ids=None, fragment=None, schema=None, since=None, until=None
+    ):
         stmt = self.table.select()
         entity_ids = ensure_list(entity_ids)
         if len(entity_ids) == 1:

@@ -113,25 +143,42 @@
             stmt = stmt.where(self.table.c.id.in_(entity_ids))
         if fragment is not None:
             stmt = stmt.where(self.table.c.fragment == fragment)
+        if schema is not None:
+            if self.store.is_postgres:
+                stmt = stmt.where(self.table.c.entity["schema"].astext == schema)
+            else:
+                # SQLite JSON support - use json_extract function
+                stmt = stmt.where(
+                    func.json_extract(self.table.c.entity, "$.schema") == schema
+                )
+        if since is not None:
+            stmt = stmt.where(self.table.c.timestamp >= since)
+        if until is not None:
+            stmt = stmt.where(self.table.c.timestamp <= until)
         stmt = stmt.order_by(self.table.c.id)
         # stmt = stmt.order_by(self.table.c.origin)
         # stmt = stmt.order_by(self.table.c.fragment)
         conn = self.store.engine.connect()
         try:
-            conn = conn.execution_options(stream_results=True)
-            for ent in conn.execute(stmt):
-                data = {"id": ent.id, "datasets": [self.name], **ent.entity}
-                if ent.origin != NULL_ORIGIN:
-                    data["origin"] = ent.origin
-                yield data
+            with disable_timeout(conn, self.store) as conn:
+                conn = conn.execution_options(stream_results=True)
+                for ent in conn.execute(stmt):
+                    data = {"id": ent.id, "datasets": [self.name], **ent.entity}
+                    if ent.origin != NULL_ORIGIN:
+                        data["origin"] = ent.origin
+                    yield data
         except Exception:
             self.reset()
             raise
         finally:
             conn.close()
 
-    def partials(
-        for fragment in self.fragments(entity_ids=entity_id):
+    def partials(
+        self, entity_id=None, skip_errors=False, schema=None, since=None, until=None
+    ) -> EntityFragments:
+        for fragment in self.fragments(
+            entity_ids=entity_id, schema=schema, since=since, until=until
+        ):
             try:
                 yield EntityProxy.from_dict(fragment, cleaned=True)
             except Exception:
@@ -140,18 +187,32 @@
                     continue
                 raise
 
-    def iterate(
+    def iterate(
+        self, entity_id=None, skip_errors=False, schema=None, since=None, until=None
+    ) -> EntityFragments:
+        if entity_id is None:
+            log.info("Using batched iteration for complete dataset.")
+            yield from self.iterate_batched(
+                skip_errors=skip_errors, schema=schema, since=since, until=until
+            )
+            return
         entity = None
         invalid = None
         fragments = 1
-        for partial in self.partials(
+        for partial in self.partials(
+            entity_id=entity_id,
+            skip_errors=skip_errors,
+            schema=schema,
+            since=since,
+            until=until,
+        ):
             if partial.id == invalid:
                 continue
             if entity is not None:
                 if entity.id == partial.id:
                     fragments += 1
                     if fragments % 10000 == 0:
-                        log.
+                        log.warning(
                             "[%s:%s] aggregated %d fragments...",
                             entity.schema.name,
                             entity.id,

@@ -176,11 +237,76 @@
         if entity is not None:
             yield entity
 
+    def iterate_batched(
+        self, skip_errors=False, batch_size=10_000, schema=None, since=None, until=None
+    ) -> EntityFragments:
+        """
+        For large datasets an overall sort is not feasible, so we iterate in
+        sorted batched IDs.
+        """
+        for entity_ids in self.get_sorted_id_batches(
+            batch_size, schema=schema, since=since, until=until
+        ):
+            yield from self.iterate(
+                entity_id=entity_ids,
+                skip_errors=skip_errors,
+                schema=schema,
+                since=since,
+                until=until,
+            )
+
+    def get_sorted_id_batches(
+        self, batch_size=10_000, schema=None, since=None, until=None
+    ) -> Generator[list[str], None, None]:
+        """
+        Get sorted ID batches to speed up iteration and useful to parallelize
+        processing of iterator Entities
+        """
+        last_id = None
+        with self.store.engine.connect() as conn:
+            while True:
+                stmt = select(self.table.c.id).distinct()
+                if last_id is not None:
+                    stmt = stmt.where(self.table.c.id > last_id)
+                if schema is not None:
+                    if self.store.is_postgres:
+                        stmt = stmt.where(
+                            self.table.c.entity["schema"].astext == schema
+                        )
+                    else:
+                        # SQLite JSON support - use json_extract function
+                        stmt = stmt.where(
+                            func.json_extract(self.table.c.entity, "$.schema") == schema
+                        )
+                if since is not None:
+                    stmt = stmt.where(self.table.c.timestamp >= since)
+                if until is not None:
+                    stmt = stmt.where(self.table.c.timestamp <= until)
+                stmt = stmt.order_by(self.table.c.id).limit(batch_size)
+                try:
+                    res = conn.execute(stmt)
+                    entity_ids = [r.id for r in res.fetchall()]
+                    if not entity_ids:
+                        return
+                    yield entity_ids
+                    last_id = entity_ids[-1]
+                except Exception:
+                    self.reset()
+                    raise
+
+    def get_sorted_ids(
+        self, batch_size=10_000, schema=None, since=None, until=None
+    ) -> Generator[str, None, None]:
+        """Get sorted IDs, optionally filtered by schema"""
+        for batch in self.get_sorted_id_batches(batch_size, schema, since, until):
+            yield from batch
+
     def statements(
         self,
         entity_ids: Iterable[str] | None = None,
         origin: str | None = None,
         since: datetime | None = None,
+        until: datetime | None = None,
     ) -> Statements:
         """Iterate unsorted statements with its fragment origins"""
         stmt = self.table.select()
@@ -192,7 +318,9 @@
         if origin is not None:
             stmt = stmt.where(self.table.c.origin == origin)
         if since is not None:
-            stmt = stmt.where(self.table.c.timestamp
+            stmt = stmt.where(self.table.c.timestamp >= since)
+        if until is not None:
+            stmt = stmt.where(self.table.c.timestamp <= until)
         conn = self.store.engine.connect()
         default_dataset = make_dataset(self.name)
         try:
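`Fragments.iterate()` now hands full-dataset scans to `iterate_batched()`, which walks the table in sorted ID batches from `get_sorted_id_batches()` instead of sorting everything in one query. A minimal sketch of driving the batch generator directly, for example to chunk work before parallel processing (dataset name and connection string are placeholders):

```python
# Sketch only: process a fragments dataset in sorted ID batches.
# "my_dataset" and the database URI are placeholders.
from ftmq.store.fragments import get_fragments

fragments = get_fragments("my_dataset", database_uri="postgresql://localhost/ftm")

for entity_ids in fragments.get_sorted_id_batches(batch_size=5_000, schema="Person"):
    # each batch of IDs could be handed to a worker; here we aggregate it inline
    for entity in fragments.iterate(entity_id=entity_ids, schema="Person"):
        ...  # e.g. push the aggregated entity to a search index
```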
{ftmq-4.1.1 → ftmq-4.3.2}/ftmq/store/fragments/loader.py

@@ -26,11 +26,16 @@ EXCEPTIONS = (
     TimeoutError,
 )
 try:
-    from psycopg2 import DatabaseError, OperationalError
+    from psycopg import DatabaseError, OperationalError
 
     EXCEPTIONS = (DatabaseError, OperationalError, *EXCEPTIONS)
 except ImportError:
-    pass
+    try:
+        from psycopg2 import DatabaseError, OperationalError
+
+        EXCEPTIONS = (DatabaseError, OperationalError, *EXCEPTIONS)
+    except ImportError:
+        pass
 
 log = logging.getLogger(__name__)
 

@@ -50,9 +55,12 @@ class BulkLoader(object):
         else:
             entity = dict(entity)
         id_ = entity.pop("id")
-        self.buffer[(id_, origin, fragment)] = entity
-        if len(self.buffer) >= self.size:
-            self.flush()
+        if id_:
+            self.buffer[(id_, origin, fragment)] = entity
+            if len(self.buffer) >= self.size:
+                self.flush()
+        else:
+            log.warning("Entity has no ID!")
 
     def _store_values(self, conn, values):
         table = self.dataset.table
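`BulkLoader.put()` now skips entities without an id and logs a warning instead of writing them. A small usage sketch; the `fragment` keyword on `put()` is assumed from the buffer key above, and all names are placeholders:

```python
# Sketch only: bulk-writing fragments; the record with an empty id is now skipped
# with a warning instead of being inserted. put()'s fragment kwarg is assumed
# from the loader code above.
from ftmq.store.fragments import get_fragments

fragments = get_fragments("my_dataset", database_uri="sqlite:///fragments.db")
bulk = fragments.bulk(size=1000)
records = [
    {"id": "p1", "schema": "Person", "properties": {"name": ["Jane Doe"]}},
    {"id": None, "schema": "Person", "properties": {"name": ["missing id"]}},  # skipped
]
for record in records:
    bulk.put(record, fragment="import-2024")
bulk.flush()
```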
ftmq-4.3.2/ftmq/store/fragments/store.py

@@ -0,0 +1,71 @@
+from sqlalchemy import MetaData, create_engine
+from sqlalchemy import inspect as sqlalchemy_inspect
+
+from ftmq.store.fragments.dataset import Fragments
+from ftmq.store.fragments.utils import NULL_ORIGIN
+
+
+class Store(object):
+    """A database containing multiple tables that represent
+    FtM-store datasets."""
+
+    PREFIX = "ftm"
+
+    def _adjust_psycopg3_uri(self, database_uri: str) -> str:
+        """Adjust PostgreSQL URI to use psycopg3 dialect if psycopg is available."""
+        if database_uri.startswith(("postgresql://", "postgres://")):
+            try:
+                import psycopg  # noqa: F401
+
+                # Use psycopg3 dialect for better performance and compatibility
+                if database_uri.startswith("postgresql://"):
+                    return database_uri.replace(
+                        "postgresql://", "postgresql+psycopg://", 1
+                    )
+                elif database_uri.startswith("postgres://"):
+                    return database_uri.replace(
+                        "postgres://", "postgresql+psycopg://", 1
+                    )
+            except ImportError:
+                # Fall back to psycopg2 if psycopg3 is not available
+                pass
+        return database_uri
+
+    def __init__(
+        self,
+        database_uri: str,
+        **config,
+    ):
+        self.database_uri = self._adjust_psycopg3_uri(database_uri)
+
+        # Configure connection pooling for psycopg3
+        config.setdefault("pool_size", 1)
+        if self.database_uri.startswith("postgresql+psycopg://"):
+            config.setdefault("max_overflow", 5)
+            config.setdefault("pool_timeout", 60)
+            config.setdefault("pool_recycle", 3600)
+            config.setdefault("pool_pre_ping", True)
+
+        self.engine = create_engine(self.database_uri, future=True, **config)
+        self.is_postgres = self.engine.dialect.name == "postgresql"
+        self.meta = MetaData()
+
+    def get(self, name, origin=NULL_ORIGIN):
+        return Fragments(self, name, origin=origin)
+
+    def all(self, origin=NULL_ORIGIN):
+        prefix = f"{self.PREFIX}_"
+        inspect = sqlalchemy_inspect(self.engine)
+        for table in inspect.get_table_names():
+            if table.startswith(prefix):
+                name = table[len(prefix) :]
+                yield Fragments(self, name, origin=origin)
+
+    def close(self):
+        self.engine.dispose()
+
+    def __len__(self):
+        return len(list(self.all()))
+
+    def __repr__(self):
+        return "<Store(%r)>" % self.engine
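The replacement `Store` switches plain `postgresql://` / `postgres://` URIs to SQLAlchemy's psycopg3 dialect when `psycopg` is importable and sets pooling defaults for it. A quick sketch (the connection string is a placeholder):

```python
# Sketch only: the URI is rewritten to the psycopg3 dialect when psycopg is installed.
# The connection string is a placeholder.
from ftmq.store.fragments.store import Store

store = Store("postgresql://ftm:ftm@localhost/ftm")
print(store.database_uri)    # postgresql+psycopg://... if psycopg is available
print(store.is_postgres)     # True
for fragments in store.all():  # one Fragments handle per "ftm_<name>" table
    print(fragments.name)
store.close()
```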
{ftmq-4.1.1 → ftmq-4.3.2}/ftmq/store/lake.py

@@ -18,7 +18,6 @@ Layout:
 ```
 """
 
-from functools import cache
 from pathlib import Path
 from typing import Any, Generator, Iterable
 from urllib.parse import urlparse

@@ -26,6 +25,7 @@ from urllib.parse import urlparse
 import duckdb
 import numpy as np
 import pandas as pd
+from anystore.functools import weakref_cache as cache
 from anystore.lock import Lock
 from anystore.logging import get_logger
 from anystore.store.fs import Store as FSStore
{ftmq-4.1.1 → ftmq-4.3.2}/ftmq/util.py

@@ -1,10 +1,11 @@
-from functools import
-from typing import Any, Generator,
+from functools import lru_cache
+from typing import Any, Generator, Type
 
 import pycountry
-from anystore.
+from anystore.functools import weakref_cache as cache
+from anystore.types import SDict, StrGenerator
 from banal import ensure_list, is_listish
-from followthemoney import E
+from followthemoney import E, model
 from followthemoney.compare import _normalize_names
 from followthemoney.dataset import Dataset
 from followthemoney.entity import ValueEntity

@@ -12,7 +13,10 @@ from followthemoney.proxy import EntityProxy
 from followthemoney.schema import Schema
 from followthemoney.types import registry
 from followthemoney.util import make_entity_id, sanitize_text
-from normality import
+from normality import latinize_text, slugify, squash_spaces
+from rigour.names import Name, Symbol, tag_org_name, tag_person_name
+from rigour.names.tokenize import normalize_name
+from rigour.text.scripts import can_latinize
 
 from ftmq.enums import Comparators
 from ftmq.types import Entity

@@ -317,7 +321,7 @@ def clean_string(value: Any) -> str | None:
     value = sanitize_text(value)
     if value is None:
         return
-    return
+    return squash_spaces(value)
 
 
 def clean_name(value: Any) -> str | None:
@@ -377,18 +381,21 @@ def make_fingerprint(value: Any) -> str | None:
 
 
 def entity_fingerprints(entity: EntityProxy) -> set[str]:
-    """Get the set of entity name fingerprints
-
-    return
+    """Get the set of entity name fingerprints, latinized if the alphabet allows
+    it and with org / person tags removed depending on entity schema"""
+    return make_fingerprints(*entity.names, schemata={entity.schema})
 
 
-def make_fingerprints(schemata: set[Schema]
-    """
+def make_fingerprints(*names: str, schemata: set[Schema] | None = None) -> set[str]:
+    """Get the set of name fingerprints, latinized if the alphabet allows
+    it and with org / person tags removed depending on given schemata"""
     # FIXME private import
+    schemata = schemata or {model["LegalEntity"]}
     fps: set[str] = set()
     for schema in schemata:
         fps.update(set(_normalize_names(schema, names)))
-
+    # add latinized if appropriate
+    return {latinize_text(fp) if can_latinize(fp) else fp for fp in fps}
 
 
 def make_string_id(*values: Any) -> str | None:
@@ -476,3 +483,64 @@ def must_str(value: Any) -> str:
     if not value:
         raise ValueError(f"Value invalid: `{value}`")
     return value
+
+
+SELECT_SYMBOLS = "__symbols__"
+SELECT_ANNOTATED = "__annotated__"
+
+
+def get_name_symbols(schema: Schema, *names: str) -> set[Symbol]:
+    """Get the rigour names symbols for the given schema and list of names"""
+    symbols: set[Symbol] = set()
+    if schema.is_a("Person"):
+        taggers = [tag_person_name]
+    elif schema.is_a("Organization"):
+        taggers = [tag_org_name]
+    elif schema.is_a("LegalEntity"):
+        taggers = [tag_org_name, tag_person_name]
+    else:
+        return symbols
+    for name in names:
+        n = Name(name)
+        for tagger in taggers:
+            for symbol in tagger(n, normalize_name).symbols:
+                symbols.add(symbol)
+    return symbols
+
+
+def get_symbols(entity: EntityProxy) -> set[Symbol]:
+    """Get the rigour names symbols for the given entity"""
+    if not entity.schema.is_a("LegalEntity"):
+        return set()
+    names = entity.get_type_values(registry.name, matchable=True)
+    return get_name_symbols(entity.schema, *names)
+
+
+def inline_symbols(entity: EntityProxy) -> None:
+    """Get the rigour names symbols for the given entity and write them to `indexText`"""
+    # clean up old symbols from indexText:
+    for text in entity.pop("indexText"):
+        if not text.startswith(SELECT_SYMBOLS):
+            entity.add("indexText", text)
+    symbols = get_symbols(entity)
+    entity.add("indexText", f"{SELECT_SYMBOLS} {','.join(map(str, symbols))}")
+
+
+def select_data(e: EntityProxy, prefix: str) -> StrGenerator:
+    """Select arbitrary stored data in `indexText` identified by given prefix"""
+    for text in e.get("indexText", quiet=True):
+        if text.startswith(prefix):
+            yield text.replace(prefix, "").strip()
+
+
+def select_symbols(e: EntityProxy) -> set[str]:
+    """Select stored symbols in `indexText`"""
+    symbols: set[str] = set()
+    for data in select_data(e, SELECT_SYMBOLS):
+        symbols.update(data.split(","))
+    return symbols
+
+
+def select_annotations(e: EntityProxy) -> set[str]:
+    """Select stored annotations in `indexText`"""
+    return {s for s in select_data(e, SELECT_ANNOTATED)}
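The new name helpers in `ftmq/util.py` combine rigour's person/organization name tagging with schema-aware, latinized fingerprints. A small sketch of calling them directly (the example names are made up; signatures follow the diff above):

```python
# Sketch only: the example names are made up.
from followthemoney import model

from ftmq.util import get_name_symbols, make_fingerprints

schema = model["Person"]
names = ["Dr. Jane Doe", "Джейн Доу"]

# person/org title tokens become rigour symbols, chosen by schema
print(get_name_symbols(schema, *names))

# fingerprints are normalized per schema and latinized where the script allows it
print(make_fingerprints(*names, schemata={schema}))
```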
{ftmq-4.1.1 → ftmq-4.3.2}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "ftmq"
-version = "4.1.1"
+version = "4.3.2"
 description = "followthemoney query dsl and io helpers"
 authors = [{ name = "Simon Wörpel", email = "simon.woerpel@pm.me" }]
 license = "AGPLv3+"

@@ -15,9 +15,10 @@ classifiers = [
 ]
 requires-python = ">=3.11,<4"
 dependencies = [
-    "anystore (>=0.
-    "followthemoney (>=4.
-    "nomenklatura (>=4.1.
+    "anystore (>=0.4.0,<0.5.0)",
+    "followthemoney (>=4.3.2,<5.0.0)",
+    "nomenklatura (>=4.1.10,<5.0.0)",
+    "rigour (>=1.4.1,<2.0.0)",
     "click (>=8.2.1,<9.0.0)",
     "click-default-group (>=1.2.4,<2.0.0)",
     "orjson (>=3.10.18,<4.0.0)",

@@ -29,12 +30,12 @@ dependencies = [
 [project.optional-dependencies]
 level = ["plyvel (>=1.5.1,<2.0.0)"]
 sql = ["sqlalchemy (>=2.0.36,<3.0.0)"]
-postgres = ["sqlalchemy (>=2.0.36,<3.0.0)", "
+postgres = ["sqlalchemy (>=2.0.36,<3.0.0)", "psycopg[pool] (>=3.2.9,<4.0.0)"]
 redis = ["redis (>=5.2.1,<6.0.0)", "fakeredis (>=2.26.2,<3.0.0)"]
 lake = [
-    "duckdb (>=1.
-    "pandas (>=2.3.
-    "deltalake (>=1.1
+    "duckdb (>=1.4.1,<2.0.0)",
+    "pandas (>=2.3.3,<3.0.0)",
+    "deltalake (>=1.2.1,<2.0.0)",
     "pyarrow (>=21.0.0,<22.0.0)",
 ]
 aleph = ["furl (>=2.1.4,<3.0.0)", "alephclient (>=2.6.0,<3.0.0)"]

@@ -50,19 +51,19 @@ Issues = "https://github.com/dataresearchcenter/ftmq/issues"
 
 [tool.poetry.group.dev.dependencies]
 pytest = ">=7.4.3,<9.0.0"
-pytest-cov = ">=4.1,<
+pytest-cov = ">=4.1,<8.0"
 pytest-env = "^1.1.1"
 black = ">=23.11,<26.0"
-isort = "^
+isort = "^7.0.0"
 mypy = "^1.17.1"
 pre-commit = "^4.0.1"
 flake8 = ">=6.1,<8.0"
 ipdb = "^0.13.13"
 bump2version = "^1.0.1"
 mkdocs = "^1.6.1"
-
-
-mkdocs-material = "^9.6.
+mkdocs-autorefs = "^1.4.3"
+mkdocstrings-python = "^1.18.2"
+mkdocs-material = "^9.6.18"
 mkdocs-click = "^0.9.0"
 
 [build-system]
ftmq-4.1.1/ftmq/store/fragments/store.py

@@ -1,43 +0,0 @@
-from sqlalchemy import MetaData, create_engine
-from sqlalchemy import inspect as sqlalchemy_inspect
-
-from ftmq.store.fragments.dataset import Fragments
-from ftmq.store.fragments.utils import NULL_ORIGIN
-
-
-class Store(object):
-    """A database containing multiple tables that represent
-    FtM-store datasets."""
-
-    PREFIX = "ftm"
-
-    def __init__(
-        self,
-        database_uri: str,
-        **config,
-    ):
-        self.database_uri = database_uri
-        # config.setdefault('pool_size', 1)
-        self.engine = create_engine(database_uri, future=True, **config)
-        self.is_postgres = self.engine.dialect.name == "postgresql"
-        self.meta = MetaData()
-
-    def get(self, name, origin=NULL_ORIGIN):
-        return Fragments(self, name, origin=origin)
-
-    def all(self, origin=NULL_ORIGIN):
-        prefix = f"{self.PREFIX}_"
-        inspect = sqlalchemy_inspect(self.engine)
-        for table in inspect.get_table_names():
-            if table.startswith(prefix):
-                name = table[len(prefix) :]
-                yield Fragments(self, name, origin=origin)
-
-    def close(self):
-        self.engine.dispose()
-
-    def __len__(self):
-        return len(list(self.all()))
-
-    def __repr__(self):
-        return "<Store(%r)>" % self.engine