ftmq 4.1.0__tar.gz → 4.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ftmq-4.1.0 → ftmq-4.3.1}/PKG-INFO +12 -9
- {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/__init__.py +1 -1
- {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/cli.py +81 -0
- {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/filters.py +16 -0
- {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/io.py +0 -2
- {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/model/dataset.py +1 -1
- {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/query.py +21 -0
- {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/sql.py +13 -3
- {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/store/__init__.py +1 -1
- {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/store/base.py +1 -1
- {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/store/fragments/__init__.py +1 -1
- {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/store/fragments/dataset.py +151 -21
- {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/store/fragments/loader.py +13 -5
- ftmq-4.3.1/ftmq/store/fragments/store.py +71 -0
- {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/store/lake.py +58 -31
- {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/types.py +0 -4
- {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/util.py +90 -4
- {ftmq-4.1.0 → ftmq-4.3.1}/pyproject.toml +15 -14
- ftmq-4.1.0/ftmq/store/fragments/store.py +0 -43
- {ftmq-4.1.0 → ftmq-4.3.1}/LICENSE +0 -0
- {ftmq-4.1.0 → ftmq-4.3.1}/NOTICE +0 -0
- {ftmq-4.1.0 → ftmq-4.3.1}/README.md +0 -0
- {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/aggregate.py +0 -0
- {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/aggregations.py +0 -0
- {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/enums.py +0 -0
- {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/logging.py +0 -0
- {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/model/__init__.py +0 -0
- {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/model/entity.py +0 -0
- {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/model/mixins.py +0 -0
- {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/model/stats.py +0 -0
- {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/similar.py +0 -0
- {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/store/aleph.py +0 -0
- {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/store/fragments/settings.py +0 -0
- {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/store/fragments/utils.py +0 -0
- {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/store/level.py +0 -0
- {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/store/memory.py +0 -0
- {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/store/redis.py +0 -0
- {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/store/sql.py +0 -0
{ftmq-4.1.0 → ftmq-4.3.1}/PKG-INFO

@@ -1,8 +1,10 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: ftmq
-Version: 4.1
+Version: 4.3.1
 Summary: followthemoney query dsl and io helpers
 License: AGPLv3+
+License-File: LICENSE
+License-File: NOTICE
 Author: Simon Wörpel
 Author-email: simon.woerpel@pm.me
 Requires-Python: >=3.11,<4
@@ -19,24 +21,25 @@ Provides-Extra: postgres
 Provides-Extra: redis
 Provides-Extra: sql
 Requires-Dist: alephclient (>=2.6.0,<3.0.0) ; extra == "aleph"
-Requires-Dist: anystore (>=0.
+Requires-Dist: anystore (>=0.4.0,<0.5.0)
 Requires-Dist: click (>=8.2.1,<9.0.0)
 Requires-Dist: click-default-group (>=1.2.4,<2.0.0)
-Requires-Dist: deltalake (>=1.1
-Requires-Dist: duckdb (>=1.
+Requires-Dist: deltalake (>=1.2.1,<2.0.0) ; extra == "lake"
+Requires-Dist: duckdb (>=1.4.1,<2.0.0) ; extra == "lake"
 Requires-Dist: fakeredis (>=2.26.2,<3.0.0) ; extra == "redis"
-Requires-Dist: followthemoney (>=4.
+Requires-Dist: followthemoney (>=4.3.2,<5.0.0)
 Requires-Dist: furl (>=2.1.4,<3.0.0) ; extra == "aleph"
-Requires-Dist: nomenklatura (>=4.1.
+Requires-Dist: nomenklatura (>=4.1.10,<5.0.0)
 Requires-Dist: orjson (>=3.10.18,<4.0.0)
-Requires-Dist: pandas (>=2.3.
+Requires-Dist: pandas (>=2.3.3,<3.0.0) ; extra == "lake"
 Requires-Dist: plyvel (>=1.5.1,<2.0.0) ; extra == "level"
-Requires-Dist:
+Requires-Dist: psycopg[pool] (>=3.2.9,<4.0.0) ; extra == "postgres"
 Requires-Dist: pyarrow (>=21.0.0,<22.0.0) ; extra == "lake"
 Requires-Dist: pycountry (>=24.6.1,<25.0.0)
 Requires-Dist: pydantic (>=2.11.3,<3.0.0)
 Requires-Dist: pyicu (>=2.15.2,<3.0.0)
 Requires-Dist: redis (>=5.2.1,<6.0.0) ; extra == "redis"
+Requires-Dist: rigour (>=1.4.0,<2.0.0)
 Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0) ; extra == "postgres"
 Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0) ; extra == "sql"
 Project-URL: Documentation, https://docs.investigraph.dev/lib/ftmq
{ftmq-4.1.0 → ftmq-4.3.1}/ftmq/cli.py

@@ -1,3 +1,5 @@
+from datetime import datetime
+
 import click
 from anystore.io import smart_write, smart_write_json, smart_write_model
 from click_default_group import DefaultGroup
@@ -11,6 +13,9 @@ from ftmq.model.dataset import Catalog, Dataset
 from ftmq.model.stats import Collector
 from ftmq.query import Query
 from ftmq.store import get_store
+from ftmq.store.fragments import get_fragments
+from ftmq.store.fragments import get_store as get_fragments_store
+from ftmq.store.fragments.settings import Settings as FragmentsSettings
 from ftmq.util import apply_dataset, parse_unknown_filters

 log = get_logger(__name__)
@@ -311,6 +316,82 @@ def store_iterate(
     smart_write_proxies(output_uri, store.iterate())


+@cli.group()
+def fragments():
+    pass
+
+
+fragments_settings = FragmentsSettings()
+
+
+@fragments.command("list-datasets")
+@click.option(
+    "-i",
+    "--input-uri",
+    default=fragments_settings.database_uri,
+    show_default=True,
+    help="input file or uri",
+)
+@click.option(
+    "-o", "--output-uri", default="-", show_default=True, help="output file or uri"
+)
+def fragments_list_datasets(
+    input_uri: str = fragments_settings.database_uri,
+    output_uri: str = "-",
+):
+    """
+    List datasets within a fragments store
+    """
+    store = get_fragments_store(input_uri)
+    datasets = [ds.name for ds in store.all()]
+    smart_write(output_uri, "\n".join(datasets).encode() + b"\n")
+
+
+@fragments.command("iterate")
+@click.option(
+    "-i",
+    "--input-uri",
+    default=fragments_settings.database_uri,
+    show_default=True,
+    help="fragments store input uri",
+)
+@click.option(
+    "-o", "--output-uri", default="-", show_default=True, help="output file or uri"
+)
+@click.option("-d", "--dataset", required=True, help="Dataset name to iterate")
+@click.option("-s", "--schema", default=None, help="Filter by schema")
+@click.option(
+    "--since",
+    default=None,
+    help="Filter by timestamp (since), ISO format: YYYY-MM-DDTHH:MM:SS",
+)
+@click.option(
+    "--until",
+    default=None,
+    help="Filter by timestamp (until), ISO format: YYYY-MM-DDTHH:MM:SS",
+)
+def fragments_iterate(
+    input_uri: str = fragments_settings.database_uri,
+    output_uri: str = "-",
+    dataset: str = None,
+    schema: str | None = None,
+    since: str | None = None,
+    until: str | None = None,
+):
+    """
+    Iterate all entities from a fragments dataset
+    """
+    fragments = get_fragments(dataset, database_uri=input_uri)
+
+    # Parse timestamp strings to datetime objects
+    since_dt = datetime.fromisoformat(since) if since else None
+    until_dt = datetime.fromisoformat(until) if until else None
+
+    smart_write_proxies(
+        output_uri, fragments.iterate(schema=schema, since=since_dt, until=until_dt)
+    )
+
+
 @cli.command("aggregate")
 @click.option(
     "-i", "--input-uri", default="-", show_default=True, help="input file or uri"
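
The new `fragments` command group is a thin CLI wrapper around the fragments store imported above. A minimal Python sketch of what `ftmq fragments iterate` does under the hood, assuming `smart_write_proxies` lives in `ftmq.io` and `get_fragments()` takes a dataset name plus `database_uri` as shown in the hunk:

```python
# Sketch only: mirrors the fragments_iterate() command above.
from datetime import datetime

from ftmq.io import smart_write_proxies  # assumed import path
from ftmq.store.fragments import get_fragments

fragments = get_fragments("my_dataset", database_uri="sqlite:///fragments.db")
since = datetime.fromisoformat("2024-01-01T00:00:00")

# Stream aggregated entities for one schema, touched since the given timestamp,
# to a file as FtM JSON lines (same as `-o entities.ftm.json` on the CLI).
smart_write_proxies(
    "entities.ftm.json",
    fragments.iterate(schema="Person", since=since),
)
```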
{ftmq-4.1.0 → ftmq-4.3.1}/ftmq/filters.py

@@ -125,6 +125,21 @@ class DatasetFilter(BaseFilter):
         return False


+class OriginFilter(BaseFilter):
+    key = "origin"
+
+    def apply(self, entity: Entity) -> bool:
+        if not hasattr(entity, "context"):
+            return False
+        origins = ensure_list(entity.context.get("origin"))
+        if self.comparator == Lookup.EQUALS:
+            return self.value in origins
+        for value in origins:
+            if self.lookup.apply(value):
+                return True
+        return False
+
+
 class SchemaFilter(BaseFilter):
     key = "schema"

@@ -232,4 +247,5 @@ FILTERS = {
     "reverse": ReverseFilter,
     "entity_id": EntityIdFilter,
     "canonical_id": CanonicalIdFilter,
+    "origin": OriginFilter,
 }
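
The new `OriginFilter` matches against the `origin` values an entity carries in its `context` mapping (as written by the fragments store). A small sketch, assuming the filter constructor takes a value with the default equality comparator like the other filters in this module:

```python
from types import SimpleNamespace

from ftmq.filters import OriginFilter

f = OriginFilter("crawler_a")  # "crawler_a" is a made-up origin label

# apply() looks at entity.context["origin"], so any object exposing a matching
# context mapping passes the filter:
entity_like = SimpleNamespace(context={"origin": ["crawler_a", "crawler_b"]})
assert f.apply(entity_like)
assert not f.apply(SimpleNamespace(context={}))
```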
{ftmq-4.1.0 → ftmq-4.3.1}/ftmq/model/dataset.py

@@ -19,7 +19,7 @@ class Dataset(BaseModel, _DatasetModel):
     maintainer: DataPublisher | None = None
     stats: DatasetStats = DatasetStats()
     git_repo: AnyUrl | None = None
-    content_type: ContentType | None =
+    content_type: ContentType | None = None
     uri: str | None = None

     def iterate(self) -> Entities:
{ftmq-4.1.0 → ftmq-4.3.1}/ftmq/query.py

@@ -4,6 +4,7 @@ from typing import Any, Self, TypeVar

 from banal import ensure_list, is_listish, is_mapping
 from followthemoney import registry
+from sqlalchemy import Table

 from ftmq.aggregations import Aggregation, Aggregator
 from ftmq.enums import Aggregations, Properties
@@ -12,6 +13,7 @@ from ftmq.filters import (
     DatasetFilter,
     F,
     IdFilter,
+    OriginFilter,
     PropertyFilter,
     ReverseFilter,
     SchemaFilter,
@@ -57,12 +59,14 @@ class Query:
         aggregator: Aggregator | None = None,
         sort: Sort | None = None,
         slice: Slice | None = None,
+        table: Table | None = None,
     ):
         self.filters = set(ensure_list(filters))
         self.aggregations = set(ensure_list(aggregations))
         self.aggregator = aggregator
         self.sort = sort
         self.slice = slice
+        self.table = table

     def __getitem__(self, value: Any) -> Self:
         """
@@ -213,6 +217,23 @@ class Query:
             names.update(ensure_list([s.name for s in f.schemata]))
         return names

+    @property
+    def origins(self) -> set[OriginFilter]:
+        """
+        The current filtered origins
+        """
+        return {f for f in self.filters if isinstance(f, OriginFilter)}
+
+    @property
+    def origin_names(self) -> set[str]:
+        """
+        The names of the current filtered origins
+        """
+        names = set()
+        for f in self.origins:
+            names.update(ensure_list(f.value))
+        return names
+
     @property
     def countries(self) -> set[str]:
         """
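
On the `Query` side, origin filters are now exposed alongside datasets and schemata. A short sketch, assuming the usual `Query().where(...)` API in which filter keys are resolved via the `FILTERS` registry ("origin" → `OriginFilter`, see ftmq/filters.py above):

```python
from ftmq.query import Query

q = Query().where(dataset="my_dataset", origin="crawler_a")
print(q.origin_names)  # {"crawler_a"}
print(q.origins)       # the OriginFilter instances collected from q.filters
```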
{ftmq-4.1.0 → ftmq-4.3.1}/ftmq/sql.py

@@ -32,7 +32,7 @@ from ftmq.enums import (
 from ftmq.filters import F

 if TYPE_CHECKING:
-    from ftmq.query import
+    from ftmq.query import Query


 Field: TypeAlias = Properties | PropertyTypes | Fields
@@ -50,10 +50,13 @@ class Sql:
         Comparators.lte: "__le__",
     }

-    def __init__(self, q: "
+    def __init__(self, q: "Query") -> None:
         self.q = q
         self.metadata = MetaData()
-
+        if q.table is None:
+            self.table = make_statement_table(self.metadata)
+        else:
+            self.table = q.table
         self.META_COLUMNS = {
             "id": self.table.c.canonical_id,
             "dataset": self.table.c.dataset,
@@ -92,6 +95,13 @@ class Sql:
                    for f in sorted(self.q.schemata)
                )
            )
+        if self.q.origins:
+            clauses.append(
+                or_(
+                    self.get_expression(self.table.c.origin, f)
+                    for f in sorted(self.q.origins)
+                )
+            )
         if self.q.reversed:
             rclause = or_(
                 and_(
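
`Sql` now compiles against a caller-supplied statement table whenever `Query.table` is set, instead of always building its own; this is what `LakeQueryView` relies on further down (`query.table = self.store.table`). A hedged sketch — the `Sql` import path and the behaviour of `make_statement_table()` are assumptions based on this hunk:

```python
from ftmq.query import Query
from ftmq.sql import Sql  # assumed import path for the Sql helper shown above

# Default: no table on the query, so Sql builds its own statement table.
sql = Sql(Query())
print(sql.table.c.canonical_id)

# With a pre-built sqlalchemy Table passed via Query(table=...), that table is
# reused for all generated clauses, including the new origin filter clause.
```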
{ftmq-4.1.0 → ftmq-4.3.1}/ftmq/store/__init__.py

@@ -1,7 +1,7 @@
-from functools import cache
 from pathlib import Path
 from urllib.parse import urlparse

+from anystore.functools import weakref_cache as cache
 from anystore.types import Uri
 from followthemoney.dataset.dataset import Dataset
 from nomenklatura import Resolver, settings
{ftmq-4.1.0 → ftmq-4.3.1}/ftmq/store/base.py

@@ -1,7 +1,7 @@
-from functools import cache
 from typing import Generator, Iterable
 from urllib.parse import urlparse

+from anystore.functools import weakref_cache as cache
 from followthemoney import DefaultDataset
 from followthemoney.dataset.dataset import Dataset
 from nomenklatura import store as nk
{ftmq-4.1.0 → ftmq-4.3.1}/ftmq/store/fragments/dataset.py

@@ -1,4 +1,5 @@
 import logging
+from contextlib import contextmanager
 from datetime import datetime
 from typing import Generator, Iterable, TypeAlias
@@ -9,6 +10,7 @@ from normality import slugify
 from sqlalchemy import (
     JSON,
     Column,
+    Connection,
     DateTime,
     String,
     Table,
@@ -22,22 +24,48 @@ from sqlalchemy.exc import OperationalError

 from ftmq.store.fragments.loader import BulkLoader
 from ftmq.store.fragments.utils import NULL_ORIGIN
-from ftmq.types import
+from ftmq.types import Statements
 from ftmq.util import make_dataset

 log = logging.getLogger(__name__)
 UNDEFINED = (OperationalError,)
 try:
-    from
+    from psycopg.errors import UndefinedTable

     UNDEFINED = (UndefinedTable, *UNDEFINED)
 except ImportError:
-
+    try:
+        from psycopg2.errors import UndefinedTable
+
+        UNDEFINED = (UndefinedTable, *UNDEFINED)
+    except ImportError:
+        pass


 EntityFragments: TypeAlias = Generator[EntityProxy, None, None]


+@contextmanager
+def disable_timeout(conn: Connection, store):
+    # for long running iterations (e.g. re-index in OpenAleph), for postgres we
+    # don't want to get cancelled if a idle_in_transaction_timeout is configured
+    # on the server
+    if store.is_postgres:
+        raw_conn = conn.connection.driver_connection
+        with raw_conn.cursor() as cursor:
+            cursor.execute("SET idle_in_transaction_session_timeout = 0")
+    try:
+        yield conn
+    finally:
+        if store.is_postgres:
+            try:
+                raw_conn = conn.connection.driver_connection
+                with raw_conn.cursor() as cursor:
+                    cursor.execute("SET idle_in_transaction_session_timeout = DEFAULT")
+            except Exception:
+                pass  # Connection might be closed
+
+
 class Fragments(object):
     def __init__(self, store, name, origin=NULL_ORIGIN):
         self.store = store
@@ -104,7 +132,9 @@ class Fragments(object):
     def bulk(self, size=1000):
         return BulkLoader(self, size)

-    def fragments(
+    def fragments(
+        self, entity_ids=None, fragment=None, schema=None, since=None, until=None
+    ):
         stmt = self.table.select()
         entity_ids = ensure_list(entity_ids)
         if len(entity_ids) == 1:
@@ -113,25 +143,42 @@ class Fragments(object):
             stmt = stmt.where(self.table.c.id.in_(entity_ids))
         if fragment is not None:
             stmt = stmt.where(self.table.c.fragment == fragment)
+        if schema is not None:
+            if self.store.is_postgres:
+                stmt = stmt.where(self.table.c.entity["schema"].astext == schema)
+            else:
+                # SQLite JSON support - use json_extract function
+                stmt = stmt.where(
+                    func.json_extract(self.table.c.entity, "$.schema") == schema
+                )
+        if since is not None:
+            stmt = stmt.where(self.table.c.timestamp >= since)
+        if until is not None:
+            stmt = stmt.where(self.table.c.timestamp <= until)
         stmt = stmt.order_by(self.table.c.id)
         # stmt = stmt.order_by(self.table.c.origin)
         # stmt = stmt.order_by(self.table.c.fragment)
         conn = self.store.engine.connect()
         try:
-            conn
-
-
-
-
-
+            with disable_timeout(conn, self.store) as conn:
+                conn = conn.execution_options(stream_results=True)
+                for ent in conn.execute(stmt):
+                    data = {"id": ent.id, "datasets": [self.name], **ent.entity}
+                    if ent.origin != NULL_ORIGIN:
+                        data["origin"] = ent.origin
+                    yield data
         except Exception:
             self.reset()
             raise
         finally:
             conn.close()

-    def partials(
-
+    def partials(
+        self, entity_id=None, skip_errors=False, schema=None, since=None, until=None
+    ) -> EntityFragments:
+        for fragment in self.fragments(
+            entity_ids=entity_id, schema=schema, since=since, until=until
+        ):
             try:
                 yield EntityProxy.from_dict(fragment, cleaned=True)
             except Exception:
@@ -140,18 +187,32 @@ class Fragments(object):
                 continue
             raise

-    def iterate(
+    def iterate(
+        self, entity_id=None, skip_errors=False, schema=None, since=None, until=None
+    ) -> EntityFragments:
+        if entity_id is None:
+            log.info("Using batched iteration for complete dataset.")
+            yield from self.iterate_batched(
+                skip_errors=skip_errors, schema=schema, since=since, until=until
+            )
+            return
         entity = None
         invalid = None
         fragments = 1
-        for partial in self.partials(
+        for partial in self.partials(
+            entity_id=entity_id,
+            skip_errors=skip_errors,
+            schema=schema,
+            since=since,
+            until=until,
+        ):
             if partial.id == invalid:
                 continue
             if entity is not None:
                 if entity.id == partial.id:
                     fragments += 1
                     if fragments % 10000 == 0:
-                        log.
+                        log.warning(
                             "[%s:%s] aggregated %d fragments...",
                             entity.schema.name,
                             entity.id,
@@ -176,13 +237,78 @@ class Fragments(object):
         if entity is not None:
             yield entity

-    def
+    def iterate_batched(
+        self, skip_errors=False, batch_size=10_000, schema=None, since=None, until=None
+    ) -> EntityFragments:
+        """
+        For large datasets an overall sort is not feasible, so we iterate in
+        sorted batched IDs.
+        """
+        for entity_ids in self.get_sorted_id_batches(
+            batch_size, schema=schema, since=since, until=until
+        ):
+            yield from self.iterate(
+                entity_id=entity_ids,
+                skip_errors=skip_errors,
+                schema=schema,
+                since=since,
+                until=until,
+            )
+
+    def get_sorted_id_batches(
+        self, batch_size=10_000, schema=None, since=None, until=None
+    ) -> Generator[list[str], None, None]:
+        """
+        Get sorted ID batches to speed up iteration and useful to parallelize
+        processing of iterator Entities
+        """
+        last_id = None
+        with self.store.engine.connect() as conn:
+            while True:
+                stmt = select(self.table.c.id).distinct()
+                if last_id is not None:
+                    stmt = stmt.where(self.table.c.id > last_id)
+                if schema is not None:
+                    if self.store.is_postgres:
+                        stmt = stmt.where(
+                            self.table.c.entity["schema"].astext == schema
+                        )
+                    else:
+                        # SQLite JSON support - use json_extract function
+                        stmt = stmt.where(
+                            func.json_extract(self.table.c.entity, "$.schema") == schema
+                        )
+                if since is not None:
+                    stmt = stmt.where(self.table.c.timestamp >= since)
+                if until is not None:
+                    stmt = stmt.where(self.table.c.timestamp <= until)
+                stmt = stmt.order_by(self.table.c.id).limit(batch_size)
+                try:
+                    res = conn.execute(stmt)
+                    entity_ids = [r.id for r in res.fetchall()]
+                    if not entity_ids:
+                        return
+                    yield entity_ids
+                    last_id = entity_ids[-1]
+                except Exception:
+                    self.reset()
+                    raise
+
+    def get_sorted_ids(
+        self, batch_size=10_000, schema=None, since=None, until=None
+    ) -> Generator[str, None, None]:
+        """Get sorted IDs, optionally filtered by schema"""
+        for batch in self.get_sorted_id_batches(batch_size, schema, since, until):
+            yield from batch
+
+    def statements(
         self,
         entity_ids: Iterable[str] | None = None,
         origin: str | None = None,
         since: datetime | None = None,
-
-
+        until: datetime | None = None,
+    ) -> Statements:
+        """Iterate unsorted statements with its fragment origins"""
         stmt = self.table.select()
         entity_ids = ensure_list(entity_ids)
         if len(entity_ids) == 1:
@@ -192,7 +318,9 @@ class Fragments(object):
         if origin is not None:
             stmt = stmt.where(self.table.c.origin == origin)
         if since is not None:
-            stmt = stmt.where(self.table.c.timestamp
+            stmt = stmt.where(self.table.c.timestamp >= since)
+        if until is not None:
+            stmt = stmt.where(self.table.c.timestamp <= until)
         conn = self.store.engine.connect()
         default_dataset = make_dataset(self.name)
         try:
@@ -204,8 +332,10 @@ class Fragments(object):
                 )
                 for statement in entity.statements:
                     statement.last_seen = fragment.timestamp.isoformat()
-                    origin =
-
+                    statement.origin = (
+                        fragment.origin if fragment.origin != NULL_ORIGIN else None
+                    )
+                    yield statement
         except Exception:
             self.reset()
             raise
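
For whole-dataset reads, `iterate()` now transparently switches to `iterate_batched()`, which pages over sorted ID batches instead of sorting the entire table in one statement; `schema`, `since` and `until` narrow the fragments before aggregation. A sketch using the `get_fragments()` helper imported in cli.py above:

```python
from datetime import datetime

from ftmq.store.fragments import get_fragments

fragments = get_fragments("my_dataset", database_uri="sqlite:///fragments.db")

# Aggregate Person entities from fragments written during 2024; with
# entity_id=None this delegates to iterate_batched() internally.
for proxy in fragments.iterate(
    schema="Person",
    since=datetime(2024, 1, 1),
    until=datetime(2024, 12, 31),
):
    print(proxy.id, proxy.schema.name)

# The sorted ID batches can also be consumed directly, e.g. to fan work out
# to parallel workers:
for id_batch in fragments.get_sorted_id_batches(batch_size=5_000, schema="Person"):
    pass  # hand id_batch to a worker that calls fragments.iterate(entity_id=id_batch)
```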
{ftmq-4.1.0 → ftmq-4.3.1}/ftmq/store/fragments/loader.py

@@ -26,11 +26,16 @@ EXCEPTIONS = (
     TimeoutError,
 )
 try:
-    from
+    from psycopg import DatabaseError, OperationalError

     EXCEPTIONS = (DatabaseError, OperationalError, *EXCEPTIONS)
 except ImportError:
-
+    try:
+        from psycopg2 import DatabaseError, OperationalError
+
+        EXCEPTIONS = (DatabaseError, OperationalError, *EXCEPTIONS)
+    except ImportError:
+        pass

 log = logging.getLogger(__name__)

@@ -50,9 +55,12 @@ class BulkLoader(object):
         else:
             entity = dict(entity)
         id_ = entity.pop("id")
-
-
-        self.
+        if id_:
+            self.buffer[(id_, origin, fragment)] = entity
+            if len(self.buffer) >= self.size:
+                self.flush()
+        else:
+            log.warning("Entity has no ID!")

     def _store_values(self, conn, values):
         table = self.dataset.table
ftmq-4.3.1/ftmq/store/fragments/store.py (new file)

@@ -0,0 +1,71 @@
+from sqlalchemy import MetaData, create_engine
+from sqlalchemy import inspect as sqlalchemy_inspect
+
+from ftmq.store.fragments.dataset import Fragments
+from ftmq.store.fragments.utils import NULL_ORIGIN
+
+
+class Store(object):
+    """A database containing multiple tables that represent
+    FtM-store datasets."""
+
+    PREFIX = "ftm"
+
+    def _adjust_psycopg3_uri(self, database_uri: str) -> str:
+        """Adjust PostgreSQL URI to use psycopg3 dialect if psycopg is available."""
+        if database_uri.startswith(("postgresql://", "postgres://")):
+            try:
+                import psycopg  # noqa: F401
+
+                # Use psycopg3 dialect for better performance and compatibility
+                if database_uri.startswith("postgresql://"):
+                    return database_uri.replace(
+                        "postgresql://", "postgresql+psycopg://", 1
+                    )
+                elif database_uri.startswith("postgres://"):
+                    return database_uri.replace(
+                        "postgres://", "postgresql+psycopg://", 1
+                    )
+            except ImportError:
+                # Fall back to psycopg2 if psycopg3 is not available
+                pass
+        return database_uri
+
+    def __init__(
+        self,
+        database_uri: str,
+        **config,
+    ):
+        self.database_uri = self._adjust_psycopg3_uri(database_uri)
+
+        # Configure connection pooling for psycopg3
+        config.setdefault("pool_size", 1)
+        if self.database_uri.startswith("postgresql+psycopg://"):
+            config.setdefault("max_overflow", 5)
+            config.setdefault("pool_timeout", 60)
+            config.setdefault("pool_recycle", 3600)
+            config.setdefault("pool_pre_ping", True)
+
+        self.engine = create_engine(self.database_uri, future=True, **config)
+        self.is_postgres = self.engine.dialect.name == "postgresql"
+        self.meta = MetaData()
+
+    def get(self, name, origin=NULL_ORIGIN):
+        return Fragments(self, name, origin=origin)
+
+    def all(self, origin=NULL_ORIGIN):
+        prefix = f"{self.PREFIX}_"
+        inspect = sqlalchemy_inspect(self.engine)
+        for table in inspect.get_table_names():
+            if table.startswith(prefix):
+                name = table[len(prefix) :]
+                yield Fragments(self, name, origin=origin)
+
+    def close(self):
+        self.engine.dispose()
+
+    def __len__(self):
+        return len(list(self.all()))
+
+    def __repr__(self):
+        return "<Store(%r)>" % self.engine
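
The rewritten `Store` now prefers the psycopg (v3) SQLAlchemy dialect and applies pooling defaults for it. A small sketch of the observable behaviour, assuming the module import path matches the file location:

```python
from ftmq.store.fragments.store import Store  # assumed import path

store = Store("postgresql://ftm:ftm@localhost/ftm")
print(store.database_uri)  # "postgresql+psycopg://..." when psycopg (v3) is importable
print(store.is_postgres)   # True for a postgresql engine

# Non-postgres URIs pass through untouched and only get pool_size=1 as default:
sqlite_store = Store("sqlite:///fragments.db")
print(sqlite_store.database_uri)  # "sqlite:///fragments.db"

# One Fragments wrapper per "ftm_<name>" table in the database:
for dataset in store.all():
    print(dataset.name)
```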
{ftmq-4.1.0 → ftmq-4.3.1}/ftmq/store/lake.py

@@ -18,13 +18,14 @@ Layout:
 ```
 """

-from functools import cache
 from pathlib import Path
 from typing import Any, Generator, Iterable
+from urllib.parse import urlparse

 import duckdb
 import numpy as np
 import pandas as pd
+from anystore.functools import weakref_cache as cache
 from anystore.lock import Lock
 from anystore.logging import get_logger
 from anystore.store.fs import Store as FSStore
@@ -52,26 +53,35 @@ from sqlalchemy.sql import Select
 from ftmq.query import Query
 from ftmq.store.base import Store
 from ftmq.store.sql import SQLQueryView, SQLStore
-from ftmq.types import
-from ftmq.util import ensure_entity, get_scope_dataset
+from ftmq.types import StatementEntities
+from ftmq.util import apply_dataset, ensure_entity, get_scope_dataset

 log = get_logger(__name__)

-Z_ORDER = ["canonical_id", "
+Z_ORDER = ["canonical_id", "entity_id", "schema", "prop"]
+TARGET_SIZE = 50 * 10_485_760  # 500 MB
 PARTITION_BY = ["dataset", "bucket", "origin"]
 DEFAULT_ORIGIN = "default"
 BUCKET_DOCUMENT = "document"
 BUCKET_INTERVAL = "interval"
 BUCKET_THING = "thing"
-
+STATISTICS_BLOOM = ColumnProperties(
+    bloom_filter_properties=BloomFilterProperties(True),
+    statistics_enabled="CHUNK",
+    dictionary_enabled=True,
+)
+STATISTICS = ColumnProperties(statistics_enabled="CHUNK", dictionary_enabled=True)
 WRITER = WriterProperties(
+    data_page_size_limit=64 * 1024,
+    dictionary_page_size_limit=512 * 1024,
+    max_row_group_size=500_000,
     compression="SNAPPY",
     column_properties={
-        "canonical_id":
-        "entity_id":
-        "schema":
-        "prop":
-        "value":
+        "canonical_id": STATISTICS,
+        "entity_id": STATISTICS,
+        "schema": STATISTICS,
+        "prop": STATISTICS_BLOOM,
+        "value": STATISTICS_BLOOM,
     },
 )

@@ -111,6 +121,13 @@ class StorageSettings(BaseSettings):
             return not self.endpoint.startswith("https")
         return False

+    @property
+    def duckdb_endpoint(self) -> str | None:
+        if not self.endpoint:
+            return
+        scheme = urlparse(self.endpoint).scheme
+        return self.endpoint[len(scheme) + len("://") :]
+

 storage_settings = StorageSettings()

@@ -154,16 +171,15 @@ def get_schema_bucket(schema_name: str) -> str:
     return BUCKET_THING


-def pack_statement(stmt: Statement
+def pack_statement(stmt: Statement) -> SDict:
     data = stmt.to_db_row()
-    data["origin"] = origin
     data["bucket"] = get_schema_bucket(data["schema"])
     return data


-def pack_statements(statements: Iterable[
-    df = pd.DataFrame(pack_statement
-    df = df.drop_duplicates().sort_values(Z_ORDER)
+def pack_statements(statements: Iterable[Statement]) -> pd.DataFrame:
+    df = pd.DataFrame(map(pack_statement, statements))
+    df = df.drop_duplicates()  # .sort_values(Z_ORDER)
     df = df.fillna(np.nan)
     return df

@@ -213,9 +229,10 @@ def ensure_schema_buckets(q: Query) -> Select:
 class LakeQueryView(SQLQueryView):
     def query(self, query: Query | None = None) -> StatementEntities:
         if query:
+            query.table = self.store.table
             query = self.ensure_scoped_query(query)
-
-            yield from self.store._iterate(
+            sql = ensure_schema_buckets(query)
+            yield from self.store._iterate(sql)
         else:
             yield from super().query(query)

@@ -225,6 +242,7 @@ class LakeStore(SQLStore):
         self._backend: FSStore = FSStore(uri=kwargs.pop("uri"))
         self._partition_by = kwargs.pop("partition_by", PARTITION_BY)
         self._lock: Lock = kwargs.pop("lock", Lock(self._backend))
+        self._enforce_dataset = kwargs.pop("enforce_dataset", False)
         assert isinstance(
             self._backend, FSStore
         ), f"Invalid store backend: `{self._backend.__class__}"
@@ -235,12 +253,13 @@ class LakeStore(SQLStore):
         self.uri = self._backend.uri
         setup_duckdb_storage()

-
+    @property
+    def deltatable(self) -> DeltaTable:
         return DeltaTable(self.uri, storage_options=storage_options())

     def _execute(self, q: Select, stream: bool = True) -> Generator[Any, None, None]:
         try:
-            yield from stream_duckdb(q, self.
+            yield from stream_duckdb(q, self.deltatable)
         except TableNotFoundError:
             pass

@@ -265,7 +284,7 @@ class LakeStore(SQLStore):

     def get_origins(self) -> set[str]:
         q = select(self.table.c.origin).distinct()
-        return set([r.origin for r in stream_duckdb(q, self.
+        return set([r.origin for r in stream_duckdb(q, self.deltatable)])


 class LakeWriter(nk.Writer):
@@ -274,21 +293,27 @@ class LakeWriter(nk.Writer):

     def __init__(self, store: Store, origin: str | None = DEFAULT_ORIGIN):
         super().__init__(store)
-        self.batch: set[
+        self.batch: set[Statement] = set()
         self.origin = origin or DEFAULT_ORIGIN

-    def add_statement(self, stmt: Statement
+    def add_statement(self, stmt: Statement) -> None:
         if stmt.entity_id is None:
             return
-        origin = origin or self.origin
+        stmt.origin = stmt.origin or self.origin
         canonical_id = self.store.linker.get_canonical(stmt.entity_id)
         stmt.canonical_id = canonical_id
-        self.batch.add(
+        self.batch.add(stmt)

     def add_entity(self, entity: EntityProxy, origin: str | None = None) -> None:
         e = ensure_entity(entity, StatementEntity, self.store.dataset)
+        if self.store._enforce_dataset:
+            e = apply_dataset(e, self.store.dataset, replace=True)
         for stmt in e.statements:
-
+            if origin:
+                stmt.origin = origin
+            self.add_statement(stmt)
+        # we check here instead of in `add_statement` as this will keep entities
+        # together in the same parquet files`
         if len(self.batch) >= self.BATCH_STATEMENTS:
             self.flush()

@@ -304,8 +329,10 @@ class LakeWriter(nk.Writer):
             pack_statements(self.batch),
             partition_by=self.store._partition_by,
             mode="append",
-            writer_properties=WRITER,
             schema_mode="merge",
+            writer_properties=WRITER,
+            target_file_size=TARGET_SIZE,
+            storage_options=storage_options(),
         )

         self.batch = set()
@@ -317,8 +344,7 @@ class LakeWriter(nk.Writer):
         for row in self.store._execute(q):
             statements.append(Statement.from_db_row(row))

-
-        table.delete(f"canonical_id = '{entity_id}'")
+        self.store.deltatable.delete(f"canonical_id = '{entity_id}'")
         return statements

     def optimize(
@@ -327,10 +353,11 @@ class LakeWriter(nk.Writer):
         """
         Optimize the storage: Z-Ordering and compacting
         """
-
-
+        self.store.deltatable.optimize.z_order(
+            Z_ORDER, writer_properties=WRITER, target_size=TARGET_SIZE
+        )
         if vacuum:
-
+            self.store.deltatable.vacuum(
                 retention_hours=vacuum_keep_hours,
                 enforce_retention_duration=False,
                 dry_run=False,
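
On the lake side, the writer gains explicit origin handling, an optional `enforce_dataset` flag that rewrites incoming statements onto the store's dataset via `apply_dataset()`, and tuned delta-lake writer properties (bloom filters, ~500 MB target files, Z-ordering on optimize). A rough sketch; the constructor arguments beyond those visible in this hunk (`uri`, `partition_by`, `lock`, `enforce_dataset`) and the surrounding SQLStore/nomenklatura plumbing are assumptions:

```python
from ftmq.store.lake import LakeStore, LakeWriter  # assumed import path

store = LakeStore(uri="./data/lake", enforce_dataset=True)
writer = LakeWriter(store, origin="crawl-2024")

proxies = []  # your EntityProxy objects
for proxy in proxies:
    writer.add_entity(proxy)  # statements inherit "crawl-2024" unless origin is set per call
writer.flush()                # appends partitioned parquet files via write_deltalake()

# Periodic housekeeping: Z-order on canonical_id/entity_id/schema/prop,
# compact towards TARGET_SIZE files and optionally vacuum old versions.
writer.optimize(vacuum=True)
```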
{ftmq-4.1.0 → ftmq-4.3.1}/ftmq/types.py

@@ -20,7 +20,3 @@ ValueEntities: TypeAlias = Generator[ValueEntity, None, None]
 # statements
 Statements: TypeAlias = Generator[Statement, None, None]
 """A generator for Statement instances"""
-OriginStatement: TypeAlias = tuple[Statement, str | None]
-"""A statement with its origin"""
-OriginStatements: TypeAlias = Generator[OriginStatement, None, None]
-"""A generator for OriginStatement instances"""
{ftmq-4.1.0 → ftmq-4.3.1}/ftmq/util.py

@@ -1,17 +1,22 @@
-from functools import
+from functools import lru_cache
 from typing import Any, Generator, Type

 import pycountry
-from anystore.
+from anystore.functools import weakref_cache as cache
+from anystore.types import SDict, StrGenerator
 from banal import ensure_list, is_listish
-from followthemoney import E
+from followthemoney import E, model
+from followthemoney.compare import _normalize_names
 from followthemoney.dataset import Dataset
 from followthemoney.entity import ValueEntity
 from followthemoney.proxy import EntityProxy
 from followthemoney.schema import Schema
 from followthemoney.types import registry
 from followthemoney.util import make_entity_id, sanitize_text
-from normality import collapse_spaces, slugify
+from normality import collapse_spaces, latinize_text, slugify
+from rigour.names import Name, Symbol, tag_org_name, tag_person_name
+from rigour.names.tokenize import normalize_name
+from rigour.text.scripts import can_latinize

 from ftmq.enums import Comparators
 from ftmq.types import Entity
@@ -91,6 +96,8 @@ def make_entity(
     etype = entity_type or ValueEntity
     if data.get("id") is None:
         raise ValueError("Entity has no ID.")
+    if etype == EntityProxy:
+        return EntityProxy.from_dict(data)
     if etype == ValueEntity:
         if not data.get("datasets"):
             dataset = make_dataset(default_dataset).name
@@ -373,6 +380,24 @@ def make_fingerprint(value: Any) -> str | None:
     return " ".join(sorted(set(slugify(value).split("-"))))


+def entity_fingerprints(entity: EntityProxy) -> set[str]:
+    """Get the set of entity name fingerprints, latinized if the alphabet allows
+    it and with org / person tags removed depending on entity schema"""
+    return make_fingerprints(*entity.names, schemata={entity.schema})
+
+
+def make_fingerprints(*names: str, schemata: set[Schema] | None = None) -> set[str]:
+    """Get the set of name fingerprints, latinized if the alphabet allows
+    it and with org / person tags removed depending on given schemata"""
+    # FIXME private import
+    schemata = schemata or {model["LegalEntity"]}
+    fps: set[str] = set()
+    for schema in schemata:
+        fps.update(set(_normalize_names(schema, names)))
+    # add latinized if appropriate
+    return {latinize_text(fp) if can_latinize(fp) else fp for fp in fps}
+
+
 def make_string_id(*values: Any) -> str | None:
     """
     Compute a hash id based on values
@@ -458,3 +483,64 @@ def must_str(value: Any) -> str:
     if not value:
         raise ValueError(f"Value invalid: `{value}`")
     return value
+
+
+SELECT_SYMBOLS = "__symbols__"
+SELECT_ANNOTATED = "__annotated__"
+
+
+def get_name_symbols(schema: Schema, *names: str) -> set[Symbol]:
+    """Get the rigour names symbols for the given schema and list of names"""
+    symbols: set[Symbol] = set()
+    if schema.is_a("Person"):
+        taggers = [tag_person_name]
+    elif schema.is_a("Organization"):
+        taggers = [tag_org_name]
+    elif schema.is_a("LegalEntity"):
+        taggers = [tag_org_name, tag_person_name]
+    else:
+        return symbols
+    for name in names:
+        n = Name(name)
+        for tagger in taggers:
+            for symbol in tagger(n, normalize_name).symbols:
+                symbols.add(symbol)
+    return symbols
+
+
+def get_symbols(entity: EntityProxy) -> set[Symbol]:
+    """Get the rigour names symbols for the given entity"""
+    if not entity.schema.is_a("LegalEntity"):
+        return set()
+    names = entity.get_type_values(registry.name, matchable=True)
+    return get_name_symbols(entity.schema, *names)
+
+
+def inline_symbols(entity: EntityProxy) -> None:
+    """Get the rigour names symbols for the given entity and write them to `indexText`"""
+    # clean up old symbols from indexText:
+    for text in entity.pop("indexText"):
+        if not text.startswith(SELECT_SYMBOLS):
+            entity.add("indexText", text)
+    symbols = get_symbols(entity)
+    entity.add("indexText", f"{SELECT_SYMBOLS} {','.join(map(str, symbols))}")
+
+
+def select_data(e: EntityProxy, prefix: str) -> StrGenerator:
+    """Select arbitrary stored data in `indexText` identified by given prefix"""
+    for text in e.get("indexText", quiet=True):
+        if text.startswith(prefix):
+            yield text.replace(prefix, "").strip()
+
+
+def select_symbols(e: EntityProxy) -> set[str]:
+    """Select stored symbols in `indexText`"""
+    symbols: set[str] = set()
+    for data in select_data(e, SELECT_SYMBOLS):
+        symbols.update(data.split(","))
+    return symbols
+
+
+def select_annotations(e: EntityProxy) -> set[str]:
+    """Select stored annotations in `indexText`"""
+    return {s for s in select_data(e, SELECT_ANNOTATED)}
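
The new util helpers build schema-aware name fingerprints (latinized where the script allows it) and rigour name symbols that can be inlined into `indexText`. A sketch using followthemoney's default model:

```python
from followthemoney import model

from ftmq.util import entity_fingerprints, get_symbols, inline_symbols, make_fingerprints

proxy = model.make_entity("Person")
proxy.id = "p1"
proxy.add("name", "Владимир Петров")

print(entity_fingerprints(proxy))              # latinized, schema-aware name fingerprints
print(get_symbols(proxy))                      # rigour person-name symbols for matchable names
print(make_fingerprints("ACME Holding GmbH"))  # defaults to the LegalEntity schema

inline_symbols(proxy)                          # stores a "__symbols__ ..." entry in indexText
print(proxy.get("indexText", quiet=True))
```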
{ftmq-4.1.0 → ftmq-4.3.1}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "ftmq"
-version = "4.1
+version = "4.3.1"
 description = "followthemoney query dsl and io helpers"
 authors = [{ name = "Simon Wörpel", email = "simon.woerpel@pm.me" }]
 license = "AGPLv3+"
@@ -15,9 +15,10 @@ classifiers = [
 ]
 requires-python = ">=3.11,<4"
 dependencies = [
-    "anystore (>=0.
-    "followthemoney (>=4.
-    "nomenklatura (>=4.1.
+    "anystore (>=0.4.0,<0.5.0)",
+    "followthemoney (>=4.3.2,<5.0.0)",
+    "nomenklatura (>=4.1.10,<5.0.0)",
+    "rigour (>=1.4.0,<2.0.0)",
     "click (>=8.2.1,<9.0.0)",
     "click-default-group (>=1.2.4,<2.0.0)",
     "orjson (>=3.10.18,<4.0.0)",
@@ -29,12 +30,12 @@ dependencies = [
 [project.optional-dependencies]
 level = ["plyvel (>=1.5.1,<2.0.0)"]
 sql = ["sqlalchemy (>=2.0.36,<3.0.0)"]
-postgres = ["sqlalchemy (>=2.0.36,<3.0.0)", "
+postgres = ["sqlalchemy (>=2.0.36,<3.0.0)", "psycopg[pool] (>=3.2.9,<4.0.0)"]
 redis = ["redis (>=5.2.1,<6.0.0)", "fakeredis (>=2.26.2,<3.0.0)"]
 lake = [
-    "duckdb (>=1.
-    "pandas (>=2.3.
-    "deltalake (>=1.1
+    "duckdb (>=1.4.1,<2.0.0)",
+    "pandas (>=2.3.3,<3.0.0)",
+    "deltalake (>=1.2.1,<2.0.0)",
     "pyarrow (>=21.0.0,<22.0.0)",
 ]
 aleph = ["furl (>=2.1.4,<3.0.0)", "alephclient (>=2.6.0,<3.0.0)"]
@@ -50,19 +51,19 @@ Issues = "https://github.com/dataresearchcenter/ftmq/issues"

 [tool.poetry.group.dev.dependencies]
 pytest = ">=7.4.3,<9.0.0"
-pytest-cov = ">=4.1,<
+pytest-cov = ">=4.1,<8.0"
 pytest-env = "^1.1.1"
 black = ">=23.11,<26.0"
-isort = "^
-mypy = "^1.17.
+isort = "^7.0.0"
+mypy = "^1.17.1"
 pre-commit = "^4.0.1"
 flake8 = ">=6.1,<8.0"
 ipdb = "^0.13.13"
 bump2version = "^1.0.1"
 mkdocs = "^1.6.1"
-
-
-mkdocs-material = "^9.6.
+mkdocs-autorefs = "^1.4.3"
+mkdocstrings-python = "^1.18.2"
+mkdocs-material = "^9.6.18"
 mkdocs-click = "^0.9.0"

 [build-system]
ftmq-4.1.0/ftmq/store/fragments/store.py (removed, replaced by the new module above)

@@ -1,43 +0,0 @@
-from sqlalchemy import MetaData, create_engine
-from sqlalchemy import inspect as sqlalchemy_inspect
-
-from ftmq.store.fragments.dataset import Fragments
-from ftmq.store.fragments.utils import NULL_ORIGIN
-
-
-class Store(object):
-    """A database containing multiple tables that represent
-    FtM-store datasets."""
-
-    PREFIX = "ftm"
-
-    def __init__(
-        self,
-        database_uri: str,
-        **config,
-    ):
-        self.database_uri = database_uri
-        # config.setdefault('pool_size', 1)
-        self.engine = create_engine(database_uri, future=True, **config)
-        self.is_postgres = self.engine.dialect.name == "postgresql"
-        self.meta = MetaData()
-
-    def get(self, name, origin=NULL_ORIGIN):
-        return Fragments(self, name, origin=origin)
-
-    def all(self, origin=NULL_ORIGIN):
-        prefix = f"{self.PREFIX}_"
-        inspect = sqlalchemy_inspect(self.engine)
-        for table in inspect.get_table_names():
-            if table.startswith(prefix):
-                name = table[len(prefix) :]
-                yield Fragments(self, name, origin=origin)
-
-    def close(self):
-        self.engine.dispose()
-
-    def __len__(self):
-        return len(list(self.all()))
-
-    def __repr__(self):
-        return "<Store(%r)>" % self.engine
All remaining files ({ftmq-4.1.0 → ftmq-4.3.1}/LICENSE, NOTICE, README.md and the other entries listed above with +0 -0) are renamed with the version bump only and contain no changes.