ftmq 4.1.0__tar.gz → 4.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. {ftmq-4.1.0 → ftmq-4.3.1}/PKG-INFO +12 -9
  2. {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/__init__.py +1 -1
  3. {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/cli.py +81 -0
  4. {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/filters.py +16 -0
  5. {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/io.py +0 -2
  6. {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/model/dataset.py +1 -1
  7. {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/query.py +21 -0
  8. {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/sql.py +13 -3
  9. {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/store/__init__.py +1 -1
  10. {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/store/base.py +1 -1
  11. {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/store/fragments/__init__.py +1 -1
  12. {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/store/fragments/dataset.py +151 -21
  13. {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/store/fragments/loader.py +13 -5
  14. ftmq-4.3.1/ftmq/store/fragments/store.py +71 -0
  15. {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/store/lake.py +58 -31
  16. {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/types.py +0 -4
  17. {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/util.py +90 -4
  18. {ftmq-4.1.0 → ftmq-4.3.1}/pyproject.toml +15 -14
  19. ftmq-4.1.0/ftmq/store/fragments/store.py +0 -43
  20. {ftmq-4.1.0 → ftmq-4.3.1}/LICENSE +0 -0
  21. {ftmq-4.1.0 → ftmq-4.3.1}/NOTICE +0 -0
  22. {ftmq-4.1.0 → ftmq-4.3.1}/README.md +0 -0
  23. {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/aggregate.py +0 -0
  24. {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/aggregations.py +0 -0
  25. {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/enums.py +0 -0
  26. {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/logging.py +0 -0
  27. {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/model/__init__.py +0 -0
  28. {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/model/entity.py +0 -0
  29. {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/model/mixins.py +0 -0
  30. {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/model/stats.py +0 -0
  31. {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/similar.py +0 -0
  32. {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/store/aleph.py +0 -0
  33. {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/store/fragments/settings.py +0 -0
  34. {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/store/fragments/utils.py +0 -0
  35. {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/store/level.py +0 -0
  36. {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/store/memory.py +0 -0
  37. {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/store/redis.py +0 -0
  38. {ftmq-4.1.0 → ftmq-4.3.1}/ftmq/store/sql.py +0 -0
{ftmq-4.1.0 → ftmq-4.3.1}/PKG-INFO

@@ -1,8 +1,10 @@
- Metadata-Version: 2.3
+ Metadata-Version: 2.4
  Name: ftmq
- Version: 4.1.0
+ Version: 4.3.1
  Summary: followthemoney query dsl and io helpers
  License: AGPLv3+
+ License-File: LICENSE
+ License-File: NOTICE
  Author: Simon Wörpel
  Author-email: simon.woerpel@pm.me
  Requires-Python: >=3.11,<4
@@ -19,24 +21,25 @@ Provides-Extra: postgres
  Provides-Extra: redis
  Provides-Extra: sql
  Requires-Dist: alephclient (>=2.6.0,<3.0.0) ; extra == "aleph"
- Requires-Dist: anystore (>=0.3.9,<0.4.0)
+ Requires-Dist: anystore (>=0.4.0,<0.5.0)
  Requires-Dist: click (>=8.2.1,<9.0.0)
  Requires-Dist: click-default-group (>=1.2.4,<2.0.0)
- Requires-Dist: deltalake (>=1.1.2,<2.0.0) ; extra == "lake"
- Requires-Dist: duckdb (>=1.3.2,<2.0.0) ; extra == "lake"
+ Requires-Dist: deltalake (>=1.2.1,<2.0.0) ; extra == "lake"
+ Requires-Dist: duckdb (>=1.4.1,<2.0.0) ; extra == "lake"
  Requires-Dist: fakeredis (>=2.26.2,<3.0.0) ; extra == "redis"
- Requires-Dist: followthemoney (>=4.1.1,<5.0.0)
+ Requires-Dist: followthemoney (>=4.3.2,<5.0.0)
  Requires-Dist: furl (>=2.1.4,<3.0.0) ; extra == "aleph"
- Requires-Dist: nomenklatura (>=4.1.0,<5.0.0)
+ Requires-Dist: nomenklatura (>=4.1.10,<5.0.0)
  Requires-Dist: orjson (>=3.10.18,<4.0.0)
- Requires-Dist: pandas (>=2.3.1,<3.0.0) ; extra == "lake"
+ Requires-Dist: pandas (>=2.3.3,<3.0.0) ; extra == "lake"
  Requires-Dist: plyvel (>=1.5.1,<2.0.0) ; extra == "level"
- Requires-Dist: psycopg2 (>=2.9.10,<3.0.0) ; extra == "postgres"
+ Requires-Dist: psycopg[pool] (>=3.2.9,<4.0.0) ; extra == "postgres"
  Requires-Dist: pyarrow (>=21.0.0,<22.0.0) ; extra == "lake"
  Requires-Dist: pycountry (>=24.6.1,<25.0.0)
  Requires-Dist: pydantic (>=2.11.3,<3.0.0)
  Requires-Dist: pyicu (>=2.15.2,<3.0.0)
  Requires-Dist: redis (>=5.2.1,<6.0.0) ; extra == "redis"
+ Requires-Dist: rigour (>=1.4.0,<2.0.0)
  Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0) ; extra == "postgres"
  Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0) ; extra == "sql"
  Project-URL: Documentation, https://docs.investigraph.dev/lib/ftmq
{ftmq-4.1.0 → ftmq-4.3.1}/ftmq/__init__.py

@@ -2,7 +2,7 @@ from ftmq.io import smart_read_proxies, smart_write_proxies
  from ftmq.query import Query
  from ftmq.util import make_entity

- __version__ = "4.1.0"
+ __version__ = "4.3.1"
  __all__ = [
      "smart_read_proxies",
      "smart_write_proxies",
{ftmq-4.1.0 → ftmq-4.3.1}/ftmq/cli.py

@@ -1,3 +1,5 @@
+ from datetime import datetime
+
  import click
  from anystore.io import smart_write, smart_write_json, smart_write_model
  from click_default_group import DefaultGroup
@@ -11,6 +13,9 @@ from ftmq.model.dataset import Catalog, Dataset
  from ftmq.model.stats import Collector
  from ftmq.query import Query
  from ftmq.store import get_store
+ from ftmq.store.fragments import get_fragments
+ from ftmq.store.fragments import get_store as get_fragments_store
+ from ftmq.store.fragments.settings import Settings as FragmentsSettings
  from ftmq.util import apply_dataset, parse_unknown_filters

  log = get_logger(__name__)
@@ -311,6 +316,82 @@ def store_iterate(
      smart_write_proxies(output_uri, store.iterate())


+ @cli.group()
+ def fragments():
+     pass
+
+
+ fragments_settings = FragmentsSettings()
+
+
+ @fragments.command("list-datasets")
+ @click.option(
+     "-i",
+     "--input-uri",
+     default=fragments_settings.database_uri,
+     show_default=True,
+     help="input file or uri",
+ )
+ @click.option(
+     "-o", "--output-uri", default="-", show_default=True, help="output file or uri"
+ )
+ def fragments_list_datasets(
+     input_uri: str = fragments_settings.database_uri,
+     output_uri: str = "-",
+ ):
+     """
+     List datasets within a fragments store
+     """
+     store = get_fragments_store(input_uri)
+     datasets = [ds.name for ds in store.all()]
+     smart_write(output_uri, "\n".join(datasets).encode() + b"\n")
+
+
+ @fragments.command("iterate")
+ @click.option(
+     "-i",
+     "--input-uri",
+     default=fragments_settings.database_uri,
+     show_default=True,
+     help="fragments store input uri",
+ )
+ @click.option(
+     "-o", "--output-uri", default="-", show_default=True, help="output file or uri"
+ )
+ @click.option("-d", "--dataset", required=True, help="Dataset name to iterate")
+ @click.option("-s", "--schema", default=None, help="Filter by schema")
+ @click.option(
+     "--since",
+     default=None,
+     help="Filter by timestamp (since), ISO format: YYYY-MM-DDTHH:MM:SS",
+ )
+ @click.option(
+     "--until",
+     default=None,
+     help="Filter by timestamp (until), ISO format: YYYY-MM-DDTHH:MM:SS",
+ )
+ def fragments_iterate(
+     input_uri: str = fragments_settings.database_uri,
+     output_uri: str = "-",
+     dataset: str = None,
+     schema: str | None = None,
+     since: str | None = None,
+     until: str | None = None,
+ ):
+     """
+     Iterate all entities from a fragments dataset
+     """
+     fragments = get_fragments(dataset, database_uri=input_uri)
+
+     # Parse timestamp strings to datetime objects
+     since_dt = datetime.fromisoformat(since) if since else None
+     until_dt = datetime.fromisoformat(until) if until else None
+
+     smart_write_proxies(
+         output_uri, fragments.iterate(schema=schema, since=since_dt, until=until_dt)
+     )
+
+
  @cli.command("aggregate")
  @click.option(
      "-i", "--input-uri", default="-", show_default=True, help="input file or uri"
{ftmq-4.1.0 → ftmq-4.3.1}/ftmq/filters.py

@@ -125,6 +125,21 @@ class DatasetFilter(BaseFilter):
          return False


+ class OriginFilter(BaseFilter):
+     key = "origin"
+
+     def apply(self, entity: Entity) -> bool:
+         if not hasattr(entity, "context"):
+             return False
+         origins = ensure_list(entity.context.get("origin"))
+         if self.comparator == Lookup.EQUALS:
+             return self.value in origins
+         for value in origins:
+             if self.lookup.apply(value):
+                 return True
+         return False
+
+
  class SchemaFilter(BaseFilter):
      key = "schema"

@@ -232,4 +247,5 @@ FILTERS = {
      "reverse": ReverseFilter,
      "entity_id": EntityIdFilter,
      "canonical_id": CanonicalIdFilter,
+     "origin": OriginFilter,
  }
{ftmq-4.1.0 → ftmq-4.3.1}/ftmq/io.py

@@ -127,8 +127,6 @@ def smart_write_proxies(
          for proxy in proxies:
              ix += 1
              bulk.add_entity(proxy)
-             if ix % 1_000 == 0:
-                 log.info("Writing proxy %d ..." % ix)
          return ix

      with smart_open(uri, mode=mode) as fh:
{ftmq-4.1.0 → ftmq-4.3.1}/ftmq/model/dataset.py

@@ -19,7 +19,7 @@ class Dataset(BaseModel, _DatasetModel):
      maintainer: DataPublisher | None = None
      stats: DatasetStats = DatasetStats()
      git_repo: AnyUrl | None = None
-     content_type: ContentType | None = "structured"
+     content_type: ContentType | None = None
      uri: str | None = None

      def iterate(self) -> Entities:
{ftmq-4.1.0 → ftmq-4.3.1}/ftmq/query.py

@@ -4,6 +4,7 @@ from typing import Any, Self, TypeVar

  from banal import ensure_list, is_listish, is_mapping
  from followthemoney import registry
+ from sqlalchemy import Table

  from ftmq.aggregations import Aggregation, Aggregator
  from ftmq.enums import Aggregations, Properties
@@ -12,6 +13,7 @@ from ftmq.filters import (
      DatasetFilter,
      F,
      IdFilter,
+     OriginFilter,
      PropertyFilter,
      ReverseFilter,
      SchemaFilter,
@@ -57,12 +59,14 @@ class Query:
          aggregator: Aggregator | None = None,
          sort: Sort | None = None,
          slice: Slice | None = None,
+         table: Table | None = None,
      ):
          self.filters = set(ensure_list(filters))
          self.aggregations = set(ensure_list(aggregations))
          self.aggregator = aggregator
          self.sort = sort
          self.slice = slice
+         self.table = table

      def __getitem__(self, value: Any) -> Self:
          """
@@ -213,6 +217,23 @@ class Query:
              names.update(ensure_list([s.name for s in f.schemata]))
          return names

+     @property
+     def origins(self) -> set[OriginFilter]:
+         """
+         The current filtered origins
+         """
+         return {f for f in self.filters if isinstance(f, OriginFilter)}
+
+     @property
+     def origin_names(self) -> set[str]:
+         """
+         The names of the current filtered origins
+         """
+         names = set()
+         for f in self.origins:
+             names.update(ensure_list(f.value))
+         return names
+
      @property
      def countries(self) -> set[str]:
          """
{ftmq-4.1.0 → ftmq-4.3.1}/ftmq/sql.py

@@ -32,7 +32,7 @@ from ftmq.enums import (
  from ftmq.filters import F

  if TYPE_CHECKING:
-     from ftmq.query import Q
+     from ftmq.query import Query


  Field: TypeAlias = Properties | PropertyTypes | Fields
@@ -50,10 +50,13 @@ class Sql:
          Comparators.lte: "__le__",
      }

-     def __init__(self, q: "Q") -> None:
+     def __init__(self, q: "Query") -> None:
          self.q = q
          self.metadata = MetaData()
-         self.table = make_statement_table(self.metadata)
+         if q.table is None:
+             self.table = make_statement_table(self.metadata)
+         else:
+             self.table = q.table
          self.META_COLUMNS = {
              "id": self.table.c.canonical_id,
              "dataset": self.table.c.dataset,
@@ -92,6 +95,13 @@ class Sql:
                      for f in sorted(self.q.schemata)
                  )
              )
+         if self.q.origins:
+             clauses.append(
+                 or_(
+                     self.get_expression(self.table.c.origin, f)
+                     for f in sorted(self.q.origins)
+                 )
+             )
          if self.q.reversed:
              rclause = or_(
                  and_(
{ftmq-4.1.0 → ftmq-4.3.1}/ftmq/store/__init__.py

@@ -1,7 +1,7 @@
- from functools import cache
  from pathlib import Path
  from urllib.parse import urlparse

+ from anystore.functools import weakref_cache as cache
  from anystore.types import Uri
  from followthemoney.dataset.dataset import Dataset
  from nomenklatura import Resolver, settings
{ftmq-4.1.0 → ftmq-4.3.1}/ftmq/store/base.py

@@ -1,7 +1,7 @@
- from functools import cache
  from typing import Generator, Iterable
  from urllib.parse import urlparse

+ from anystore.functools import weakref_cache as cache
  from followthemoney import DefaultDataset
  from followthemoney.dataset.dataset import Dataset
  from nomenklatura import store as nk
{ftmq-4.1.0 → ftmq-4.3.1}/ftmq/store/fragments/__init__.py

@@ -1,4 +1,4 @@
- from functools import cache
+ from anystore.functools import weakref_cache as cache

  from ftmq.store.fragments.dataset import Fragments
  from ftmq.store.fragments.settings import Settings
{ftmq-4.1.0 → ftmq-4.3.1}/ftmq/store/fragments/dataset.py

@@ -1,4 +1,5 @@
  import logging
+ from contextlib import contextmanager
  from datetime import datetime
  from typing import Generator, Iterable, TypeAlias

@@ -9,6 +10,7 @@ from normality import slugify
  from sqlalchemy import (
      JSON,
      Column,
+     Connection,
      DateTime,
      String,
      Table,
@@ -22,22 +24,48 @@ from sqlalchemy.exc import OperationalError

  from ftmq.store.fragments.loader import BulkLoader
  from ftmq.store.fragments.utils import NULL_ORIGIN
- from ftmq.types import OriginStatements
+ from ftmq.types import Statements
  from ftmq.util import make_dataset

  log = logging.getLogger(__name__)
  UNDEFINED = (OperationalError,)
  try:
-     from psycopg2.errors import UndefinedTable
+     from psycopg.errors import UndefinedTable

      UNDEFINED = (UndefinedTable, *UNDEFINED)
  except ImportError:
-     pass
+     try:
+         from psycopg2.errors import UndefinedTable
+
+         UNDEFINED = (UndefinedTable, *UNDEFINED)
+     except ImportError:
+         pass


  EntityFragments: TypeAlias = Generator[EntityProxy, None, None]


+ @contextmanager
+ def disable_timeout(conn: Connection, store):
+     # for long running iterations (e.g. re-index in OpenAleph), for postgres we
+     # don't want to get cancelled if a idle_in_transaction_timeout is configured
+     # on the server
+     if store.is_postgres:
+         raw_conn = conn.connection.driver_connection
+         with raw_conn.cursor() as cursor:
+             cursor.execute("SET idle_in_transaction_session_timeout = 0")
+     try:
+         yield conn
+     finally:
+         if store.is_postgres:
+             try:
+                 raw_conn = conn.connection.driver_connection
+                 with raw_conn.cursor() as cursor:
+                     cursor.execute("SET idle_in_transaction_session_timeout = DEFAULT")
+             except Exception:
+                 pass  # Connection might be closed


  class Fragments(object):
      def __init__(self, store, name, origin=NULL_ORIGIN):
          self.store = store
@@ -104,7 +132,9 @@ class Fragments(object):
      def bulk(self, size=1000):
          return BulkLoader(self, size)

-     def fragments(self, entity_ids=None, fragment=None):
+     def fragments(
+         self, entity_ids=None, fragment=None, schema=None, since=None, until=None
+     ):
          stmt = self.table.select()
          entity_ids = ensure_list(entity_ids)
          if len(entity_ids) == 1:
@@ -113,25 +143,42 @@ class Fragments(object):
              stmt = stmt.where(self.table.c.id.in_(entity_ids))
          if fragment is not None:
              stmt = stmt.where(self.table.c.fragment == fragment)
+         if schema is not None:
+             if self.store.is_postgres:
+                 stmt = stmt.where(self.table.c.entity["schema"].astext == schema)
+             else:
+                 # SQLite JSON support - use json_extract function
+                 stmt = stmt.where(
+                     func.json_extract(self.table.c.entity, "$.schema") == schema
+                 )
+         if since is not None:
+             stmt = stmt.where(self.table.c.timestamp >= since)
+         if until is not None:
+             stmt = stmt.where(self.table.c.timestamp <= until)
          stmt = stmt.order_by(self.table.c.id)
          # stmt = stmt.order_by(self.table.c.origin)
          # stmt = stmt.order_by(self.table.c.fragment)
          conn = self.store.engine.connect()
          try:
-             conn = conn.execution_options(stream_results=True)
-             for ent in conn.execute(stmt):
-                 data = {"id": ent.id, "datasets": [self.name], **ent.entity}
-                 if ent.origin != NULL_ORIGIN:
-                     data["origin"] = ent.origin
-                 yield data
+             with disable_timeout(conn, self.store) as conn:
+                 conn = conn.execution_options(stream_results=True)
+                 for ent in conn.execute(stmt):
+                     data = {"id": ent.id, "datasets": [self.name], **ent.entity}
+                     if ent.origin != NULL_ORIGIN:
+                         data["origin"] = ent.origin
+                     yield data
          except Exception:
              self.reset()
              raise
          finally:
              conn.close()

-     def partials(self, entity_id=None, skip_errors=False) -> EntityFragments:
-         for fragment in self.fragments(entity_ids=entity_id):
+     def partials(
+         self, entity_id=None, skip_errors=False, schema=None, since=None, until=None
+     ) -> EntityFragments:
+         for fragment in self.fragments(
+             entity_ids=entity_id, schema=schema, since=since, until=until
+         ):
              try:
                  yield EntityProxy.from_dict(fragment, cleaned=True)
              except Exception:
@@ -140,18 +187,32 @@ class Fragments(object):
                      continue
                  raise

-     def iterate(self, entity_id=None, skip_errors=False) -> EntityFragments:
+     def iterate(
+         self, entity_id=None, skip_errors=False, schema=None, since=None, until=None
+     ) -> EntityFragments:
+         if entity_id is None:
+             log.info("Using batched iteration for complete dataset.")
+             yield from self.iterate_batched(
+                 skip_errors=skip_errors, schema=schema, since=since, until=until
+             )
+             return
          entity = None
          invalid = None
         fragments = 1
-         for partial in self.partials(entity_id=entity_id, skip_errors=skip_errors):
+         for partial in self.partials(
+             entity_id=entity_id,
+             skip_errors=skip_errors,
+             schema=schema,
+             since=since,
+             until=until,
+         ):
              if partial.id == invalid:
                  continue
              if entity is not None:
                  if entity.id == partial.id:
                      fragments += 1
                      if fragments % 10000 == 0:
-                         log.debug(
+                         log.warning(
                              "[%s:%s] aggregated %d fragments...",
                              entity.schema.name,
                              entity.id,
@@ -176,13 +237,78 @@ class Fragments(object):
          if entity is not None:
              yield entity

-     def origin_statements(
+     def iterate_batched(
+         self, skip_errors=False, batch_size=10_000, schema=None, since=None, until=None
+     ) -> EntityFragments:
+         """
+         For large datasets an overall sort is not feasible, so we iterate in
+         sorted batched IDs.
+         """
+         for entity_ids in self.get_sorted_id_batches(
+             batch_size, schema=schema, since=since, until=until
+         ):
+             yield from self.iterate(
+                 entity_id=entity_ids,
+                 skip_errors=skip_errors,
+                 schema=schema,
+                 since=since,
+                 until=until,
+             )
+
+     def get_sorted_id_batches(
+         self, batch_size=10_000, schema=None, since=None, until=None
+     ) -> Generator[list[str], None, None]:
+         """
+         Get sorted ID batches to speed up iteration and useful to parallelize
+         processing of iterator Entities
+         """
+         last_id = None
+         with self.store.engine.connect() as conn:
+             while True:
+                 stmt = select(self.table.c.id).distinct()
+                 if last_id is not None:
+                     stmt = stmt.where(self.table.c.id > last_id)
+                 if schema is not None:
+                     if self.store.is_postgres:
+                         stmt = stmt.where(
+                             self.table.c.entity["schema"].astext == schema
+                         )
+                     else:
+                         # SQLite JSON support - use json_extract function
+                         stmt = stmt.where(
+                             func.json_extract(self.table.c.entity, "$.schema") == schema
+                         )
+                 if since is not None:
+                     stmt = stmt.where(self.table.c.timestamp >= since)
+                 if until is not None:
+                     stmt = stmt.where(self.table.c.timestamp <= until)
+                 stmt = stmt.order_by(self.table.c.id).limit(batch_size)
+                 try:
+                     res = conn.execute(stmt)
+                     entity_ids = [r.id for r in res.fetchall()]
+                     if not entity_ids:
+                         return
+                     yield entity_ids
+                     last_id = entity_ids[-1]
+                 except Exception:
+                     self.reset()
+                     raise
+
+     def get_sorted_ids(
+         self, batch_size=10_000, schema=None, since=None, until=None
+     ) -> Generator[str, None, None]:
+         """Get sorted IDs, optionally filtered by schema"""
+         for batch in self.get_sorted_id_batches(batch_size, schema, since, until):
+             yield from batch
+
+     def statements(
          self,
          entity_ids: Iterable[str] | None = None,
          origin: str | None = None,
         since: datetime | None = None,
-     ) -> OriginStatements:
-         """Iterate unsorted statements with its origins: (Statement, origin)"""
+         until: datetime | None = None,
+     ) -> Statements:
+         """Iterate unsorted statements with its fragment origins"""
          stmt = self.table.select()
          entity_ids = ensure_list(entity_ids)
          if len(entity_ids) == 1:
@@ -192,7 +318,9 @@ class Fragments(object):
          if origin is not None:
              stmt = stmt.where(self.table.c.origin == origin)
          if since is not None:
-             stmt = stmt.where(self.table.c.timestamp > since)
+             stmt = stmt.where(self.table.c.timestamp >= since)
+         if until is not None:
+             stmt = stmt.where(self.table.c.timestamp <= until)
          conn = self.store.engine.connect()
          default_dataset = make_dataset(self.name)
          try:
@@ -204,8 +332,10 @@ class Fragments(object):
                  )
                  for statement in entity.statements:
                      statement.last_seen = fragment.timestamp.isoformat()
-                     origin = fragment.origin if fragment.origin != NULL_ORIGIN else None
-                     yield statement, origin
+                     statement.origin = (
+                         fragment.origin if fragment.origin != NULL_ORIGIN else None
+                     )
+                     yield statement
          except Exception:
              self.reset()
              raise
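A hedged sketch of the new batched helpers, for example to parallelize processing of a large fragments dataset; the database URI and dataset name are placeholders:

```python
# Hedged sketch: get_sorted_id_batches() yields sorted ID batches that can be
# fanned out to workers; iterate(entity_id=...) aggregates fragments per batch.
from ftmq.store.fragments import get_fragments

fragments = get_fragments("my_dataset", database_uri="postgresql://localhost/ftm")
for entity_ids in fragments.get_sorted_id_batches(batch_size=5_000, schema="Person"):
    for entity in fragments.iterate(entity_id=entity_ids):
        ...  # hand off each aggregated entity to downstream processing
```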
{ftmq-4.1.0 → ftmq-4.3.1}/ftmq/store/fragments/loader.py

@@ -26,11 +26,16 @@ EXCEPTIONS = (
      TimeoutError,
  )
  try:
-     from psycopg2 import DatabaseError, OperationalError
+     from psycopg import DatabaseError, OperationalError

      EXCEPTIONS = (DatabaseError, OperationalError, *EXCEPTIONS)
  except ImportError:
-     pass
+     try:
+         from psycopg2 import DatabaseError, OperationalError
+
+         EXCEPTIONS = (DatabaseError, OperationalError, *EXCEPTIONS)
+     except ImportError:
+         pass

  log = logging.getLogger(__name__)

@@ -50,9 +55,12 @@ class BulkLoader(object):
          else:
              entity = dict(entity)
          id_ = entity.pop("id")
-         self.buffer[(id_, origin, fragment)] = entity
-         if len(self.buffer) >= self.size:
-             self.flush()
+         if id_:
+             self.buffer[(id_, origin, fragment)] = entity
+             if len(self.buffer) >= self.size:
+                 self.flush()
+         else:
+             log.warning("Entity has no ID!")

      def _store_values(self, conn, values):
          table = self.dataset.table
ftmq-4.3.1/ftmq/store/fragments/store.py (new file)

@@ -0,0 +1,71 @@
+ from sqlalchemy import MetaData, create_engine
+ from sqlalchemy import inspect as sqlalchemy_inspect
+
+ from ftmq.store.fragments.dataset import Fragments
+ from ftmq.store.fragments.utils import NULL_ORIGIN
+
+
+ class Store(object):
+     """A database containing multiple tables that represent
+     FtM-store datasets."""
+
+     PREFIX = "ftm"
+
+     def _adjust_psycopg3_uri(self, database_uri: str) -> str:
+         """Adjust PostgreSQL URI to use psycopg3 dialect if psycopg is available."""
+         if database_uri.startswith(("postgresql://", "postgres://")):
+             try:
+                 import psycopg  # noqa: F401
+
+                 # Use psycopg3 dialect for better performance and compatibility
+                 if database_uri.startswith("postgresql://"):
+                     return database_uri.replace(
+                         "postgresql://", "postgresql+psycopg://", 1
+                     )
+                 elif database_uri.startswith("postgres://"):
+                     return database_uri.replace(
+                         "postgres://", "postgresql+psycopg://", 1
+                     )
+             except ImportError:
+                 # Fall back to psycopg2 if psycopg3 is not available
+                 pass
+         return database_uri
+
+     def __init__(
+         self,
+         database_uri: str,
+         **config,
+     ):
+         self.database_uri = self._adjust_psycopg3_uri(database_uri)
+
+         # Configure connection pooling for psycopg3
+         config.setdefault("pool_size", 1)
+         if self.database_uri.startswith("postgresql+psycopg://"):
+             config.setdefault("max_overflow", 5)
+             config.setdefault("pool_timeout", 60)
+             config.setdefault("pool_recycle", 3600)
+             config.setdefault("pool_pre_ping", True)
+
+         self.engine = create_engine(self.database_uri, future=True, **config)
+         self.is_postgres = self.engine.dialect.name == "postgresql"
+         self.meta = MetaData()
+
+     def get(self, name, origin=NULL_ORIGIN):
+         return Fragments(self, name, origin=origin)
+
+     def all(self, origin=NULL_ORIGIN):
+         prefix = f"{self.PREFIX}_"
+         inspect = sqlalchemy_inspect(self.engine)
+         for table in inspect.get_table_names():
+             if table.startswith(prefix):
+                 name = table[len(prefix) :]
+                 yield Fragments(self, name, origin=origin)
+
+     def close(self):
+         self.engine.dispose()
+
+     def __len__(self):
+         return len(list(self.all()))
+
+     def __repr__(self):
+         return "<Store(%r)>" % self.engine
{ftmq-4.1.0 → ftmq-4.3.1}/ftmq/store/lake.py

@@ -18,13 +18,14 @@ Layout:
  ```
  """

- from functools import cache
  from pathlib import Path
  from typing import Any, Generator, Iterable
+ from urllib.parse import urlparse

  import duckdb
  import numpy as np
  import pandas as pd
+ from anystore.functools import weakref_cache as cache
  from anystore.lock import Lock
  from anystore.logging import get_logger
  from anystore.store.fs import Store as FSStore
@@ -52,26 +53,35 @@ from sqlalchemy.sql import Select
  from ftmq.query import Query
  from ftmq.store.base import Store
  from ftmq.store.sql import SQLQueryView, SQLStore
- from ftmq.types import OriginStatement, StatementEntities
- from ftmq.util import ensure_entity, get_scope_dataset
+ from ftmq.types import StatementEntities
+ from ftmq.util import apply_dataset, ensure_entity, get_scope_dataset

  log = get_logger(__name__)

- Z_ORDER = ["canonical_id", "schema", "prop", "value"]
+ Z_ORDER = ["canonical_id", "entity_id", "schema", "prop"]
+ TARGET_SIZE = 50 * 10_485_760  # 500 MB
  PARTITION_BY = ["dataset", "bucket", "origin"]
  DEFAULT_ORIGIN = "default"
  BUCKET_DOCUMENT = "document"
  BUCKET_INTERVAL = "interval"
  BUCKET_THING = "thing"
- BLOOM = ColumnProperties(bloom_filter_properties=BloomFilterProperties(True))
+ STATISTICS_BLOOM = ColumnProperties(
+     bloom_filter_properties=BloomFilterProperties(True),
+     statistics_enabled="CHUNK",
+     dictionary_enabled=True,
+ )
+ STATISTICS = ColumnProperties(statistics_enabled="CHUNK", dictionary_enabled=True)
  WRITER = WriterProperties(
+     data_page_size_limit=64 * 1024,
+     dictionary_page_size_limit=512 * 1024,
+     max_row_group_size=500_000,
      compression="SNAPPY",
      column_properties={
-         "canonical_id": BLOOM,
-         "entity_id": BLOOM,
-         "schema": BLOOM,
-         "prop": BLOOM,
-         "value": BLOOM,
+         "canonical_id": STATISTICS,
+         "entity_id": STATISTICS,
+         "schema": STATISTICS,
+         "prop": STATISTICS_BLOOM,
+         "value": STATISTICS_BLOOM,
      },
  )

@@ -111,6 +121,13 @@ class StorageSettings(BaseSettings):
              return not self.endpoint.startswith("https")
          return False

+     @property
+     def duckdb_endpoint(self) -> str | None:
+         if not self.endpoint:
+             return
+         scheme = urlparse(self.endpoint).scheme
+         return self.endpoint[len(scheme) + len("://") :]
+

  storage_settings = StorageSettings()

@@ -154,16 +171,15 @@ def get_schema_bucket(schema_name: str) -> str:
      return BUCKET_THING


- def pack_statement(stmt: Statement, origin: str) -> SDict:
+ def pack_statement(stmt: Statement) -> SDict:
      data = stmt.to_db_row()
-     data["origin"] = origin
      data["bucket"] = get_schema_bucket(data["schema"])
      return data


- def pack_statements(statements: Iterable[OriginStatement]) -> pd.DataFrame:
-     df = pd.DataFrame(pack_statement(*s) for s in statements)
-     df = df.drop_duplicates().sort_values(Z_ORDER)
+ def pack_statements(statements: Iterable[Statement]) -> pd.DataFrame:
+     df = pd.DataFrame(map(pack_statement, statements))
+     df = df.drop_duplicates()  # .sort_values(Z_ORDER)
      df = df.fillna(np.nan)
      return df

@@ -213,9 +229,10 @@ def ensure_schema_buckets(q: Query) -> Select:
  class LakeQueryView(SQLQueryView):
      def query(self, query: Query | None = None) -> StatementEntities:
          if query:
+             query.table = self.store.table
              query = self.ensure_scoped_query(query)
-             # sql = ensure_schema_buckets(query)
-             yield from self.store._iterate(query.sql.statements)
+             sql = ensure_schema_buckets(query)
+             yield from self.store._iterate(sql)
          else:
              yield from super().query(query)

@@ -225,6 +242,7 @@ class LakeStore(SQLStore):
          self._backend: FSStore = FSStore(uri=kwargs.pop("uri"))
          self._partition_by = kwargs.pop("partition_by", PARTITION_BY)
          self._lock: Lock = kwargs.pop("lock", Lock(self._backend))
+         self._enforce_dataset = kwargs.pop("enforce_dataset", False)
          assert isinstance(
              self._backend, FSStore
          ), f"Invalid store backend: `{self._backend.__class__}"
@@ -235,12 +253,13 @@ class LakeStore(SQLStore):
          self.uri = self._backend.uri
          setup_duckdb_storage()

-     def get_deltatable(self) -> DeltaTable:
+     @property
+     def deltatable(self) -> DeltaTable:
          return DeltaTable(self.uri, storage_options=storage_options())

      def _execute(self, q: Select, stream: bool = True) -> Generator[Any, None, None]:
          try:
-             yield from stream_duckdb(q, self.get_deltatable())
+             yield from stream_duckdb(q, self.deltatable)
          except TableNotFoundError:
              pass

@@ -265,7 +284,7 @@ class LakeStore(SQLStore):

      def get_origins(self) -> set[str]:
          q = select(self.table.c.origin).distinct()
-         return set([r.origin for r in stream_duckdb(q, self.get_deltatable())])
+         return set([r.origin for r in stream_duckdb(q, self.deltatable)])


  class LakeWriter(nk.Writer):
@@ -274,21 +293,27 @@ class LakeWriter(nk.Writer):

      def __init__(self, store: Store, origin: str | None = DEFAULT_ORIGIN):
          super().__init__(store)
-         self.batch: set[OriginStatement] = set()
+         self.batch: set[Statement] = set()
          self.origin = origin or DEFAULT_ORIGIN

-     def add_statement(self, stmt: Statement, origin: str | None) -> None:
+     def add_statement(self, stmt: Statement) -> None:
          if stmt.entity_id is None:
              return
-         origin = origin or self.origin
+         stmt.origin = stmt.origin or self.origin
          canonical_id = self.store.linker.get_canonical(stmt.entity_id)
          stmt.canonical_id = canonical_id
-         self.batch.add((stmt, origin))
+         self.batch.add(stmt)

      def add_entity(self, entity: EntityProxy, origin: str | None = None) -> None:
          e = ensure_entity(entity, StatementEntity, self.store.dataset)
+         if self.store._enforce_dataset:
+             e = apply_dataset(e, self.store.dataset, replace=True)
          for stmt in e.statements:
-             self.add_statement(stmt, origin)
+             if origin:
+                 stmt.origin = origin
+             self.add_statement(stmt)
+         # we check here instead of in `add_statement` as this will keep entities
+         # together in the same parquet files`
          if len(self.batch) >= self.BATCH_STATEMENTS:
              self.flush()

@@ -304,8 +329,10 @@ class LakeWriter(nk.Writer):
                  pack_statements(self.batch),
                  partition_by=self.store._partition_by,
                  mode="append",
-                 writer_properties=WRITER,
                  schema_mode="merge",
+                 writer_properties=WRITER,
+                 target_file_size=TARGET_SIZE,
+                 storage_options=storage_options(),
              )

          self.batch = set()
@@ -317,8 +344,7 @@ class LakeWriter(nk.Writer):
          for row in self.store._execute(q):
              statements.append(Statement.from_db_row(row))

-         table = self.store.get_deltatable()
-         table.delete(f"canonical_id = '{entity_id}'")
+         self.store.deltatable.delete(f"canonical_id = '{entity_id}'")
          return statements

      def optimize(
@@ -327,10 +353,11 @@ class LakeWriter(nk.Writer):
          """
          Optimize the storage: Z-Ordering and compacting
          """
-         table = self.store.get_deltatable()
-         table.optimize.z_order(Z_ORDER, writer_properties=WRITER)
+         self.store.deltatable.optimize.z_order(
+             Z_ORDER, writer_properties=WRITER, target_size=TARGET_SIZE
+         )
          if vacuum:
-             table.vacuum(
+             self.store.deltatable.vacuum(
                  retention_hours=vacuum_keep_hours,
                  enforce_retention_duration=False,
                  dry_run=False,
{ftmq-4.1.0 → ftmq-4.3.1}/ftmq/types.py

@@ -20,7 +20,3 @@ ValueEntities: TypeAlias = Generator[ValueEntity, None, None]
  # statements
  Statements: TypeAlias = Generator[Statement, None, None]
  """A generator for Statement instances"""
- OriginStatement: TypeAlias = tuple[Statement, str | None]
- """A statement with its origin"""
- OriginStatements: TypeAlias = Generator[OriginStatement, None, None]
- """A generator for OriginStatement instances"""
{ftmq-4.1.0 → ftmq-4.3.1}/ftmq/util.py

@@ -1,17 +1,22 @@
- from functools import cache, lru_cache
+ from functools import lru_cache
  from typing import Any, Generator, Type

  import pycountry
- from anystore.types import SDict
+ from anystore.functools import weakref_cache as cache
+ from anystore.types import SDict, StrGenerator
  from banal import ensure_list, is_listish
- from followthemoney import E
+ from followthemoney import E, model
+ from followthemoney.compare import _normalize_names
  from followthemoney.dataset import Dataset
  from followthemoney.entity import ValueEntity
  from followthemoney.proxy import EntityProxy
  from followthemoney.schema import Schema
  from followthemoney.types import registry
  from followthemoney.util import make_entity_id, sanitize_text
- from normality import collapse_spaces, slugify
+ from normality import collapse_spaces, latinize_text, slugify
+ from rigour.names import Name, Symbol, tag_org_name, tag_person_name
+ from rigour.names.tokenize import normalize_name
+ from rigour.text.scripts import can_latinize

  from ftmq.enums import Comparators
  from ftmq.types import Entity
@@ -91,6 +96,8 @@ def make_entity(
      etype = entity_type or ValueEntity
      if data.get("id") is None:
          raise ValueError("Entity has no ID.")
+     if etype == EntityProxy:
+         return EntityProxy.from_dict(data)
      if etype == ValueEntity:
          if not data.get("datasets"):
              dataset = make_dataset(default_dataset).name
@@ -373,6 +380,24 @@ def make_fingerprint(value: Any) -> str | None:
      return " ".join(sorted(set(slugify(value).split("-"))))


+ def entity_fingerprints(entity: EntityProxy) -> set[str]:
+     """Get the set of entity name fingerprints, latinized if the alphabet allows
+     it and with org / person tags removed depending on entity schema"""
+     return make_fingerprints(*entity.names, schemata={entity.schema})
+
+
+ def make_fingerprints(*names: str, schemata: set[Schema] | None = None) -> set[str]:
+     """Get the set of name fingerprints, latinized if the alphabet allows
+     it and with org / person tags removed depending on given schemata"""
+     # FIXME private import
+     schemata = schemata or {model["LegalEntity"]}
+     fps: set[str] = set()
+     for schema in schemata:
+         fps.update(set(_normalize_names(schema, names)))
+     # add latinized if appropriate
+     return {latinize_text(fp) if can_latinize(fp) else fp for fp in fps}
+
+
  def make_string_id(*values: Any) -> str | None:
      """
      Compute a hash id based on values
@@ -458,3 +483,64 @@ def must_str(value: Any) -> str:
      if not value:
          raise ValueError(f"Value invalid: `{value}`")
      return value
+
+
+ SELECT_SYMBOLS = "__symbols__"
+ SELECT_ANNOTATED = "__annotated__"
+
+
+ def get_name_symbols(schema: Schema, *names: str) -> set[Symbol]:
+     """Get the rigour names symbols for the given schema and list of names"""
+     symbols: set[Symbol] = set()
+     if schema.is_a("Person"):
+         taggers = [tag_person_name]
+     elif schema.is_a("Organization"):
+         taggers = [tag_org_name]
+     elif schema.is_a("LegalEntity"):
+         taggers = [tag_org_name, tag_person_name]
+     else:
+         return symbols
+     for name in names:
+         n = Name(name)
+         for tagger in taggers:
+             for symbol in tagger(n, normalize_name).symbols:
+                 symbols.add(symbol)
+     return symbols
+
+
+ def get_symbols(entity: EntityProxy) -> set[Symbol]:
+     """Get the rigour names symbols for the given entity"""
+     if not entity.schema.is_a("LegalEntity"):
+         return set()
+     names = entity.get_type_values(registry.name, matchable=True)
+     return get_name_symbols(entity.schema, *names)
+
+
+ def inline_symbols(entity: EntityProxy) -> None:
+     """Get the rigour names symbols for the given entity and write them to `indexText`"""
+     # clean up old symbols from indexText:
+     for text in entity.pop("indexText"):
+         if not text.startswith(SELECT_SYMBOLS):
+             entity.add("indexText", text)
+     symbols = get_symbols(entity)
+     entity.add("indexText", f"{SELECT_SYMBOLS} {','.join(map(str, symbols))}")
+
+
+ def select_data(e: EntityProxy, prefix: str) -> StrGenerator:
+     """Select arbitrary stored data in `indexText` identified by given prefix"""
+     for text in e.get("indexText", quiet=True):
+         if text.startswith(prefix):
+             yield text.replace(prefix, "").strip()
+
+
+ def select_symbols(e: EntityProxy) -> set[str]:
+     """Select stored symbols in `indexText`"""
+     symbols: set[str] = set()
+     for data in select_data(e, SELECT_SYMBOLS):
+         symbols.update(data.split(","))
+     return symbols
+
+
+ def select_annotations(e: EntityProxy) -> set[str]:
+     """Select stored annotations in `indexText`"""
+     return {s for s in select_data(e, SELECT_ANNOTATED)}
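A hedged sketch of the new rigour-backed name helpers; the names and printed values are illustrative only:

```python
# Hedged sketch: schema lookup via the followthemoney model, helpers from the diff above.
from followthemoney import model

from ftmq.util import get_name_symbols, make_fingerprints

org = model["Organization"]
print(make_fingerprints("Siemens Aktiengesellschaft", schemata={org}))
print(get_name_symbols(org, "Siemens Aktiengesellschaft"))  # rigour org-name symbols
```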
{ftmq-4.1.0 → ftmq-4.3.1}/pyproject.toml

@@ -1,6 +1,6 @@
  [project]
  name = "ftmq"
- version = "4.1.0"
+ version = "4.3.1"
  description = "followthemoney query dsl and io helpers"
  authors = [{ name = "Simon Wörpel", email = "simon.woerpel@pm.me" }]
  license = "AGPLv3+"
@@ -15,9 +15,10 @@ classifiers = [
  ]
  requires-python = ">=3.11,<4"
  dependencies = [
-     "anystore (>=0.3.9,<0.4.0)",
-     "followthemoney (>=4.1.1,<5.0.0)",
-     "nomenklatura (>=4.1.0,<5.0.0)",
+     "anystore (>=0.4.0,<0.5.0)",
+     "followthemoney (>=4.3.2,<5.0.0)",
+     "nomenklatura (>=4.1.10,<5.0.0)",
+     "rigour (>=1.4.0,<2.0.0)",
      "click (>=8.2.1,<9.0.0)",
      "click-default-group (>=1.2.4,<2.0.0)",
      "orjson (>=3.10.18,<4.0.0)",
@@ -29,12 +30,12 @@ dependencies = [
  [project.optional-dependencies]
  level = ["plyvel (>=1.5.1,<2.0.0)"]
  sql = ["sqlalchemy (>=2.0.36,<3.0.0)"]
- postgres = ["sqlalchemy (>=2.0.36,<3.0.0)", "psycopg2 (>=2.9.10,<3.0.0)"]
+ postgres = ["sqlalchemy (>=2.0.36,<3.0.0)", "psycopg[pool] (>=3.2.9,<4.0.0)"]
  redis = ["redis (>=5.2.1,<6.0.0)", "fakeredis (>=2.26.2,<3.0.0)"]
  lake = [
-     "duckdb (>=1.3.2,<2.0.0)",
-     "pandas (>=2.3.1,<3.0.0)",
-     "deltalake (>=1.1.2,<2.0.0)",
+     "duckdb (>=1.4.1,<2.0.0)",
+     "pandas (>=2.3.3,<3.0.0)",
+     "deltalake (>=1.2.1,<2.0.0)",
      "pyarrow (>=21.0.0,<22.0.0)",
  ]
  aleph = ["furl (>=2.1.4,<3.0.0)", "alephclient (>=2.6.0,<3.0.0)"]
@@ -50,19 +51,19 @@ Issues = "https://github.com/dataresearchcenter/ftmq/issues"

  [tool.poetry.group.dev.dependencies]
  pytest = ">=7.4.3,<9.0.0"
- pytest-cov = ">=4.1,<7.0"
+ pytest-cov = ">=4.1,<8.0"
  pytest-env = "^1.1.1"
  black = ">=23.11,<26.0"
- isort = "^6.0.1"
- mypy = "^1.17.0"
+ isort = "^7.0.0"
+ mypy = "^1.17.1"
  pre-commit = "^4.0.1"
  flake8 = ">=6.1,<8.0"
  ipdb = "^0.13.13"
  bump2version = "^1.0.1"
  mkdocs = "^1.6.1"
- mkdocstrings-python = "^1.16.10"
- mkdocs-autorefs = "^1.4.1"
- mkdocs-material = "^9.6.15"
+ mkdocs-autorefs = "^1.4.3"
+ mkdocstrings-python = "^1.18.2"
+ mkdocs-material = "^9.6.18"
  mkdocs-click = "^0.9.0"

  [build-system]
ftmq-4.1.0/ftmq/store/fragments/store.py (deleted)

@@ -1,43 +0,0 @@
- from sqlalchemy import MetaData, create_engine
- from sqlalchemy import inspect as sqlalchemy_inspect
-
- from ftmq.store.fragments.dataset import Fragments
- from ftmq.store.fragments.utils import NULL_ORIGIN
-
-
- class Store(object):
-     """A database containing multiple tables that represent
-     FtM-store datasets."""
-
-     PREFIX = "ftm"
-
-     def __init__(
-         self,
-         database_uri: str,
-         **config,
-     ):
-         self.database_uri = database_uri
-         # config.setdefault('pool_size', 1)
-         self.engine = create_engine(database_uri, future=True, **config)
-         self.is_postgres = self.engine.dialect.name == "postgresql"
-         self.meta = MetaData()
-
-     def get(self, name, origin=NULL_ORIGIN):
-         return Fragments(self, name, origin=origin)
-
-     def all(self, origin=NULL_ORIGIN):
-         prefix = f"{self.PREFIX}_"
-         inspect = sqlalchemy_inspect(self.engine)
-         for table in inspect.get_table_names():
-             if table.startswith(prefix):
-                 name = table[len(prefix) :]
-                 yield Fragments(self, name, origin=origin)
-
-     def close(self):
-         self.engine.dispose()
-
-     def __len__(self):
-         return len(list(self.all()))
-
-     def __repr__(self):
-         return "<Store(%r)>" % self.engine
The remaining files listed above (items 20–38) are unchanged between ftmq 4.1.0 and 4.3.1.