datachain 0.30.5__py3-none-any.whl → 0.39.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/__init__.py +4 -0
- datachain/asyn.py +11 -12
- datachain/cache.py +5 -5
- datachain/catalog/__init__.py +0 -2
- datachain/catalog/catalog.py +276 -354
- datachain/catalog/dependency.py +164 -0
- datachain/catalog/loader.py +8 -3
- datachain/checkpoint.py +43 -0
- datachain/cli/__init__.py +10 -17
- datachain/cli/commands/__init__.py +1 -8
- datachain/cli/commands/datasets.py +42 -27
- datachain/cli/commands/ls.py +15 -15
- datachain/cli/commands/show.py +2 -2
- datachain/cli/parser/__init__.py +3 -43
- datachain/cli/parser/job.py +1 -1
- datachain/cli/parser/utils.py +1 -2
- datachain/cli/utils.py +2 -15
- datachain/client/azure.py +2 -2
- datachain/client/fsspec.py +34 -23
- datachain/client/gcs.py +3 -3
- datachain/client/http.py +157 -0
- datachain/client/local.py +11 -7
- datachain/client/s3.py +3 -3
- datachain/config.py +4 -8
- datachain/data_storage/db_engine.py +12 -6
- datachain/data_storage/job.py +2 -0
- datachain/data_storage/metastore.py +716 -137
- datachain/data_storage/schema.py +20 -27
- datachain/data_storage/serializer.py +105 -15
- datachain/data_storage/sqlite.py +114 -114
- datachain/data_storage/warehouse.py +140 -48
- datachain/dataset.py +109 -89
- datachain/delta.py +117 -42
- datachain/diff/__init__.py +25 -33
- datachain/error.py +24 -0
- datachain/func/aggregate.py +9 -11
- datachain/func/array.py +12 -12
- datachain/func/base.py +7 -4
- datachain/func/conditional.py +9 -13
- datachain/func/func.py +63 -45
- datachain/func/numeric.py +5 -7
- datachain/func/string.py +2 -2
- datachain/hash_utils.py +123 -0
- datachain/job.py +11 -7
- datachain/json.py +138 -0
- datachain/lib/arrow.py +18 -15
- datachain/lib/audio.py +60 -59
- datachain/lib/clip.py +14 -13
- datachain/lib/convert/python_to_sql.py +6 -10
- datachain/lib/convert/values_to_tuples.py +151 -53
- datachain/lib/data_model.py +23 -19
- datachain/lib/dataset_info.py +7 -7
- datachain/lib/dc/__init__.py +2 -1
- datachain/lib/dc/csv.py +22 -26
- datachain/lib/dc/database.py +37 -34
- datachain/lib/dc/datachain.py +518 -324
- datachain/lib/dc/datasets.py +38 -30
- datachain/lib/dc/hf.py +16 -20
- datachain/lib/dc/json.py +17 -18
- datachain/lib/dc/listings.py +5 -8
- datachain/lib/dc/pandas.py +3 -6
- datachain/lib/dc/parquet.py +33 -21
- datachain/lib/dc/records.py +9 -13
- datachain/lib/dc/storage.py +103 -65
- datachain/lib/dc/storage_pattern.py +251 -0
- datachain/lib/dc/utils.py +17 -14
- datachain/lib/dc/values.py +3 -6
- datachain/lib/file.py +187 -50
- datachain/lib/hf.py +7 -5
- datachain/lib/image.py +13 -13
- datachain/lib/listing.py +5 -5
- datachain/lib/listing_info.py +1 -2
- datachain/lib/meta_formats.py +2 -3
- datachain/lib/model_store.py +20 -8
- datachain/lib/namespaces.py +59 -7
- datachain/lib/projects.py +51 -9
- datachain/lib/pytorch.py +31 -23
- datachain/lib/settings.py +188 -85
- datachain/lib/signal_schema.py +302 -64
- datachain/lib/text.py +8 -7
- datachain/lib/udf.py +103 -63
- datachain/lib/udf_signature.py +59 -34
- datachain/lib/utils.py +20 -0
- datachain/lib/video.py +3 -4
- datachain/lib/webdataset.py +31 -36
- datachain/lib/webdataset_laion.py +15 -16
- datachain/listing.py +12 -5
- datachain/model/bbox.py +3 -1
- datachain/namespace.py +22 -3
- datachain/node.py +6 -6
- datachain/nodes_thread_pool.py +0 -1
- datachain/plugins.py +24 -0
- datachain/project.py +4 -4
- datachain/query/batch.py +10 -12
- datachain/query/dataset.py +376 -194
- datachain/query/dispatch.py +112 -84
- datachain/query/metrics.py +3 -4
- datachain/query/params.py +2 -3
- datachain/query/queue.py +2 -1
- datachain/query/schema.py +7 -6
- datachain/query/session.py +190 -33
- datachain/query/udf.py +9 -6
- datachain/remote/studio.py +90 -53
- datachain/script_meta.py +12 -12
- datachain/sql/sqlite/base.py +37 -25
- datachain/sql/sqlite/types.py +1 -1
- datachain/sql/types.py +36 -5
- datachain/studio.py +49 -40
- datachain/toolkit/split.py +31 -10
- datachain/utils.py +39 -48
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/METADATA +26 -38
- datachain-0.39.0.dist-info/RECORD +173 -0
- datachain/cli/commands/query.py +0 -54
- datachain/query/utils.py +0 -36
- datachain-0.30.5.dist-info/RECORD +0 -168
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/WHEEL +0 -0
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/data_storage/sqlite.py
CHANGED
```diff
@@ -1,18 +1,11 @@
 import logging
 import os
 import sqlite3
-from collections.abc import Iterable, Sequence
+from collections.abc import Callable, Iterable, Sequence
 from contextlib import contextmanager
 from functools import cached_property, wraps
 from time import sleep
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    Callable,
-    ClassVar,
-    Optional,
-    Union,
-)
+from typing import TYPE_CHECKING, Any, ClassVar, Union
 
 import sqlalchemy
 from sqlalchemy import (
@@ -27,16 +20,19 @@ from sqlalchemy import (
 from sqlalchemy.dialects import sqlite
 from sqlalchemy.schema import CreateIndex, CreateTable, DropTable
 from sqlalchemy.sql import func
-from sqlalchemy.sql.elements import BinaryExpression, BooleanClauseList
+from sqlalchemy.sql.elements import (
+    BinaryExpression,
+    BooleanClauseList,
+)
 from sqlalchemy.sql.expression import bindparam, cast
 from sqlalchemy.sql.selectable import Select
 from tqdm.auto import tqdm
 
 import datachain.sql.sqlite
-from datachain import semver
 from datachain.data_storage import AbstractDBMetastore, AbstractWarehouse
 from datachain.data_storage.db_engine import DatabaseEngine
 from datachain.data_storage.schema import DefaultSchema
+from datachain.data_storage.warehouse import INSERT_BATCH_SIZE
 from datachain.dataset import DatasetRecord, StorageURI
 from datachain.error import DataChainError, OutdatedDatabaseSchemaError
 from datachain.namespace import Namespace
@@ -44,9 +40,10 @@ from datachain.project import Project
 from datachain.sql.sqlite import create_user_defined_sql_functions, sqlite_dialect
 from datachain.sql.sqlite.base import load_usearch_extension
 from datachain.sql.types import SQLType
-from datachain.utils import DataChainDir, batched_it
+from datachain.utils import DataChainDir, batched, batched_it
 
 if TYPE_CHECKING:
+    from sqlalchemy import CTE, Subquery
     from sqlalchemy.dialects.sqlite import Insert
     from sqlalchemy.engine.base import Engine
     from sqlalchemy.schema import SchemaItem
@@ -104,8 +101,8 @@ def retry_sqlite_locks(func):
 
 
 def get_db_file_in_memory(
-    db_file: Optional[str] = None, in_memory: bool = False
-) -> Optional[str]:
+    db_file: str | None = None, in_memory: bool = False
+) -> str | None:
     """Get in-memory db_file and check that conflicting arguments are not provided."""
     if in_memory:
         if db_file and db_file != ":memory:":
@@ -118,7 +115,7 @@ class SQLiteDatabaseEngine(DatabaseEngine):
     dialect = sqlite_dialect
 
     db: sqlite3.Connection
-    db_file: Optional[str]
+    db_file: str | None
     is_closed: bool
 
     def __init__(
@@ -126,8 +123,8 @@
         engine: "Engine",
         metadata: "MetaData",
         db: sqlite3.Connection,
-        db_file: Optional[str] = None,
-        max_variable_number: Optional[int] = 999,
+        db_file: str | None = None,
+        max_variable_number: int | None = 999,
     ):
         self.engine = engine
         self.metadata = metadata
@@ -137,12 +134,12 @@
         self.max_variable_number = max_variable_number
 
     @classmethod
-    def from_db_file(cls, db_file: Optional[str] = None) -> "SQLiteDatabaseEngine":
+    def from_db_file(cls, db_file: str | None = None) -> "SQLiteDatabaseEngine":
         return cls(*cls._connect(db_file=db_file))
 
     @staticmethod
     def _connect(
-        db_file: Optional[str] = None,
+        db_file: str | None = None,
     ) -> tuple["Engine", "MetaData", sqlite3.Connection, str, int]:
         try:
             if db_file == ":memory:":
@@ -200,10 +197,14 @@
         """
         return (
            SQLiteDatabaseEngine.from_db_file,
-            [self.db_file],
+            [str(self.db_file)],
            {},
        )
 
+    @classmethod
+    def serialize_callable_name(cls) -> str:
+        return "sqlite.from_db_file"
+
     def _reconnect(self) -> None:
         if not self.is_closed:
             raise RuntimeError("Cannot reconnect on still-open DB!")
@@ -227,7 +228,7 @@
     def execute(
         self,
         query,
-        cursor: Optional[sqlite3.Cursor] = None,
+        cursor: sqlite3.Cursor | None = None,
         conn=None,
     ) -> sqlite3.Cursor:
         if self.is_closed:
@@ -246,7 +247,7 @@
 
     @retry_sqlite_locks
     def executemany(
-        self, query, params, cursor: Optional[sqlite3.Cursor] = None, conn=None
+        self, query, params, cursor: sqlite3.Cursor | None = None, conn=None
     ) -> sqlite3.Cursor:
         if cursor:
             return cursor.executemany(self.compile(query).string, params)
@@ -290,6 +291,8 @@
         return self.db.cursor(factory)
 
     def close(self) -> None:
+        if self.is_closed:
+            return
         self.db.close()
         self.is_closed = True
 
@@ -326,7 +329,13 @@
         query = "SELECT name FROM sqlite_master WHERE type='table';"
         return [r[0] for r in self.execute_str(query).fetchall()]
 
-    def create_table(self, table: "Table", if_not_exists: bool = True) -> None:
+    def create_table(
+        self,
+        table: "Table",
+        if_not_exists: bool = True,
+        *,
+        kind: str | None = None,
+    ) -> None:
         self.execute(CreateTable(table, if_not_exists=if_not_exists))
 
     def drop_table(self, table: "Table", if_exists: bool = False) -> None:
```
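The `serialize_callable_name` additions (here and in the metastore/warehouse hunks below) pair with the reworked `datachain/data_storage/serializer.py` (+105 −15): clone parameters now carry a stable string name that can be resolved back to a constructor, instead of pickling the callable itself. A minimal sketch of that pattern, assuming a plain dict registry; `CALLABLE_REGISTRY`, `register`, `serialize`, and `deserialize` are illustrative names, not datachain's API:

```python
from collections.abc import Callable
from typing import Any

# Hypothetical registry keyed by the stable names that
# serialize_callable_name() returns, e.g. "sqlite.from_db_file".
CALLABLE_REGISTRY: dict[str, Callable[..., Any]] = {}

def register(name: str, fn: Callable[..., Any]) -> None:
    CALLABLE_REGISTRY[name] = fn

def serialize(name: str, args: list, kwargs: dict) -> dict:
    # Store a name instead of a pickled callable, so the payload stays
    # portable across processes and package versions.
    return {"callable": name, "args": args, "kwargs": kwargs}

def deserialize(payload: dict) -> Any:
    fn = CALLABLE_REGISTRY[payload["callable"]]
    return fn(*payload["args"], **payload["kwargs"])

# Mirrors clone_params() from the hunk above:
# register("sqlite.from_db_file", SQLiteDatabaseEngine.from_db_file)
# payload = serialize("sqlite.from_db_file", ["/tmp/db.sqlite"], {})
# engine = deserialize(payload)
```

The remaining hunks cover `SQLiteMetastore` and `SQLiteWarehouse`: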
```diff
@@ -346,13 +355,13 @@ class SQLiteMetastore(AbstractDBMetastore):
 
     META_TABLE = "meta"
 
-    db: "SQLiteDatabaseEngine"
+    db: SQLiteDatabaseEngine
 
     def __init__(
         self,
-        uri: Optional[StorageURI] = None,
-        db: Optional["SQLiteDatabaseEngine"] = None,
-        db_file: Optional[str] = None,
+        uri: StorageURI | None = None,
+        db: SQLiteDatabaseEngine | None = None,
+        db_file: str | None = None,
         in_memory: bool = False,
     ):
         uri = uri or StorageURI("")
@@ -367,11 +376,12 @@
 
         self.db = db or SQLiteDatabaseEngine.from_db_file(db_file)
 
-        self._init_meta_table()
-        self._init_meta_schema_value()
-        self._check_schema_version()
-        self._init_tables()
-        self._init_namespaces_projects()
+        with self._init_guard():
+            self._init_meta_table()
+            self._init_meta_schema_value()
+            self._check_schema_version()
+            self._init_tables()
+            self._init_namespaces_projects()
 
     def __exit__(self, exc_type, exc_value, traceback) -> None:
         """Close connection upon exit from context manager."""
@@ -379,7 +389,7 @@
 
     def clone(
         self,
-        uri: Optional[StorageURI] = None,
+        uri: StorageURI | None = None,
         use_new_connection: bool = False,
     ) -> "SQLiteMetastore":
         uri = uri or StorageURI("")
@@ -402,6 +412,10 @@
             },
         )
 
+    @classmethod
+    def serialize_callable_name(cls) -> str:
+        return "sqlite.metastore.init_after_clone"
+
     @classmethod
     def init_after_clone(
         cls,
@@ -458,6 +472,10 @@
         self.default_table_names.append(self._datasets_dependencies.name)
         self.db.create_table(self._jobs, if_not_exists=True)
         self.default_table_names.append(self._jobs.name)
+        self.db.create_table(self._checkpoints, if_not_exists=True)
+        self.default_table_names.append(self._checkpoints.name)
+        self.db.create_table(self._dataset_version_jobs, if_not_exists=True)
+        self.default_table_names.append(self._dataset_version_jobs.name)
 
     def _init_namespaces_projects(self) -> None:
         """
@@ -535,6 +553,26 @@
             self._datasets_versions.c.created_at,
         ]
 
+    def _dataset_dependency_nodes_select_columns(
+        self,
+        namespaces_subquery: "Subquery",
+        dependency_tree_cte: "CTE",
+        datasets_subquery: "Subquery",
+    ) -> list["ColumnElement"]:
+        return [
+            namespaces_subquery.c.name,
+            self._projects.c.name,
+            dependency_tree_cte.c.id,
+            dependency_tree_cte.c.dataset_id,
+            dependency_tree_cte.c.dataset_version_id,
+            datasets_subquery.c.name,
+            self._datasets_versions.c.version,
+            self._datasets_versions.c.created_at,
+            dependency_tree_cte.c.source_dataset_id,
+            dependency_tree_cte.c.source_dataset_version_id,
+            dependency_tree_cte.c.depth,
+        ]
+
     #
     # Jobs
     #
@@ -542,6 +580,15 @@
     def _jobs_insert(self) -> "Insert":
         return sqlite.insert(self._jobs)
 
+    #
+    # Checkpoints
+    #
+    def _checkpoints_insert(self) -> "Insert":
+        return sqlite.insert(self._checkpoints)
+
+    def _dataset_version_jobs_insert(self) -> "Insert":
+        return sqlite.insert(self._dataset_version_jobs)
+
     #
     # Namespaces
     #
@@ -565,15 +612,15 @@ class SQLiteWarehouse(AbstractWarehouse):
     This is currently used for the local cli.
     """
 
-    db: "SQLiteDatabaseEngine"
+    db: SQLiteDatabaseEngine
 
     # Cache for our defined column types to dialect specific TypeEngine relations
     _col_python_type: ClassVar[dict[type, "TypeEngine"]] = {}
 
     def __init__(
         self,
-        db: Optional["SQLiteDatabaseEngine"] = None,
-        db_file: Optional[str] = None,
+        db: SQLiteDatabaseEngine | None = None,
+        db_file: str | None = None,
         in_memory: bool = False,
     ):
         self.schema: DefaultSchema = DefaultSchema()
@@ -601,6 +648,10 @@
             {"db_clone_params": self.db.clone_params()},
         )
 
+    @classmethod
+    def serialize_callable_name(cls) -> str:
+        return "sqlite.warehouse.init_after_clone"
+
     @classmethod
     def init_after_clone(
         cls,
@@ -624,7 +675,7 @@
             only=filter_tables,
         )
 
-    def is_ready(self, timeout: Optional[int] = None) -> bool:
+    def is_ready(self, timeout: int | None = None) -> bool:
         return True
 
     def create_dataset_rows_table(
@@ -654,77 +705,24 @@
             for row in self.db.execute(query, cursor=cur)
         ]
 
-    def merge_dataset_rows(
-        self,
-        src: DatasetRecord,
-        dst: DatasetRecord,
-        src_version: str,
-        dst_version: str,
-    ) -> None:
-        dst_empty = False
-
-        if not self.db.has_table(self.dataset_table_name(src, src_version)):
-            # source table doesn't exist, nothing to do
-            return
-
-        src_dr = self.dataset_rows(src, src_version).table
-
-        if not self.db.has_table(self.dataset_table_name(dst, dst_version)):
-            # destination table doesn't exist, create it
-            self.create_dataset_rows_table(
-                self.dataset_table_name(dst, dst_version),
-                columns=src_dr.columns,
-            )
-            dst_empty = True
-
-        dst_dr = self.dataset_rows(dst, dst_version).table
-        merge_fields = [c.name for c in src_dr.columns if c.name != "sys__id"]
-        select_src = select(*(getattr(src_dr.columns, f) for f in merge_fields))
-
-        if dst_empty:
-            # we don't need union, but just select from source to destination
-            insert_query = sqlite.insert(dst_dr).from_select(merge_fields, select_src)
-        else:
-            dst_version_latest = None
-            # find the previous version of the destination dataset
-            dst_previous_versions = [
-                v.version
-                for v in dst.versions  # type: ignore [union-attr]
-                if semver.compare(v.version, dst_version) == -1
-            ]
-            if dst_previous_versions:
-                dst_version_latest = max(dst_previous_versions)
-
-            dst_dr_latest = self.dataset_rows(dst, dst_version_latest).table
-
-            select_dst_latest = select(
-                *(getattr(dst_dr_latest.c, f) for f in merge_fields)
-            )
-            union_query = sqlalchemy.union(select_src, select_dst_latest)
-            insert_query = (
-                sqlite.insert(dst_dr)
-                .from_select(merge_fields, union_query)
-                .prefix_with("OR IGNORE")
-            )
-
-        self.db.execute(insert_query)
-
     def prepare_entries(self, entries: "Iterable[File]") -> Iterable[dict[str, Any]]:
         return (e.model_dump() for e in entries)
 
-    def insert_rows(self, table: Table, rows: Iterable[dict[str, Any]]) -> None:
-        rows = list(rows)
-        if not rows:
-            return
-
-        with self.db.transaction() as conn:
-            # transactions speeds up inserts significantly as there is no separate
-            # transaction created for each insert row
-            self.db.executemany(
-                table.insert().values({f: bindparam(f) for f in rows[0]}),
-                rows,
-                conn=conn,
-            )
+    def insert_rows(
+        self,
+        table: Table,
+        rows: Iterable[dict[str, Any]],
+        batch_size: int = INSERT_BATCH_SIZE,
+    ) -> None:
+        for row_chunk in batched(rows, batch_size):
+            with self.db.transaction() as conn:
+                # transactions speeds up inserts significantly as there is no separate
+                # transaction created for each insert row
+                self.db.executemany(
+                    table.insert().values({f: bindparam(f) for f in row_chunk[0]}),
+                    row_chunk,
+                    conn=conn,
+                )
 
     def insert_dataset_rows(self, df, dataset: DatasetRecord, version: str) -> int:
         dr = self.dataset_rows(dataset, version)
```
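The new `insert_rows` streams rows through `batched()` and opens one transaction per chunk, bounding memory while still amortizing the per-row transaction overhead. A self-contained sqlite3 sketch of the same pattern; the real `INSERT_BATCH_SIZE` lives in `datachain/data_storage/warehouse.py` and its value is not shown in this diff, so 10_000 here is an assumption:

```python
import itertools
import sqlite3
from collections.abc import Iterable, Iterator

INSERT_BATCH_SIZE = 10_000  # assumed value; the real constant is not shown here

def batched(rows: Iterable[dict], n: int) -> Iterator[list[dict]]:
    # Minimal stand-in for datachain.utils.batched: yield lists of up to n rows.
    it = iter(rows)
    while chunk := list(itertools.islice(it, n)):
        yield chunk

def insert_rows(conn: sqlite3.Connection, rows: Iterable[dict]) -> None:
    for chunk in batched(rows, INSERT_BATCH_SIZE):
        cols = list(chunk[0])
        sql = (
            f"INSERT INTO items ({', '.join(cols)}) "
            f"VALUES ({', '.join(':' + c for c in cols)})"
        )
        with conn:  # one transaction per chunk, not per row
            conn.executemany(sql, chunk)

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE items (id INTEGER, name TEXT)")
insert_rows(conn, ({"id": i, "name": f"n{i}"} for i in range(25_000)))
print(conn.execute("SELECT count(*) FROM items").fetchone()[0])  # 25000
```

The final hunks touch the warehouse's row-copy and join helpers: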
```diff
@@ -768,7 +766,7 @@
         self,
         table: Table,
         query: Select,
-        progress_cb: Optional[Callable[[int], None]] = None,
+        progress_cb: Callable[[int], None] | None = None,
     ) -> None:
         col_id = (
             query.selected_columns.sys__id
@@ -797,7 +795,7 @@
             .limit(None)
         )
 
-        for batch in batched_it(ids,
+        for batch in batched_it(ids, INSERT_BATCH_SIZE):
             batch_ids = [row[0] for row in batch]
             select_q._where_criteria = (col_id.in_(batch_ids),)
             q = table.insert().from_select(list(select_q.selected_columns), select_q)
@@ -852,18 +850,20 @@
         if isinstance(c, BinaryExpression):
             right_left_join = add_left_rows_filter(c)
 
-
-        return sqlalchemy.select(*
+        union_cte = sqlalchemy.union(left_right_join, right_left_join).cte()
+        return sqlalchemy.select(*union_cte.c).select_from(union_cte)
+
+    def _system_row_number_expr(self):
+        return func.row_number().over()
+
+    def _system_random_expr(self):
+        return self._system_row_number_expr() * 1103515245 + 12345
 
     def create_pre_udf_table(self, query: "Select") -> "Table":
         """
         Create a temporary table from a query for use in a UDF.
         """
-        columns = [
-            sqlalchemy.Column(c.name, c.type)
-            for c in query.selected_columns
-            if c.name != "sys__id"
-        ]
+        columns = [sqlalchemy.Column(c.name, c.type) for c in query.selected_columns]
         table = self.create_udf_table(columns)
 
         with tqdm(desc="Preparing", unit=" rows", leave=False) as pbar:
```

(Two removed lines, `for batch in batched_it(ids,` and `return sqlalchemy.select(*`, are truncated in the rendered source; their tails are not recoverable.)
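Two of these additions reward a closer look. The join helper now materializes the union of the two one-sided joins in a CTE before selecting from it, and `_system_random_expr` derives a deterministic pseudo-random key from `row_number()` using 1103515245 and 12345, the multiplier/increment pair of the classic C `rand()` LCG. Because the raw expression grows monotonically with the row number, it is presumably reduced modulo a range wherever randomness is consumed; that part is an assumption, since the diff shows only the expression itself. A quick sketch with an illustrative modulus:

```python
# Multiplier/increment of the classic C rand() LCG, as used by
# _system_random_expr() in the hunk above.
A, C = 1103515245, 12345
M = 2**31  # illustrative modulus; the diff's expression itself is unreduced

def lcg_key(row_number: int) -> int:
    # Deterministic: the same row number always maps to the same key,
    # so any sampling or ordering built on it is reproducible.
    return (row_number * A + C) % M

# A reproducible permutation of rows 1..10: [2, 4, 6, 8, 10, 1, 3, 5, 7, 9]
print(sorted(range(1, 11), key=lcg_key))
```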
|