datachain 0.34.5__py3-none-any.whl → 0.34.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datachain has been flagged as potentially problematic.
- datachain/asyn.py +11 -12
- datachain/cache.py +5 -5
- datachain/catalog/catalog.py +75 -83
- datachain/catalog/loader.py +3 -3
- datachain/checkpoint.py +1 -2
- datachain/cli/__init__.py +2 -4
- datachain/cli/commands/datasets.py +13 -13
- datachain/cli/commands/ls.py +4 -4
- datachain/cli/commands/query.py +3 -3
- datachain/cli/commands/show.py +2 -2
- datachain/cli/parser/job.py +1 -1
- datachain/cli/parser/utils.py +1 -2
- datachain/cli/utils.py +1 -2
- datachain/client/azure.py +2 -2
- datachain/client/fsspec.py +11 -21
- datachain/client/gcs.py +3 -3
- datachain/client/http.py +4 -4
- datachain/client/local.py +4 -4
- datachain/client/s3.py +3 -3
- datachain/config.py +4 -8
- datachain/data_storage/db_engine.py +5 -5
- datachain/data_storage/metastore.py +107 -107
- datachain/data_storage/schema.py +18 -24
- datachain/data_storage/sqlite.py +21 -28
- datachain/data_storage/warehouse.py +13 -13
- datachain/dataset.py +64 -70
- datachain/delta.py +21 -18
- datachain/diff/__init__.py +13 -13
- datachain/func/aggregate.py +9 -11
- datachain/func/array.py +12 -12
- datachain/func/base.py +7 -4
- datachain/func/conditional.py +9 -13
- datachain/func/func.py +45 -42
- datachain/func/numeric.py +5 -7
- datachain/func/string.py +2 -2
- datachain/hash_utils.py +54 -81
- datachain/job.py +8 -8
- datachain/lib/arrow.py +17 -14
- datachain/lib/audio.py +6 -6
- datachain/lib/clip.py +5 -4
- datachain/lib/convert/python_to_sql.py +4 -22
- datachain/lib/convert/values_to_tuples.py +4 -9
- datachain/lib/data_model.py +20 -19
- datachain/lib/dataset_info.py +6 -6
- datachain/lib/dc/csv.py +10 -10
- datachain/lib/dc/database.py +28 -29
- datachain/lib/dc/datachain.py +98 -97
- datachain/lib/dc/datasets.py +22 -22
- datachain/lib/dc/hf.py +4 -4
- datachain/lib/dc/json.py +9 -10
- datachain/lib/dc/listings.py +5 -8
- datachain/lib/dc/pandas.py +3 -6
- datachain/lib/dc/parquet.py +5 -5
- datachain/lib/dc/records.py +5 -5
- datachain/lib/dc/storage.py +12 -12
- datachain/lib/dc/storage_pattern.py +2 -2
- datachain/lib/dc/utils.py +11 -14
- datachain/lib/dc/values.py +3 -6
- datachain/lib/file.py +26 -26
- datachain/lib/hf.py +7 -5
- datachain/lib/image.py +13 -13
- datachain/lib/listing.py +5 -5
- datachain/lib/listing_info.py +1 -2
- datachain/lib/meta_formats.py +1 -2
- datachain/lib/model_store.py +3 -3
- datachain/lib/namespaces.py +4 -6
- datachain/lib/projects.py +5 -9
- datachain/lib/pytorch.py +10 -10
- datachain/lib/settings.py +23 -23
- datachain/lib/signal_schema.py +52 -44
- datachain/lib/text.py +8 -7
- datachain/lib/udf.py +25 -17
- datachain/lib/udf_signature.py +11 -11
- datachain/lib/video.py +3 -4
- datachain/lib/webdataset.py +30 -35
- datachain/lib/webdataset_laion.py +15 -16
- datachain/listing.py +4 -4
- datachain/model/bbox.py +3 -1
- datachain/namespace.py +4 -4
- datachain/node.py +6 -6
- datachain/nodes_thread_pool.py +0 -1
- datachain/plugins.py +1 -7
- datachain/project.py +4 -4
- datachain/query/batch.py +7 -8
- datachain/query/dataset.py +80 -87
- datachain/query/dispatch.py +7 -7
- datachain/query/metrics.py +3 -4
- datachain/query/params.py +2 -3
- datachain/query/schema.py +7 -6
- datachain/query/session.py +7 -7
- datachain/query/udf.py +8 -7
- datachain/query/utils.py +8 -6
- datachain/remote/studio.py +33 -39
- datachain/script_meta.py +12 -12
- datachain/sql/sqlite/base.py +6 -9
- datachain/studio.py +30 -30
- datachain/toolkit/split.py +1 -2
- datachain/utils.py +21 -21
- {datachain-0.34.5.dist-info → datachain-0.34.7.dist-info}/METADATA +2 -3
- datachain-0.34.7.dist-info/RECORD +173 -0
- datachain-0.34.5.dist-info/RECORD +0 -173
- {datachain-0.34.5.dist-info → datachain-0.34.7.dist-info}/WHEEL +0 -0
- {datachain-0.34.5.dist-info → datachain-0.34.7.dist-info}/entry_points.txt +0 -0
- {datachain-0.34.5.dist-info → datachain-0.34.7.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.34.5.dist-info → datachain-0.34.7.dist-info}/top_level.txt +0 -0
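Almost all of the changes below are a mechanical typing cleanup: `Optional[X]` and `Union[X, Y]` annotations are rewritten as PEP 604 unions (`X | None`, `X | Y`), `Callable` is imported from `collections.abc` instead of `typing`, and a few annotations lose now-unneeded quoting. A minimal before/after sketch of the pattern, using made-up names rather than code from the package:

```python
# Before: pre-PEP 604 style, with Callable/Optional/Union from typing
from typing import Callable, Optional, Union

def fetch(key: str, default: Optional[str] = None) -> Union[str, int]:
    return default or len(key)

# After: built-in union syntax (evaluated at runtime on Python 3.10+)
# and Callable imported from collections.abc
from collections.abc import Callable

def fetch_new(key: str, default: str | None = None) -> str | int:
    return default or len(key)

# Both spellings are equivalent for type checkers; on 3.10+ `str | None`
# also evaluates at runtime to a types.UnionType instance.
progress_cb: Callable[[int], None] | None = None
```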
datachain/data_storage/sqlite.py
CHANGED

```diff
@@ -1,18 +1,11 @@
 import logging
 import os
 import sqlite3
-from collections.abc import Iterable, Sequence
+from collections.abc import Callable, Iterable, Sequence
 from contextlib import contextmanager
 from functools import cached_property, wraps
 from time import sleep
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    Callable,
-    ClassVar,
-    Optional,
-    Union,
-)
+from typing import TYPE_CHECKING, Any, ClassVar, Union
 
 import sqlalchemy
 from sqlalchemy import (
@@ -105,8 +98,8 @@ def retry_sqlite_locks(func):
 
 
 def get_db_file_in_memory(
-    db_file: Optional[str] = None, in_memory: bool = False
-) -> Optional[str]:
+    db_file: str | None = None, in_memory: bool = False
+) -> str | None:
     """Get in-memory db_file and check that conflicting arguments are not provided."""
     if in_memory:
         if db_file and db_file != ":memory:":
@@ -119,7 +112,7 @@ class SQLiteDatabaseEngine(DatabaseEngine):
     dialect = sqlite_dialect
 
     db: sqlite3.Connection
-    db_file: Optional[str]
+    db_file: str | None
     is_closed: bool
 
     def __init__(
@@ -127,8 +120,8 @@ class SQLiteDatabaseEngine(DatabaseEngine):
         engine: "Engine",
         metadata: "MetaData",
         db: sqlite3.Connection,
-        db_file: Optional[str] = None,
-        max_variable_number: Optional[int] = 999,
+        db_file: str | None = None,
+        max_variable_number: int | None = 999,
     ):
         self.engine = engine
         self.metadata = metadata
@@ -138,12 +131,12 @@ class SQLiteDatabaseEngine(DatabaseEngine):
         self.max_variable_number = max_variable_number
 
     @classmethod
-    def from_db_file(cls, db_file: Optional[str] = None) -> "SQLiteDatabaseEngine":
+    def from_db_file(cls, db_file: str | None = None) -> "SQLiteDatabaseEngine":
         return cls(*cls._connect(db_file=db_file))
 
     @staticmethod
     def _connect(
-        db_file: Optional[str] = None,
+        db_file: str | None = None,
     ) -> tuple["Engine", "MetaData", sqlite3.Connection, str, int]:
         try:
             if db_file == ":memory:":
@@ -232,7 +225,7 @@ class SQLiteDatabaseEngine(DatabaseEngine):
     def execute(
         self,
         query,
-        cursor: Optional[sqlite3.Cursor] = None,
+        cursor: sqlite3.Cursor | None = None,
         conn=None,
     ) -> sqlite3.Cursor:
         if self.is_closed:
@@ -251,7 +244,7 @@ class SQLiteDatabaseEngine(DatabaseEngine):
 
     @retry_sqlite_locks
     def executemany(
-        self, query, params, cursor: Optional[sqlite3.Cursor] = None, conn=None
+        self, query, params, cursor: sqlite3.Cursor | None = None, conn=None
     ) -> sqlite3.Cursor:
         if cursor:
             return cursor.executemany(self.compile(query).string, params)
@@ -351,13 +344,13 @@ class SQLiteMetastore(AbstractDBMetastore):
 
     META_TABLE = "meta"
 
-    db: "SQLiteDatabaseEngine"
+    db: SQLiteDatabaseEngine
 
     def __init__(
         self,
-        uri: Optional[StorageURI] = None,
-        db: Optional["SQLiteDatabaseEngine"] = None,
-        db_file: Optional[str] = None,
+        uri: StorageURI | None = None,
+        db: SQLiteDatabaseEngine | None = None,
+        db_file: str | None = None,
         in_memory: bool = False,
     ):
         uri = uri or StorageURI("")
@@ -384,7 +377,7 @@ class SQLiteMetastore(AbstractDBMetastore):
 
     def clone(
         self,
-        uri: Optional[StorageURI] = None,
+        uri: StorageURI | None = None,
         use_new_connection: bool = False,
     ) -> "SQLiteMetastore":
         uri = uri or StorageURI("")
@@ -582,15 +575,15 @@ class SQLiteWarehouse(AbstractWarehouse):
     This is currently used for the local cli.
     """
 
-    db: "SQLiteDatabaseEngine"
+    db: SQLiteDatabaseEngine
 
     # Cache for our defined column types to dialect specific TypeEngine relations
     _col_python_type: ClassVar[dict[type, "TypeEngine"]] = {}
 
     def __init__(
         self,
-        db: Optional["SQLiteDatabaseEngine"] = None,
-        db_file: Optional[str] = None,
+        db: SQLiteDatabaseEngine | None = None,
+        db_file: str | None = None,
         in_memory: bool = False,
     ):
         self.schema: DefaultSchema = DefaultSchema()
@@ -645,7 +638,7 @@ class SQLiteWarehouse(AbstractWarehouse):
             only=filter_tables,
         )
 
-    def is_ready(self, timeout: Optional[int] = None) -> bool:
+    def is_ready(self, timeout: int | None = None) -> bool:
         return True
 
     def create_dataset_rows_table(
@@ -791,7 +784,7 @@ class SQLiteWarehouse(AbstractWarehouse):
         self,
         table: Table,
         query: Select,
-        progress_cb: Optional[Callable[[int], None]] = None,
+        progress_cb: Callable[[int], None] | None = None,
     ) -> None:
         col_id = (
             query.selected_columns.sys__id
```
datachain/data_storage/warehouse.py
CHANGED

```diff
@@ -4,8 +4,8 @@ import posixpath
 import random
 import string
 from abc import ABC, abstractmethod
-from collections.abc import Generator, Iterable, Iterator, Sequence
-from typing import TYPE_CHECKING, Any, Callable, Optional, Union
+from collections.abc import Callable, Generator, Iterable, Iterator, Sequence
+from typing import TYPE_CHECKING, Any, Union
 from urllib.parse import urlparse
 
 import attrs
@@ -174,12 +174,12 @@ class AbstractWarehouse(ABC, Serializable):
     #
 
     @abstractmethod
-    def is_ready(self, timeout: Optional[int] = None) -> bool: ...
+    def is_ready(self, timeout: int | None = None) -> bool: ...
 
     def dataset_rows(
         self,
         dataset: DatasetRecord,
-        version: Optional[str] = None,
+        version: str | None = None,
         column: str = "file",
     ):
         version = version or dataset.latest_version
@@ -424,7 +424,7 @@ class AbstractWarehouse(ABC, Serializable):
 
     def dataset_stats(
         self, dataset: DatasetRecord, version: str
-    ) -> tuple[Optional[int], Optional[int]]:
+    ) -> tuple[int | None, int | None]:
         """
         Returns tuple with dataset stats: total number of rows and total dataset size.
         """
@@ -549,7 +549,7 @@ class AbstractWarehouse(ABC, Serializable):
         dr = dataset_rows
         columns = [c.name for c in query.selected_columns]
         for row in self.db.execute(query):
-            d = dict(zip(columns, row))
+            d = dict(zip(columns, row, strict=False))
             yield Node(**{dr.without_object(k): v for k, v in d.items()})
 
     def get_dirs_by_parent_path(
@@ -786,7 +786,7 @@ class AbstractWarehouse(ABC, Serializable):
     def size(
         self,
         dataset_rows: "DataTable",
-        node: Union[Node, dict[str, Any]],
+        node: Node | dict[str, Any],
         count_files: bool = False,
     ) -> tuple[int, int]:
         """
@@ -828,10 +828,10 @@ class AbstractWarehouse(ABC, Serializable):
         self,
         dataset_rows: "DataTable",
         parent_path: str,
-        fields: Optional[Sequence[str]] = None,
-        type: Optional[str] = None,
+        fields: Sequence[str] | None = None,
+        type: str | None = None,
         conds=None,
-        order_by: Optional[Union[str, list[str]]] = None,
+        order_by: str | list[str] | None = None,
         include_subobjects: bool = True,
     ) -> sa.Select:
         if not conds:
@@ -869,7 +869,7 @@ class AbstractWarehouse(ABC, Serializable):
         self,
         dataset_rows: "DataTable",
         node: Node,
-        sort: Optional[Union[list[str], str]] = None,
+        sort: list[str] | str | None = None,
         include_subobjects: bool = True,
     ) -> Iterator[NodeWithPath]:
         """
@@ -927,7 +927,7 @@ class AbstractWarehouse(ABC, Serializable):
     def create_udf_table(
         self,
         columns: Sequence["sa.Column"] = (),
-        name: Optional[str] = None,
+        name: str | None = None,
     ) -> sa.Table:
         """
         Create a temporary table for storing custom signals generated by a UDF.
@@ -948,7 +948,7 @@ class AbstractWarehouse(ABC, Serializable):
         self,
         table: sa.Table,
         query: sa.Select,
-        progress_cb: Optional[Callable[[int], None]] = None,
+        progress_cb: Callable[[int], None] | None = None,
     ) -> None:
         """
         Copy the results of a query into a table.
```
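Apart from annotations, the only behavioral-looking change in warehouse.py is the explicit `strict=False` in `dict(zip(columns, row))`. `zip()` has accepted a `strict` flag since Python 3.10; `strict=False` is the default, so this merely spells out the existing truncate-to-the-shortest behavior (and satisfies linters that flag bare `zip()` calls, such as ruff's B905 rule). A quick standalone illustration:

```python
columns = ["id", "name"]
row = (1, "a", "extra")

# strict=False (the default): silently truncates to the shortest iterable
assert dict(zip(columns, row, strict=False)) == {"id": 1, "name": "a"}

# strict=True would raise instead of truncating
try:
    dict(zip(columns, row, strict=True))
except ValueError as exc:
    print(exc)  # e.g. "zip() argument 2 is longer than argument 1"
```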
datachain/dataset.py
CHANGED

```diff
@@ -3,13 +3,7 @@ import json
 from dataclasses import dataclass, fields
 from datetime import datetime
 from functools import cached_property
-from typing import (
-    Any,
-    NewType,
-    Optional,
-    TypeVar,
-    Union,
-)
+from typing import Any, NewType, TypeVar
 from urllib.parse import urlparse
 
 from packaging.specifiers import SpecifierSet
@@ -43,7 +37,7 @@ DATASET_NAME_REPLACEMENT_CHAR = "_"
 StorageURI = NewType("StorageURI", str)
 
 
-def parse_dataset_uri(uri: str) -> tuple[str, Optional[str]]:
+def parse_dataset_uri(uri: str) -> tuple[str, str | None]:
     """
     Parse dataser uri to extract name and version out of it (if version is defined)
     Example:
@@ -65,7 +59,7 @@ def parse_dataset_uri(uri: str) -> tuple[str, Optional[str]]:
 
 
 def create_dataset_uri(
-    name: str, namespace: str, project: str, version: Optional[str] = None
+    name: str, namespace: str, project: str, version: str | None = None
 ) -> str:
     """
     Creates a dataset uri based on namespace, project, dataset name and optionally
@@ -81,7 +75,7 @@ def create_dataset_uri(
     return uri
 
 
-def parse_dataset_name(name: str) -> tuple[Optional[str], Optional[str], str]:
+def parse_dataset_name(name: str) -> tuple[str | None, str | None, str]:
     """Parses dataset name and returns namespace, project and name"""
     if not name:
         raise InvalidDatasetNameError("Name must be defined to parse it")
@@ -111,7 +105,7 @@ class DatasetDependency:
     name: str
     version: str
     created_at: datetime
-    dependencies: list[Optional["DatasetDependency"]]
+    dependencies: list["DatasetDependency | None"]
 
     @property
     def dataset_name(self) -> str:
@@ -131,12 +125,12 @@ class DatasetDependency:
         namespace_name: str,
         project_name: str,
         id: int,
-        dataset_id: Optional[int],
-        dataset_version_id: Optional[int],
-        dataset_name: Optional[str],
-        dataset_version: Optional[str],
-        dataset_version_created_at: Optional[datetime],
-    ) -> Optional["DatasetDependency"]:
+        dataset_id: int | None,
+        dataset_version_id: int | None,
+        dataset_name: str | None,
+        dataset_version: str | None,
+        dataset_version_created_at: datetime | None,
+    ) -> "DatasetDependency | None":
         from datachain.lib.listing import is_listing_dataset
 
         if not dataset_id:
@@ -198,17 +192,17 @@ class DatasetVersion:
     status: int
     feature_schema: dict
     created_at: datetime
-    finished_at: Optional[datetime]
+    finished_at: datetime | None
     error_message: str
     error_stack: str
     script_output: str
-    schema: dict[str, Union[SQLType, type[SQLType]]]
-    num_objects: Optional[int]
-    size: Optional[int]
-    _preview_data: Optional[Union[str, list[dict]]]
+    schema: dict[str, SQLType | type[SQLType]]
+    num_objects: int | None
+    size: int | None
+    _preview_data: str | list[dict] | None
     sources: str = ""
     query_script: str = ""
-    job_id: Optional[str] = None
+    job_id: str | None = None
 
     @classmethod
     def parse(  # noqa: PLR0913
@@ -218,19 +212,19 @@ class DatasetVersion:
         dataset_id: int,
         version: str,
         status: int,
-        feature_schema: Optional[str],
+        feature_schema: str | None,
         created_at: datetime,
-        finished_at: Optional[datetime],
+        finished_at: datetime | None,
         error_message: str,
         error_stack: str,
         script_output: str,
-        num_objects: Optional[int],
-        size: Optional[int],
-        preview: Optional[Union[str, list[dict]]],
-        schema: dict[str, Union[SQLType, type[SQLType]]],
+        num_objects: int | None,
+        size: int | None,
+        preview: str | list[dict] | None,
+        schema: dict[str, SQLType | type[SQLType]],
         sources: str = "",
         query_script: str = "",
-        job_id: Optional[str] = None,
+        job_id: str | None = None,
     ):
         return cls(
             id,
@@ -292,7 +286,7 @@ class DatasetVersion:
     }
 
     @cached_property
-    def preview(self) -> Optional[list[dict]]:
+    def preview(self) -> list[dict] | None:
         if isinstance(self._preview_data, str):
             return json.loads(self._preview_data)
         return self._preview_data if self._preview_data else None
@@ -313,13 +307,13 @@ class DatasetListVersion:
     version: str
     status: int
     created_at: datetime
-    finished_at: Optional[datetime]
+    finished_at: datetime | None
     error_message: str
     error_stack: str
-    num_objects: Optional[int]
-    size: Optional[int]
+    num_objects: int | None
+    size: int | None
     query_script: str = ""
-    job_id: Optional[str] = None
+    job_id: str | None = None
 
     @classmethod
     def parse(
@@ -330,13 +324,13 @@ class DatasetListVersion:
         version: str,
         status: int,
         created_at: datetime,
-        finished_at: Optional[datetime],
+        finished_at: datetime | None,
         error_message: str,
         error_stack: str,
-        num_objects: Optional[int],
-        size: Optional[int],
+        num_objects: int | None,
+        size: int | None,
         query_script: str = "",
-        job_id: Optional[str] = None,
+        job_id: str | None = None,
         **kwargs,
     ):
         return cls(
@@ -368,14 +362,14 @@ class DatasetRecord:
     id: int
     name: str
     project: Project
-    description: Optional[str]
+    description: str | None
     attrs: list[str]
-    schema: dict[str, Union[SQLType, type[SQLType]]]
+    schema: dict[str, SQLType | type[SQLType]]
     feature_schema: dict
     versions: list[DatasetVersion]
     status: int = DatasetStatus.CREATED
-    created_at: Optional[datetime] = None
-    finished_at: Optional[datetime] = None
+    created_at: datetime | None = None
+    finished_at: datetime | None = None
     error_message: str = ""
     error_stack: str = ""
     script_output: str = ""
@@ -388,7 +382,7 @@ class DatasetRecord:
     @staticmethod
     def parse_schema(
         ct: dict[str, Any],
-    ) -> dict[str, Union[SQLType, type[SQLType]]]:
+    ) -> dict[str, SQLType | type[SQLType]]:
         return {
             c_name: NAME_TYPES_MAPPING[c_type["type"]].from_dict(c_type)  # type: ignore [attr-defined]
             for c_name, c_type in ct.items()
@@ -409,23 +403,23 @@ class DatasetRecord:
         namespace_id: int,
         namespace_uuid: str,
         namespace_name: str,
-        namespace_description: Optional[str],
+        namespace_description: str | None,
         namespace_created_at: datetime,
         project_id: int,
         project_uuid: str,
         project_name: str,
-        project_description: Optional[str],
+        project_description: str | None,
         project_created_at: datetime,
         project_namespace_id: int,
         dataset_id: int,
         dataset_project_id: int,
         name: str,
-        description: Optional[str],
+        description: str | None,
         attrs: str,
         status: int,
-        feature_schema: Optional[str],
+        feature_schema: str | None,
         created_at: datetime,
-        finished_at: Optional[datetime],
+        finished_at: datetime | None,
         error_message: str,
         error_stack: str,
         script_output: str,
@@ -437,19 +431,19 @@ class DatasetRecord:
         version_dataset_id: int,
         version: str,
         version_status: int,
-        version_feature_schema: Optional[str],
+        version_feature_schema: str | None,
         version_created_at: datetime,
-        version_finished_at: Optional[datetime],
+        version_finished_at: datetime | None,
         version_error_message: str,
         version_error_stack: str,
         version_script_output: str,
-        version_num_objects: Optional[int],
-        version_size: Optional[int],
-        version_preview: Optional[str],
-        version_sources: Optional[str],
-        version_query_script: Optional[str],
+        version_num_objects: int | None,
+        version_size: int | None,
+        version_preview: str | None,
+        version_sources: str | None,
+        version_query_script: str | None,
         version_schema: str,
-        version_job_id: Optional[str] = None,
+        version_job_id: str | None = None,
     ) -> "DatasetRecord":
         attrs_lst: list[str] = json.loads(attrs) if attrs else []
         schema_dct: dict[str, Any] = json.loads(schema) if schema else {}
@@ -527,7 +521,7 @@ class DatasetRecord:
     def full_name(self) -> str:
         return f"{self.project.namespace.name}.{self.project.name}.{self.name}"
 
-    def get_schema(self, version: str) -> dict[str, Union[SQLType, type[SQLType]]]:
+    def get_schema(self, version: str) -> dict[str, SQLType | type[SQLType]]:
         return self.get_version(version).schema if version else self.schema
 
     def update(self, **kwargs):
@@ -649,7 +643,7 @@ class DatasetRecord:
         """Returns latest version of a dataset"""
         return max(self.versions).version
 
-    def latest_major_version(self, major: int) -> Optional[str]:
+    def latest_major_version(self, major: int) -> str | None:
         """
         Returns latest specific major version, e.g if dataset has versions:
         - 1.4.1
@@ -664,7 +658,7 @@ class DatasetRecord:
             return None
         return max(versions).version
 
-    def latest_compatible_version(self, version_spec: str) -> Optional[str]:
+    def latest_compatible_version(self, version_spec: str) -> str | None:
         """
         Returns the latest version that matches the given version specifier.
 
@@ -711,10 +705,10 @@ class DatasetListRecord:
     id: int
     name: str
     project: Project
-    description: Optional[str]
+    description: str | None
    attrs: list[str]
     versions: list[DatasetListVersion]
-    created_at: Optional[datetime] = None
+    created_at: datetime | None = None
 
     @classmethod
     def parse(  # noqa: PLR0913
@@ -722,17 +716,17 @@ class DatasetListRecord:
         namespace_id: int,
         namespace_uuid: str,
         namespace_name: str,
-        namespace_description: Optional[str],
+        namespace_description: str | None,
         namespace_created_at: datetime,
         project_id: int,
         project_uuid: str,
         project_name: str,
-        project_description: Optional[str],
+        project_description: str | None,
         project_created_at: datetime,
         project_namespace_id: int,
         dataset_id: int,
         name: str,
-        description: Optional[str],
+        description: str | None,
         attrs: str,
         created_at: datetime,
         version_id: int,
@@ -741,13 +735,13 @@ class DatasetListRecord:
         version: str,
         version_status: int,
         version_created_at: datetime,
-        version_finished_at: Optional[datetime],
+        version_finished_at: datetime | None,
         version_error_message: str,
         version_error_stack: str,
-        version_num_objects: Optional[int],
-        version_size: Optional[int],
-        version_query_script: Optional[str],
-        version_job_id: Optional[str] = None,
+        version_num_objects: int | None,
+        version_size: int | None,
+        version_query_script: str | None,
+        version_job_id: str | None = None,
     ) -> "DatasetListRecord":
         attrs_lst: list[str] = json.loads(attrs) if attrs else []
```
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from collections.abc import Sequence
|
|
2
2
|
from copy import copy
|
|
3
3
|
from functools import wraps
|
|
4
|
-
from typing import TYPE_CHECKING,
|
|
4
|
+
from typing import TYPE_CHECKING, TypeVar
|
|
5
5
|
|
|
6
6
|
import datachain
|
|
7
7
|
from datachain.dataset import DatasetDependency, DatasetRecord
|
|
@@ -9,7 +9,10 @@ from datachain.error import DatasetNotFoundError
|
|
|
9
9
|
from datachain.project import Project
|
|
10
10
|
|
|
11
11
|
if TYPE_CHECKING:
|
|
12
|
-
from
|
|
12
|
+
from collections.abc import Callable
|
|
13
|
+
from typing import Concatenate
|
|
14
|
+
|
|
15
|
+
from typing_extensions import ParamSpec
|
|
13
16
|
|
|
14
17
|
from datachain.lib.dc import DataChain
|
|
15
18
|
|
|
@@ -55,8 +58,8 @@ def _get_delta_chain(
|
|
|
55
58
|
source_ds_project: Project,
|
|
56
59
|
source_ds_version: str,
|
|
57
60
|
source_ds_latest_version: str,
|
|
58
|
-
on:
|
|
59
|
-
compare:
|
|
61
|
+
on: str | Sequence[str],
|
|
62
|
+
compare: str | Sequence[str] | None = None,
|
|
60
63
|
) -> "DataChain":
|
|
61
64
|
"""Get delta chain for processing changes between versions."""
|
|
62
65
|
source_dc = datachain.read_dataset(
|
|
@@ -84,11 +87,11 @@ def _get_retry_chain(
|
|
|
84
87
|
source_ds_name: str,
|
|
85
88
|
source_ds_project: Project,
|
|
86
89
|
source_ds_version: str,
|
|
87
|
-
on:
|
|
88
|
-
right_on:
|
|
89
|
-
delta_retry:
|
|
90
|
+
on: str | Sequence[str],
|
|
91
|
+
right_on: str | Sequence[str] | None,
|
|
92
|
+
delta_retry: bool | str | None,
|
|
90
93
|
diff_chain: "DataChain",
|
|
91
|
-
) ->
|
|
94
|
+
) -> "DataChain | None":
|
|
92
95
|
"""Get retry chain for processing error records and missing records."""
|
|
93
96
|
# Import here to avoid circular import
|
|
94
97
|
from datachain.lib.dc import C
|
|
@@ -144,11 +147,11 @@ def _get_source_info(
|
|
|
144
147
|
latest_version: str,
|
|
145
148
|
catalog,
|
|
146
149
|
) -> tuple[
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
150
|
+
str | None,
|
|
151
|
+
Project | None,
|
|
152
|
+
str | None,
|
|
153
|
+
str | None,
|
|
154
|
+
list[DatasetDependency] | None,
|
|
152
155
|
]:
|
|
153
156
|
"""Get source dataset information and dependencies.
|
|
154
157
|
|
|
@@ -190,11 +193,11 @@ def delta_retry_update(
|
|
|
190
193
|
namespace_name: str,
|
|
191
194
|
project_name: str,
|
|
192
195
|
name: str,
|
|
193
|
-
on:
|
|
194
|
-
right_on:
|
|
195
|
-
compare:
|
|
196
|
-
delta_retry:
|
|
197
|
-
) -> tuple[
|
|
196
|
+
on: str | Sequence[str],
|
|
197
|
+
right_on: str | Sequence[str] | None = None,
|
|
198
|
+
compare: str | Sequence[str] | None = None,
|
|
199
|
+
delta_retry: bool | str | None = None,
|
|
200
|
+
) -> tuple["DataChain | None", list[DatasetDependency] | None, bool]:
|
|
198
201
|
"""
|
|
199
202
|
Creates new chain that consists of the last version of current delta dataset
|
|
200
203
|
plus diff from the source with all needed modifications.
|