datachain 0.16.5__py3-none-any.whl → 0.17.1__py3-none-any.whl
This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- datachain/catalog/catalog.py +20 -91
- datachain/cli/commands/datasets.py +1 -1
- datachain/cli/commands/show.py +1 -1
- datachain/cli/parser/__init__.py +2 -2
- datachain/cli/parser/job.py +30 -0
- datachain/data_storage/metastore.py +23 -23
- datachain/data_storage/sqlite.py +8 -7
- datachain/data_storage/warehouse.py +12 -12
- datachain/dataset.py +88 -45
- datachain/lib/dataset_info.py +2 -1
- datachain/lib/dc/datachain.py +8 -3
- datachain/lib/dc/datasets.py +28 -7
- datachain/lib/dc/storage.py +10 -2
- datachain/lib/dc/values.py +2 -0
- datachain/lib/pytorch.py +2 -2
- datachain/listing.py +1 -1
- datachain/query/dataset.py +9 -9
- datachain/query/dispatch.py +8 -6
- datachain/query/session.py +2 -2
- datachain/remote/studio.py +16 -5
- datachain/semver.py +58 -0
- datachain/studio.py +34 -3
- {datachain-0.16.5.dist-info → datachain-0.17.1.dist-info}/METADATA +2 -2
- {datachain-0.16.5.dist-info → datachain-0.17.1.dist-info}/RECORD +28 -27
- {datachain-0.16.5.dist-info → datachain-0.17.1.dist-info}/WHEEL +1 -1
- {datachain-0.16.5.dist-info → datachain-0.17.1.dist-info}/entry_points.txt +0 -0
- {datachain-0.16.5.dist-info → datachain-0.17.1.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.16.5.dist-info → datachain-0.17.1.dist-info}/top_level.txt +0 -0
datachain/catalog/catalog.py
CHANGED
@@ -33,6 +33,7 @@ from datachain.cache import Cache
 from datachain.client import Client
 from datachain.dataset import (
     DATASET_PREFIX,
+    DEFAULT_DATASET_VERSION,
     QUERY_DATASET_PREFIX,
     DatasetDependency,
     DatasetListRecord,
@@ -154,9 +155,9 @@ class DatasetRowsFetcher(NodesThreadPool):
         metastore: "AbstractMetastore",
         warehouse: "AbstractWarehouse",
         remote_ds_name: str,
-        remote_ds_version:
+        remote_ds_version: str,
         local_ds_name: str,
-        local_ds_version:
+        local_ds_version: str,
         schema: dict[str, Union[SQLType, type[SQLType]]],
         max_threads: int = PULL_DATASET_MAX_THREADS,
         progress_bar=None,
@@ -286,7 +287,7 @@ class NodeGroup:
     # (not including the bucket name or s3:// prefix)
     source_path: str = ""
     dataset_name: Optional[str] = None
-    dataset_version: Optional[
+    dataset_version: Optional[str] = None
     instantiated_nodes: Optional[list[NodeWithPath]] = None

     @property
@@ -607,7 +608,7 @@ class Catalog:
         return lst, client, list_path

     def _remove_dataset_rows_and_warehouse_info(
-        self, dataset: DatasetRecord, version:
+        self, dataset: DatasetRecord, version: str, **kwargs
     ):
         self.warehouse.drop_dataset_rows_table(dataset, version)
         self.update_dataset_version_with_warehouse_info(
@@ -767,7 +768,7 @@ class Catalog:
     def create_dataset(
         self,
         name: str,
-        version: Optional[
+        version: Optional[str] = None,
         *,
         columns: Sequence[Column],
         feature_schema: Optional[dict] = None,
@@ -783,18 +784,17 @@
         Creates new dataset of a specific version.
         If dataset is not yet created, it will create it with version 1
         If version is None, then next unused version is created.
-        If version is given, then it must be an unused version
+        If version is given, then it must be an unused version.
         """
         assert [c.name for c in columns if c.name != "sys__id"], f"got {columns=}"
         if not listing and Client.is_data_source_uri(name):
             raise RuntimeError(
                 "Cannot create dataset that starts with source prefix, e.g s3://"
             )
-        default_version =
+        default_version = DEFAULT_DATASET_VERSION
         try:
             dataset = self.get_dataset(name)
-            default_version = dataset.
-
+            default_version = dataset.next_version_patch
             if (description or attrs) and (
                 dataset.description != description or dataset.attrs != attrs
             ):
@@ -846,7 +846,7 @@ class Catalog:
     def create_new_dataset_version(
         self,
         dataset: DatasetRecord,
-        version:
+        version: str,
         *,
         columns: Sequence[Column],
         sources="",
@@ -892,7 +892,7 @@ class Catalog:
         return dataset

     def update_dataset_version_with_warehouse_info(
-        self, dataset: DatasetRecord, version:
+        self, dataset: DatasetRecord, version: str, rows_dropped=False, **kwargs
     ) -> None:
         from datachain.query.dataset import DatasetQuery

@@ -959,7 +959,7 @@ class Catalog:
         return dataset

     def remove_dataset_version(
-        self, dataset: DatasetRecord, version:
+        self, dataset: DatasetRecord, version: str, drop_rows: Optional[bool] = True
     ) -> None:
         """
         Deletes one single dataset version.
@@ -1037,82 +1037,11 @@ class Catalog:

         return self.get_dataset(name)

-    def register_dataset(
-        self,
-        dataset: DatasetRecord,
-        version: int,
-        target_dataset: DatasetRecord,
-        target_version: Optional[int] = None,
-    ) -> DatasetRecord:
-        """
-        Registers dataset version of one dataset as dataset version of another
-        one (it can be new version of existing one).
-        It also removes original dataset version
-        """
-        target_version = target_version or target_dataset.next_version
-
-        if not target_dataset.is_valid_next_version(target_version):
-            raise DatasetInvalidVersionError(
-                f"Version {target_version} must be higher than the current latest one"
-            )
-
-        dataset_version = dataset.get_version(version)
-        if not dataset_version:
-            raise DatasetVersionNotFoundError(
-                f"Dataset {dataset.name} does not have version {version}"
-            )
-
-        if not dataset_version.is_final_status():
-            raise ValueError("Cannot register dataset version in non final status")
-
-        # copy dataset version
-        target_dataset = self.metastore.create_dataset_version(
-            target_dataset,
-            target_version,
-            sources=dataset_version.sources,
-            status=dataset_version.status,
-            query_script=dataset_version.query_script,
-            error_message=dataset_version.error_message,
-            error_stack=dataset_version.error_stack,
-            script_output=dataset_version.script_output,
-            created_at=dataset_version.created_at,
-            finished_at=dataset_version.finished_at,
-            schema=dataset_version.serialized_schema,
-            num_objects=dataset_version.num_objects,
-            size=dataset_version.size,
-            preview=dataset_version.preview,
-            job_id=dataset_version.job_id,
-        )
-
-        # to avoid re-creating rows table, we are just renaming it for a new version
-        # of target dataset
-        self.warehouse.rename_dataset_table(
-            dataset.name,
-            target_dataset.name,
-            old_version=version,
-            new_version=target_version,
-        )
-        self.metastore.update_dataset_dependency_source(
-            dataset,
-            version,
-            new_source_dataset=target_dataset,
-            new_source_dataset_version=target_version,
-        )
-
-        if dataset.id == target_dataset.id:
-            # we are updating the same dataset so we need to refresh it to have newly
-            # added version in step before
-            dataset = self.get_dataset(dataset.name)
-
-        self.remove_dataset_version(dataset, version, drop_rows=False)
-
-        return self.get_dataset(target_dataset.name)
-
     def get_dataset(self, name: str) -> DatasetRecord:
         return self.metastore.get_dataset(name)

     def get_dataset_with_remote_fallback(
-        self, name: str, version: Optional[
+        self, name: str, version: Optional[str] = None
     ) -> DatasetRecord:
         try:
             ds = self.get_dataset(name)
@@ -1157,7 +1086,7 @@ class Catalog:
         return DatasetRecord.from_dict(dataset_info)

     def get_dataset_dependencies(
-        self, name: str, version:
+        self, name: str, version: str, indirect=False
     ) -> list[Optional[DatasetDependency]]:
         dataset = self.get_dataset(name)

@@ -1175,7 +1104,7 @@ class Catalog:
             if d.is_dataset:
                 # only datasets can have dependencies
                 d.dependencies = self.get_dataset_dependencies(
-                    d.name,
+                    d.name, d.version, indirect=indirect
                 )

         return direct_dependencies
@@ -1244,7 +1173,7 @@ class Catalog:
         ]

     def ls_dataset_rows(
-        self, name: str, version:
+        self, name: str, version: str, offset=None, limit=None
     ) -> list[dict]:
         from datachain.query.dataset import DatasetQuery

@@ -1282,7 +1211,7 @@ class Catalog:
         self,
         bucket_uri: str,
         name: str,
-        version:
+        version: str,
         client_config=None,
     ) -> list[str]:
         dataset = self.get_dataset(name)
@@ -1291,14 +1220,14 @@ class Catalog:
             bucket_uri, dataset, version, client_config
         )

-    def dataset_table_export_file_names(self, name: str, version:
+    def dataset_table_export_file_names(self, name: str, version: str) -> list[str]:
         dataset = self.get_dataset(name)
         return self.warehouse.dataset_table_export_file_names(dataset, version)

     def remove_dataset(
         self,
         name: str,
-        version: Optional[
+        version: Optional[str] = None,
         force: Optional[bool] = False,
         studio: Optional[bool] = False,
     ):
@@ -1372,7 +1301,7 @@ class Catalog:
         remote_ds_uri: str,
         output: Optional[str] = None,
         local_ds_name: Optional[str] = None,
-        local_ds_version: Optional[
+        local_ds_version: Optional[str] = None,
         cp: bool = False,
         force: bool = False,
         *,
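The recurring change in catalog.py above is that dataset versions are now semver strings ("MAJOR.MINOR.PATCH") rather than integers: the default version is DEFAULT_DATASET_VERSION and the next default version of an existing dataset comes from dataset.next_version_patch. The diff does not show how the new datachain/semver.py module or next_version_patch are implemented, so the following is only a minimal sketch of the apparent behavior, with illustrative helper names:

# Sketch only: assumes versions are "MAJOR.MINOR.PATCH" strings with a "1.0.0"
# default, as the hunks above suggest; parse() and next_version_patch() here are
# illustrative, not datachain's actual implementation.
DEFAULT_DATASET_VERSION = "1.0.0"

def parse(version: str) -> tuple[int, int, int]:
    # Split a semver string into integer (major, minor, patch) components.
    major, minor, patch = (int(part) for part in version.split("."))
    return major, minor, patch

def next_version_patch(latest: str) -> str:
    # Bump the patch component: "1.2.3" -> "1.2.4".
    major, minor, patch = parse(latest)
    return f"{major}.{minor}.{patch + 1}"

print(next_version_patch(DEFAULT_DATASET_VERSION))  # 1.0.1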
datachain/cli/commands/datasets.py
CHANGED
@@ -127,7 +127,7 @@ def _datasets_tabulate_row(name, both, local_version, studio_version):
 def rm_dataset(
     catalog: "Catalog",
     name: str,
-    version: Optional[
+    version: Optional[str] = None,
     force: Optional[bool] = False,
     studio: bool = False,
     local: bool = False,
datachain/cli/commands/show.py
CHANGED
datachain/cli/parser/__init__.py
CHANGED
@@ -302,7 +302,7 @@ def get_parser() -> ArgumentParser: # noqa: PLR0915
         "--version",
         action="store",
         default=None,
-        type=
+        type=str,
         help="Dataset version",
     )
     rm_dataset_parser.add_argument(
@@ -495,7 +495,7 @@ def get_parser() -> ArgumentParser: # noqa: PLR0915
         "--version",
         action="store",
         default=None,
-        type=
+        type=str,
         help="Dataset version",
     )
     show_parser.add_argument("--schema", action="store_true", help="Show schema")
datachain/cli/parser/job.py
CHANGED
@@ -83,6 +83,36 @@ def add_jobs_parser(subparsers, parent_parser) -> None:
         help="Python package requirements",
     )

+    studio_ls_help = "List jobs in Studio"
+    studio_ls_description = "List jobs in Studio."
+
+    studio_ls_parser = jobs_subparser.add_parser(
+        "ls",
+        parents=[parent_parser],
+        description=studio_ls_description,
+        help=studio_ls_help,
+        formatter_class=CustomHelpFormatter,
+    )
+
+    studio_ls_parser.add_argument(
+        "--status",
+        action="store",
+        help="Status to filter jobs by",
+    )
+
+    studio_ls_parser.add_argument(
+        "--team",
+        action="store",
+        default=None,
+        help="Team to list jobs for (default: from config)",
+    )
+    studio_ls_parser.add_argument(
+        "--limit",
+        type=int,
+        default=20,
+        help="Limit the number of jobs returned (default: 20)",
+    )
+
     studio_cancel_help = "Cancel a job in Studio"
     studio_cancel_description = "Cancel a running job in Studio."

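The new "ls" subcommand above adds three flags (--status, --team, --limit). As a standalone illustration of how they parse (this is not datachain's actual parser wiring, and the "datachain job ls" prog name is an assumption, not taken from this diff):

import argparse

# Standalone sketch mirroring the flags added in the hunk above.
parser = argparse.ArgumentParser(prog="datachain job ls")  # prog name is assumed
parser.add_argument("--status", action="store", help="Status to filter jobs by")
parser.add_argument(
    "--team", action="store", default=None,
    help="Team to list jobs for (default: from config)",
)
parser.add_argument(
    "--limit", type=int, default=20,
    help="Limit the number of jobs returned (default: 20)",
)

args = parser.parse_args(["--status", "running", "--limit", "5"])
print(args.status, args.team, args.limit)  # running None 5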
datachain/data_storage/metastore.py
CHANGED
@@ -128,7 +128,7 @@ class AbstractMetastore(ABC, Serializable):
     def create_dataset_version( # noqa: PLR0913
         self,
         dataset: DatasetRecord,
-        version:
+        version: str,
         status: int,
         sources: str = "",
         feature_schema: Optional[dict] = None,
@@ -158,13 +158,13 @@ class AbstractMetastore(ABC, Serializable):

     @abstractmethod
     def update_dataset_version(
-        self, dataset: DatasetRecord, version:
+        self, dataset: DatasetRecord, version: str, **kwargs
     ) -> DatasetVersion:
         """Updates dataset version fields."""

     @abstractmethod
     def remove_dataset_version(
-        self, dataset: DatasetRecord, version:
+        self, dataset: DatasetRecord, version: str
     ) -> DatasetRecord:
         """
         Deletes one single dataset version.
@@ -188,7 +188,7 @@ class AbstractMetastore(ABC, Serializable):
         self,
         dataset: DatasetRecord,
         status: int,
-        version: Optional[
+        version: Optional[str] = None,
         error_message="",
         error_stack="",
         script_output="",
@@ -202,9 +202,9 @@ class AbstractMetastore(ABC, Serializable):
     def add_dataset_dependency(
         self,
         source_dataset_name: str,
-        source_dataset_version:
+        source_dataset_version: str,
         dataset_name: str,
-        dataset_version:
+        dataset_version: str,
     ) -> None:
         """Adds dataset dependency to dataset."""

@@ -212,21 +212,21 @@ class AbstractMetastore(ABC, Serializable):
     def update_dataset_dependency_source(
         self,
         source_dataset: DatasetRecord,
-        source_dataset_version:
+        source_dataset_version: str,
         new_source_dataset: Optional[DatasetRecord] = None,
-        new_source_dataset_version: Optional[
+        new_source_dataset_version: Optional[str] = None,
     ) -> None:
         """Updates dataset dependency source."""

     @abstractmethod
     def get_direct_dataset_dependencies(
-        self, dataset: DatasetRecord, version:
+        self, dataset: DatasetRecord, version: str
     ) -> list[Optional[DatasetDependency]]:
         """Gets direct dataset dependencies."""

     @abstractmethod
     def remove_dataset_dependencies(
-        self, dataset: DatasetRecord, version: Optional[
+        self, dataset: DatasetRecord, version: Optional[str] = None
     ) -> None:
         """
         When we remove dataset, we need to clean up it's dependencies as well.
@@ -234,7 +234,7 @@ class AbstractMetastore(ABC, Serializable):

     @abstractmethod
     def remove_dataset_dependants(
-        self, dataset: DatasetRecord, version: Optional[
+        self, dataset: DatasetRecord, version: Optional[str] = None
     ) -> None:
         """
         When we remove dataset, we need to clear its references in other dataset
@@ -370,7 +370,7 @@ class AbstractDBMetastore(AbstractMetastore):
                 ForeignKey(f"{cls.DATASET_TABLE}.id", ondelete="CASCADE"),
                 nullable=False,
             ),
-            Column("version",
+            Column("version", Text, nullable=False, default="1.0.0"),
             Column(
                 "status",
                 Integer,
@@ -554,7 +554,7 @@ class AbstractDBMetastore(AbstractMetastore):
     def create_dataset_version( # noqa: PLR0913
         self,
         dataset: DatasetRecord,
-        version:
+        version: str,
         status: int,
         sources: str = "",
         feature_schema: Optional[dict] = None,
@@ -648,7 +648,7 @@ class AbstractDBMetastore(AbstractMetastore):
         return result_ds

     def update_dataset_version(
-        self, dataset: DatasetRecord, version:
+        self, dataset: DatasetRecord, version: str, conn=None, **kwargs
     ) -> DatasetVersion:
         """Updates dataset fields."""
         dataset_version = dataset.get_version(version)
@@ -758,7 +758,7 @@ class AbstractDBMetastore(AbstractMetastore):
         return ds

     def remove_dataset_version(
-        self, dataset: DatasetRecord, version:
+        self, dataset: DatasetRecord, version: str
     ) -> DatasetRecord:
         """
         Deletes one single dataset version.
@@ -791,7 +791,7 @@ class AbstractDBMetastore(AbstractMetastore):
         self,
         dataset: DatasetRecord,
         status: int,
-        version: Optional[
+        version: Optional[str] = None,
         error_message="",
         error_stack="",
         script_output="",
@@ -825,9 +825,9 @@ class AbstractDBMetastore(AbstractMetastore):
     def add_dataset_dependency(
         self,
         source_dataset_name: str,
-        source_dataset_version:
+        source_dataset_version: str,
         dataset_name: str,
-        dataset_version:
+        dataset_version: str,
     ) -> None:
         """Adds dataset dependency to dataset."""
         source_dataset = self.get_dataset(source_dataset_name)
@@ -847,9 +847,9 @@ class AbstractDBMetastore(AbstractMetastore):
     def update_dataset_dependency_source(
         self,
         source_dataset: DatasetRecord,
-        source_dataset_version:
+        source_dataset_version: str,
         new_source_dataset: Optional[DatasetRecord] = None,
-        new_source_dataset_version: Optional[
+        new_source_dataset_version: Optional[str] = None,
     ) -> None:
         dd = self._datasets_dependencies

@@ -880,7 +880,7 @@ class AbstractDBMetastore(AbstractMetastore):
         """

     def get_direct_dataset_dependencies(
-        self, dataset: DatasetRecord, version:
+        self, dataset: DatasetRecord, version: str
     ) -> list[Optional[DatasetDependency]]:
         d = self._datasets
         dd = self._datasets_dependencies
@@ -909,7 +909,7 @@ class AbstractDBMetastore(AbstractMetastore):
         return [self.dependency_class.parse(*r) for r in self.db.execute(query)]

     def remove_dataset_dependencies(
-        self, dataset: DatasetRecord, version: Optional[
+        self, dataset: DatasetRecord, version: Optional[str] = None
     ) -> None:
         """
         When we remove dataset, we need to clean up it's dependencies as well
@@ -928,7 +928,7 @@ class AbstractDBMetastore(AbstractMetastore):
         self.db.execute(q)

     def remove_dataset_dependants(
-        self, dataset: DatasetRecord, version: Optional[
+        self, dataset: DatasetRecord, version: Optional[str] = None
     ) -> None:
         """
         When we remove dataset, we need to clear its references in other dataset
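With the metastore change above, the dataset version column becomes Text with a "1.0.0" default instead of a numeric column, so version ordering can no longer rely on numeric SQL comparison and has to happen in Python (see the semver.compare usage in the sqlite.py section below). A minimal SQLAlchemy sketch of such a column, with an assumed table name:

from sqlalchemy import Column, Integer, MetaData, Table, Text

metadata = MetaData()

# "dataset_versions" is an assumed table name for illustration; only the
# "version" column mirrors the definition in the hunk above.
dataset_versions = Table(
    "dataset_versions",
    metadata,
    Column("id", Integer, primary_key=True),
    Column("version", Text, nullable=False, default="1.0.0"),
)

print(dataset_versions.c.version.type)  # TEXT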
datachain/data_storage/sqlite.py
CHANGED
@@ -25,6 +25,7 @@ from sqlalchemy.sql.selectable import Select
 from tqdm.auto import tqdm

 import datachain.sql.sqlite
+from datachain import semver
 from datachain.data_storage import AbstractDBMetastore, AbstractWarehouse
 from datachain.data_storage.db_engine import DatabaseEngine
 from datachain.data_storage.schema import DefaultSchema
@@ -486,7 +487,7 @@ class SQLiteWarehouse(AbstractWarehouse):
         return table

     def get_dataset_sources(
-        self, dataset: DatasetRecord, version:
+        self, dataset: DatasetRecord, version: str
     ) -> list[StorageURI]:
         dr = self.dataset_rows(dataset, version)
         query = dr.select(dr.c("source", column="file")).distinct()
@@ -502,8 +503,8 @@ class SQLiteWarehouse(AbstractWarehouse):
         self,
         src: DatasetRecord,
         dst: DatasetRecord,
-        src_version:
-        dst_version:
+        src_version: str,
+        dst_version: str,
     ) -> None:
         dst_empty = False

@@ -534,7 +535,7 @@ class SQLiteWarehouse(AbstractWarehouse):
         dst_previous_versions = [
             v.version
             for v in dst.versions  # type: ignore [union-attr]
-            if v.version
+            if semver.compare(v.version, dst_version) == -1
         ]
         if dst_previous_versions:
             dst_version_latest = max(dst_previous_versions)
@@ -570,7 +571,7 @@ class SQLiteWarehouse(AbstractWarehouse):
             conn=conn,
         )

-    def insert_dataset_rows(self, df, dataset: DatasetRecord, version:
+    def insert_dataset_rows(self, df, dataset: DatasetRecord, version: str) -> int:
         dr = self.dataset_rows(dataset, version)
         return self.db.insert_dataframe(dr.table.name, df)

@@ -595,7 +596,7 @@ class SQLiteWarehouse(AbstractWarehouse):
         return col_type.python_type

     def dataset_table_export_file_names(
-        self, dataset: DatasetRecord, version:
+        self, dataset: DatasetRecord, version: str
     ) -> list[str]:
         raise NotImplementedError("Exporting dataset table not implemented for SQLite")

@@ -603,7 +604,7 @@ class SQLiteWarehouse(AbstractWarehouse):
         self,
         bucket_uri: str,
         dataset: DatasetRecord,
-        version:
+        version: str,
         client_config=None,
     ) -> list[str]:
         raise NotImplementedError("Exporting dataset table not implemented for SQLite")
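The hunk above keeps only destination versions for which semver.compare(v.version, dst_version) == -1, i.e. strictly older ones. The compare helper itself lives in the new datachain/semver.py module and is not shown in this diff; a minimal sketch with the semantics that call relies on (return -1/0/1, numeric rather than lexicographic ordering) could look like this:

# Hypothetical sketch of a semver compare(); not datachain's actual implementation.
def compare(a: str, b: str) -> int:
    # Return -1, 0 or 1 so that compare(a, b) == -1 means a is older than b.
    left = tuple(int(part) for part in a.split("."))
    right = tuple(int(part) for part in b.split("."))
    return (left > right) - (left < right)

print(compare("1.0.9", "1.0.10"))  # -1 (numeric, not lexicographic, ordering)
print(compare("2.0.0", "2.0.0"))   # 0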
datachain/data_storage/warehouse.py
CHANGED
@@ -176,7 +176,7 @@ class AbstractWarehouse(ABC, Serializable):
     def dataset_rows(
         self,
         dataset: DatasetRecord,
-        version: Optional[
+        version: Optional[str] = None,
         column: str = "file",
     ):
         version = version or dataset.latest_version
@@ -253,7 +253,7 @@ class AbstractWarehouse(ABC, Serializable):
         name = parsed.path if parsed.scheme == "file" else parsed.netloc
         return parsed.scheme, name

-    def dataset_table_name(self, dataset_name: str, version:
+    def dataset_table_name(self, dataset_name: str, version: str) -> str:
         prefix = self.DATASET_TABLE_PREFIX
         if Client.is_data_source_uri(dataset_name):
             # for datasets that are created for bucket listing we use different prefix
@@ -282,7 +282,7 @@ class AbstractWarehouse(ABC, Serializable):
     def drop_dataset_rows_table(
         self,
         dataset: DatasetRecord,
-        version:
+        version: str,
         if_exists: bool = True,
     ) -> None:
         """Drops a dataset rows table for the given dataset name."""
@@ -295,8 +295,8 @@ class AbstractWarehouse(ABC, Serializable):
         self,
         src: "DatasetRecord",
         dst: "DatasetRecord",
-        src_version:
-        dst_version:
+        src_version: str,
+        dst_version: str,
     ) -> None:
         """
         Merges source dataset rows and current latest destination dataset rows
@@ -338,15 +338,15 @@ class AbstractWarehouse(ABC, Serializable):

     @abstractmethod
     def get_dataset_sources(
-        self, dataset: DatasetRecord, version:
+        self, dataset: DatasetRecord, version: str
     ) -> list[StorageURI]: ...

     def rename_dataset_table(
         self,
         old_name: str,
         new_name: str,
-        old_version:
-        new_version:
+        old_version: str,
+        new_version: str,
     ) -> None:
         old_ds_table_name = self.dataset_table_name(old_name, old_version)
         new_ds_table_name = self.dataset_table_name(new_name, new_version)
@@ -362,7 +362,7 @@ class AbstractWarehouse(ABC, Serializable):
         return res[0]

     def dataset_stats(
-        self, dataset: DatasetRecord, version:
+        self, dataset: DatasetRecord, version: str
     ) -> tuple[Optional[int], Optional[int]]:
         """
         Returns tuple with dataset stats: total number of rows and total dataset size.
@@ -399,7 +399,7 @@ class AbstractWarehouse(ABC, Serializable):
         """

     @abstractmethod
-    def insert_dataset_rows(self, df, dataset: DatasetRecord, version:
+    def insert_dataset_rows(self, df, dataset: DatasetRecord, version: str) -> int:
         """Inserts dataset rows directly into dataset table"""

     @abstractmethod
@@ -418,7 +418,7 @@ class AbstractWarehouse(ABC, Serializable):

     @abstractmethod
     def dataset_table_export_file_names(
-        self, dataset: DatasetRecord, version:
+        self, dataset: DatasetRecord, version: str
     ) -> list[str]:
         """
         Returns list of file names that will be created when user runs dataset export
@@ -429,7 +429,7 @@ class AbstractWarehouse(ABC, Serializable):
         self,
         bucket_uri: str,
         dataset: DatasetRecord,
-        version:
+        version: str,
         client_config=None,
     ) -> list[str]:
         """