datachain 0.16.4__py3-none-any.whl → 0.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic.
- datachain/catalog/catalog.py +25 -92
- datachain/cli/__init__.py +11 -9
- datachain/cli/commands/datasets.py +1 -1
- datachain/cli/commands/query.py +1 -0
- datachain/cli/commands/show.py +1 -1
- datachain/cli/parser/__init__.py +11 -3
- datachain/data_storage/job.py +1 -0
- datachain/data_storage/metastore.py +105 -94
- datachain/data_storage/sqlite.py +8 -7
- datachain/data_storage/warehouse.py +58 -46
- datachain/dataset.py +88 -45
- datachain/lib/arrow.py +23 -1
- datachain/lib/dataset_info.py +2 -1
- datachain/lib/dc/csv.py +1 -0
- datachain/lib/dc/datachain.py +38 -16
- datachain/lib/dc/datasets.py +28 -7
- datachain/lib/dc/storage.py +10 -2
- datachain/lib/listing.py +2 -0
- datachain/lib/pytorch.py +2 -2
- datachain/lib/udf.py +17 -5
- datachain/listing.py +1 -1
- datachain/query/batch.py +40 -39
- datachain/query/dataset.py +42 -41
- datachain/query/dispatch.py +137 -75
- datachain/query/metrics.py +1 -2
- datachain/query/queue.py +1 -11
- datachain/query/session.py +2 -2
- datachain/query/udf.py +1 -1
- datachain/query/utils.py +8 -14
- datachain/remote/studio.py +4 -4
- datachain/semver.py +58 -0
- datachain/studio.py +1 -1
- datachain/utils.py +3 -0
- {datachain-0.16.4.dist-info → datachain-0.17.0.dist-info}/METADATA +1 -1
- {datachain-0.16.4.dist-info → datachain-0.17.0.dist-info}/RECORD +39 -38
- {datachain-0.16.4.dist-info → datachain-0.17.0.dist-info}/WHEEL +1 -1
- {datachain-0.16.4.dist-info → datachain-0.17.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.16.4.dist-info → datachain-0.17.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.16.4.dist-info → datachain-0.17.0.dist-info}/top_level.txt +0 -0
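The changelist adds a new datachain/semver.py module and, as the per-file diffs below show, switches dataset versions from integers to semver strings across the catalog, CLI, and query layers. The following is a minimal sketch of the kind of helpers such a semver module could expose; the function names and behavior are illustrative assumptions, not datachain's actual API.

# Illustrative only: a minimal semver helper of the kind a module like
# datachain/semver.py might provide. Names and behavior are assumptions,
# not datachain's actual API.
import re

SEMVER_RE = re.compile(r"^(\d+)\.(\d+)\.(\d+)$")

def parse(version: str) -> tuple[int, int, int]:
    """Split a "MAJOR.MINOR.PATCH" string into integer components."""
    m = SEMVER_RE.match(version)
    if not m:
        raise ValueError(f"Invalid semver string: {version!r}")
    major, minor, patch = (int(part) for part in m.groups())
    return major, minor, patch

def create(major: int = 0, minor: int = 0, patch: int = 0) -> str:
    """Build a "MAJOR.MINOR.PATCH" string from integer components."""
    return f"{major}.{minor}.{patch}"

def bump_patch(version: str) -> str:
    """Return the next patch release, e.g. "1.2.3" -> "1.2.4"."""
    major, minor, patch = parse(version)
    return create(major, minor, patch + 1)

print(bump_patch("1.2.3"))  # 1.2.4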
datachain/catalog/catalog.py
CHANGED
@@ -33,6 +33,7 @@ from datachain.cache import Cache
 from datachain.client import Client
 from datachain.dataset import (
     DATASET_PREFIX,
+    DEFAULT_DATASET_VERSION,
     QUERY_DATASET_PREFIX,
     DatasetDependency,
     DatasetListRecord,
@@ -79,6 +80,7 @@ DATASET_INTERNAL_ERROR_MESSAGE = "Internal error on creating dataset"
 QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE = 10
 # exit code we use if query script was canceled
 QUERY_SCRIPT_CANCELED_EXIT_CODE = 11
+QUERY_SCRIPT_SIGTERM_EXIT_CODE = -15  # if query script was terminated by SIGTERM

 # dataset pull
 PULL_DATASET_MAX_THREADS = 5
@@ -153,9 +155,9 @@ class DatasetRowsFetcher(NodesThreadPool):
         metastore: "AbstractMetastore",
         warehouse: "AbstractWarehouse",
         remote_ds_name: str,
-        remote_ds_version: int,
+        remote_ds_version: str,
         local_ds_name: str,
-        local_ds_version: int,
+        local_ds_version: str,
         schema: dict[str, Union[SQLType, type[SQLType]]],
         max_threads: int = PULL_DATASET_MAX_THREADS,
         progress_bar=None,
@@ -285,7 +287,7 @@ class NodeGroup:
     # (not including the bucket name or s3:// prefix)
     source_path: str = ""
     dataset_name: Optional[str] = None
-    dataset_version: Optional[int] = None
+    dataset_version: Optional[str] = None
    instantiated_nodes: Optional[list[NodeWithPath]] = None

     @property
@@ -606,7 +608,7 @@ class Catalog:
         return lst, client, list_path

     def _remove_dataset_rows_and_warehouse_info(
-        self, dataset: DatasetRecord, version: int, **kwargs
+        self, dataset: DatasetRecord, version: str, **kwargs
     ):
         self.warehouse.drop_dataset_rows_table(dataset, version)
         self.update_dataset_version_with_warehouse_info(
@@ -766,7 +768,7 @@ class Catalog:
     def create_dataset(
         self,
         name: str,
-        version: Optional[int] = None,
+        version: Optional[str] = None,
         *,
         columns: Sequence[Column],
         feature_schema: Optional[dict] = None,
@@ -782,18 +784,17 @@ class Catalog:
         Creates new dataset of a specific version.
         If dataset is not yet created, it will create it with version 1
         If version is None, then next unused version is created.
-        If version is given, then it must be an unused version
+        If version is given, then it must be an unused version.
         """
         assert [c.name for c in columns if c.name != "sys__id"], f"got {columns=}"
         if not listing and Client.is_data_source_uri(name):
             raise RuntimeError(
                 "Cannot create dataset that starts with source prefix, e.g s3://"
             )
-        default_version = 1
+        default_version = DEFAULT_DATASET_VERSION
         try:
             dataset = self.get_dataset(name)
-            default_version = dataset.next_version
-
+            default_version = dataset.next_version_patch
             if (description or attrs) and (
                 dataset.description != description or dataset.attrs != attrs
             ):
@@ -845,7 +846,7 @@ class Catalog:
     def create_new_dataset_version(
         self,
         dataset: DatasetRecord,
-        version: int,
+        version: str,
         *,
         columns: Sequence[Column],
         sources="",
@@ -891,7 +892,7 @@ class Catalog:
         return dataset

     def update_dataset_version_with_warehouse_info(
-        self, dataset: DatasetRecord, version: int, rows_dropped=False, **kwargs
+        self, dataset: DatasetRecord, version: str, rows_dropped=False, **kwargs
     ) -> None:
         from datachain.query.dataset import DatasetQuery

@@ -958,7 +959,7 @@ class Catalog:
         return dataset

     def remove_dataset_version(
-        self, dataset: DatasetRecord, version: int, drop_rows: Optional[bool] = True
+        self, dataset: DatasetRecord, version: str, drop_rows: Optional[bool] = True
     ) -> None:
         """
         Deletes one single dataset version.
@@ -1036,82 +1037,11 @@ class Catalog:

         return self.get_dataset(name)

-    def register_dataset(
-        self,
-        dataset: DatasetRecord,
-        version: int,
-        target_dataset: DatasetRecord,
-        target_version: Optional[int] = None,
-    ) -> DatasetRecord:
-        """
-        Registers dataset version of one dataset as dataset version of another
-        one (it can be new version of existing one).
-        It also removes original dataset version
-        """
-        target_version = target_version or target_dataset.next_version
-
-        if not target_dataset.is_valid_next_version(target_version):
-            raise DatasetInvalidVersionError(
-                f"Version {target_version} must be higher than the current latest one"
-            )
-
-        dataset_version = dataset.get_version(version)
-        if not dataset_version:
-            raise DatasetVersionNotFoundError(
-                f"Dataset {dataset.name} does not have version {version}"
-            )
-
-        if not dataset_version.is_final_status():
-            raise ValueError("Cannot register dataset version in non final status")
-
-        # copy dataset version
-        target_dataset = self.metastore.create_dataset_version(
-            target_dataset,
-            target_version,
-            sources=dataset_version.sources,
-            status=dataset_version.status,
-            query_script=dataset_version.query_script,
-            error_message=dataset_version.error_message,
-            error_stack=dataset_version.error_stack,
-            script_output=dataset_version.script_output,
-            created_at=dataset_version.created_at,
-            finished_at=dataset_version.finished_at,
-            schema=dataset_version.serialized_schema,
-            num_objects=dataset_version.num_objects,
-            size=dataset_version.size,
-            preview=dataset_version.preview,
-            job_id=dataset_version.job_id,
-        )
-
-        # to avoid re-creating rows table, we are just renaming it for a new version
-        # of target dataset
-        self.warehouse.rename_dataset_table(
-            dataset.name,
-            target_dataset.name,
-            old_version=version,
-            new_version=target_version,
-        )
-        self.metastore.update_dataset_dependency_source(
-            dataset,
-            version,
-            new_source_dataset=target_dataset,
-            new_source_dataset_version=target_version,
-        )
-
-        if dataset.id == target_dataset.id:
-            # we are updating the same dataset so we need to refresh it to have newly
-            # added version in step before
-            dataset = self.get_dataset(dataset.name)
-
-        self.remove_dataset_version(dataset, version, drop_rows=False)
-
-        return self.get_dataset(target_dataset.name)
-
     def get_dataset(self, name: str) -> DatasetRecord:
         return self.metastore.get_dataset(name)

     def get_dataset_with_remote_fallback(
-        self, name: str, version: Optional[int] = None
+        self, name: str, version: Optional[str] = None
     ) -> DatasetRecord:
         try:
             ds = self.get_dataset(name)
@@ -1156,7 +1086,7 @@ class Catalog:
         return DatasetRecord.from_dict(dataset_info)

     def get_dataset_dependencies(
-        self, name: str, version: int, indirect=False
+        self, name: str, version: str, indirect=False
     ) -> list[Optional[DatasetDependency]]:
         dataset = self.get_dataset(name)

@@ -1174,7 +1104,7 @@ class Catalog:
             if d.is_dataset:
                 # only datasets can have dependencies
                 d.dependencies = self.get_dataset_dependencies(
-                    d.name,
+                    d.name, d.version, indirect=indirect
                 )

         return direct_dependencies
@@ -1243,7 +1173,7 @@ class Catalog:
         ]

     def ls_dataset_rows(
-        self, name: str, version: int, offset=None, limit=None
+        self, name: str, version: str, offset=None, limit=None
     ) -> list[dict]:
         from datachain.query.dataset import DatasetQuery

@@ -1281,7 +1211,7 @@ class Catalog:
         self,
         bucket_uri: str,
         name: str,
-        version: int,
+        version: str,
         client_config=None,
     ) -> list[str]:
         dataset = self.get_dataset(name)
@@ -1290,14 +1220,14 @@ class Catalog:
             bucket_uri, dataset, version, client_config
         )

-    def dataset_table_export_file_names(self, name: str, version: int) -> list[str]:
+    def dataset_table_export_file_names(self, name: str, version: str) -> list[str]:
         dataset = self.get_dataset(name)
         return self.warehouse.dataset_table_export_file_names(dataset, version)

     def remove_dataset(
         self,
         name: str,
-        version: Optional[int] = None,
+        version: Optional[str] = None,
         force: Optional[bool] = False,
         studio: Optional[bool] = False,
     ):
@@ -1371,7 +1301,7 @@ class Catalog:
         remote_ds_uri: str,
         output: Optional[str] = None,
         local_ds_name: Optional[str] = None,
-        local_ds_version: Optional[int] = None,
+        local_ds_version: Optional[str] = None,
         cp: bool = False,
         force: bool = False,
         *,
@@ -1645,7 +1575,10 @@ class Catalog:
         thread.join()  # wait for the reader thread

         logger.info("Process %s exited with return code %s", proc.pid, proc.returncode)
-        if proc.returncode == QUERY_SCRIPT_CANCELED_EXIT_CODE:
+        if proc.returncode in (
+            QUERY_SCRIPT_CANCELED_EXIT_CODE,
+            QUERY_SCRIPT_SIGTERM_EXIT_CODE,
+        ):
             raise QueryScriptCancelError(
                 "Query script was canceled by user",
                 return_code=proc.returncode,
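The new QUERY_SCRIPT_SIGTERM_EXIT_CODE = -15 and the widened return-code check rely on standard POSIX behavior in Python's subprocess module: a child killed by a signal reports a negative return code equal to minus the signal number, so SIGTERM surfaces as -15. A minimal standalone check (not datachain code) that illustrates this:

# Standalone illustration (not datachain code): on POSIX, subprocess reports a
# child killed by a signal as a negative return code, so SIGTERM shows up as -15.
import signal
import subprocess
import time

proc = subprocess.Popen(["sleep", "60"])
time.sleep(0.1)
proc.send_signal(signal.SIGTERM)   # terminate the child
proc.wait()
print(proc.returncode)                      # -15
print(proc.returncode == -signal.SIGTERM)   # True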
datachain/cli/__init__.py
CHANGED
@@ -34,8 +34,10 @@ def main(argv: Optional[list[str]] = None) -> int:
     datachain_parser = get_parser()
     args = datachain_parser.parse_args(argv)

-    if args.command in ("internal-run-udf", "internal-run-udf-worker"):
-        return handle_udf(args.command)
+    if args.command == "internal-run-udf":
+        return handle_udf()
+    if args.command == "internal-run-udf-worker":
+        return handle_udf_runner(args.fd)

     if args.command is None:
         datachain_parser.print_help(sys.stderr)
@@ -303,13 +305,13 @@ def handle_general_exception(exc, args, logging_level):
     return error, 1


-def handle_udf(command: str) -> int:
-    if command == "internal-run-udf":
-        from datachain.query.dispatch import udf_entrypoint
+def handle_udf() -> int:
+    from datachain.query.dispatch import udf_entrypoint

-        return udf_entrypoint()
+    return udf_entrypoint()

-    if command == "internal-run-udf-worker":
-        from datachain.query.dispatch import udf_worker_entrypoint
+
+def handle_udf_runner(fd: Optional[int] = None) -> int:
+    from datachain.query.dispatch import udf_worker_entrypoint

-        return udf_worker_entrypoint()
+    return udf_worker_entrypoint(fd)
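The hunks above split the old combined handler into two dedicated handlers and route the new --fd value to the worker, keeping the heavy imports lazy. A self-contained sketch of that dispatch pattern with stand-in handlers (illustrative, not datachain's actual module):

# Illustrative sketch of the dispatch pattern this hunk moves to: one handler
# per internal command, heavy imports deferred into the handler body.
import argparse
from typing import Optional

def handle_udf() -> int:
    # In datachain this would lazily import and call the UDF entrypoint.
    print("running UDF entrypoint")
    return 0

def handle_udf_runner(fd: Optional[int] = None) -> int:
    # In datachain this would forward the descriptor to the worker entrypoint.
    print(f"running UDF worker, fd={fd}")
    return 0

def main(argv=None) -> int:
    parser = argparse.ArgumentParser()
    subp = parser.add_subparsers(dest="command")
    subp.add_parser("internal-run-udf")
    worker = subp.add_parser("internal-run-udf-worker")
    worker.add_argument("--fd", type=int, default=None)
    args = parser.parse_args(argv)

    if args.command == "internal-run-udf":
        return handle_udf()
    if args.command == "internal-run-udf-worker":
        return handle_udf_runner(args.fd)
    parser.print_help()
    return 1

if __name__ == "__main__":
    raise SystemExit(main())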
datachain/cli/commands/datasets.py
CHANGED

@@ -127,7 +127,7 @@ def _datasets_tabulate_row(name, both, local_version, studio_version):
 def rm_dataset(
     catalog: "Catalog",
     name: str,
-    version: Optional[int] = None,
+    version: Optional[str] = None,
     force: Optional[bool] = False,
     studio: bool = False,
     local: bool = False,

datachain/cli/commands/query.py
CHANGED
datachain/cli/commands/show.py
CHANGED
datachain/cli/parser/__init__.py
CHANGED
@@ -302,7 +302,7 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         "--version",
         action="store",
         default=None,
-        type=int,
+        type=str,
         help="Dataset version",
     )
     rm_dataset_parser.add_argument(
@@ -495,7 +495,7 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         "--version",
         action="store",
         default=None,
-        type=int,
+        type=str,
         help="Dataset version",
     )
     show_parser.add_argument("--schema", action="store_true", help="Show schema")
@@ -549,7 +549,15 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     add_anon_arg(parse_gc)

     subp.add_parser("internal-run-udf", parents=[parent_parser])
-    subp.add_parser("internal-run-udf-worker", parents=[parent_parser])
+    run_udf_worker = subp.add_parser("internal-run-udf-worker", parents=[parent_parser])
+    run_udf_worker.add_argument(
+        "--fd",
+        type=int,
+        action="store",
+        default=None,
+        help="File descriptor to write results to",
+    )
+
     add_completion_parser(subp, [parent_parser])
     return parser
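The new --fd option ("File descriptor to write results to") implies the worker inherits an already-open descriptor from the process that spawns it. Below is a hedged, POSIX-only sketch of that mechanism using a stand-in child command rather than the actual datachain internal-run-udf-worker invocation; how datachain's dispatcher wires this up is an assumption here.

# Sketch of handing a pipe write-end to a child via an inherited descriptor.
# The child here is a stand-in; in datachain the child would be the
# "internal-run-udf-worker --fd <N>" subcommand (assumption for illustration).
import os
import subprocess
import sys

read_fd, write_fd = os.pipe()

child = subprocess.Popen(
    [
        sys.executable,
        "-c",
        "import os, sys; fd = int(sys.argv[1]); os.write(fd, b'worker result'); os.close(fd)",
        str(write_fd),
    ],
    pass_fds=(write_fd,),  # keep the write end open and inheritable in the child
)
os.close(write_fd)                 # parent keeps only the read end
print(os.read(read_fd, 1024))      # b'worker result'
child.wait()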