datachain 0.8.13__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic.
- datachain/__init__.py +10 -0
- datachain/catalog/catalog.py +32 -9
- datachain/cli/__init__.py +2 -0
- datachain/cli/commands/datasets.py +78 -12
- datachain/cli/parser/__init__.py +62 -12
- datachain/cli/parser/job.py +14 -4
- datachain/cli/parser/studio.py +8 -0
- datachain/cli/parser/utils.py +20 -1
- datachain/dataset.py +7 -4
- datachain/diff/__init__.py +78 -128
- datachain/fs/reference.py +21 -0
- datachain/func/__init__.py +3 -1
- datachain/func/conditional.py +66 -2
- datachain/job.py +1 -1
- datachain/lib/arrow.py +1 -11
- datachain/lib/dc.py +2 -0
- datachain/lib/file.py +292 -5
- datachain/lib/hf.py +1 -1
- datachain/lib/video.py +223 -0
- datachain/query/dataset.py +28 -3
- datachain/remote/studio.py +13 -6
- datachain/studio.py +34 -12
- datachain/utils.py +12 -2
- {datachain-0.8.13.dist-info → datachain-0.9.0.dist-info}/METADATA +13 -5
- {datachain-0.8.13.dist-info → datachain-0.9.0.dist-info}/RECORD +30 -28
- /datachain/{lib/vfile.py → fs/__init__.py} +0 -0
- {datachain-0.8.13.dist-info → datachain-0.9.0.dist-info}/LICENSE +0 -0
- {datachain-0.8.13.dist-info → datachain-0.9.0.dist-info}/WHEEL +0 -0
- {datachain-0.8.13.dist-info → datachain-0.9.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.8.13.dist-info → datachain-0.9.0.dist-info}/top_level.txt +0 -0
datachain/__init__.py
CHANGED
@@ -4,9 +4,14 @@ from datachain.lib.file import (
     ArrowRow,
     File,
     FileError,
+    Image,
     ImageFile,
     TarVFile,
     TextFile,
+    Video,
+    VideoFile,
+    VideoFragment,
+    VideoFrame,
 )
 from datachain.lib.model_store import ModelStore
 from datachain.lib.udf import Aggregator, Generator, Mapper
@@ -27,6 +32,7 @@ __all__ = [
     "File",
     "FileError",
     "Generator",
+    "Image",
     "ImageFile",
     "Mapper",
     "ModelStore",
@@ -34,6 +40,10 @@ __all__ = [
     "Sys",
     "TarVFile",
     "TextFile",
+    "Video",
+    "VideoFile",
+    "VideoFragment",
+    "VideoFrame",
     "is_chain_type",
     "metrics",
     "param",
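With these new top-level exports, the video types can be imported directly from the package root. A minimal sketch (assuming the new classes follow the same pattern as the existing File/ImageFile models):

    # Hypothetical usage of the exports added in 0.9.0
    from datachain import Video, VideoFile, VideoFragment, VideoFrame

    # VideoFile is expected to behave like the other File subclasses,
    # e.g. as a column type in a chain's schema.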
datachain/catalog/catalog.py
CHANGED
@@ -89,10 +89,6 @@ PULL_DATASET_SLEEP_INTERVAL = 0.1  # sleep time while waiting for chunk to be av
 PULL_DATASET_CHECK_STATUS_INTERVAL = 20  # interval to check export status in Studio


-def raise_remote_error(error_message: str) -> NoReturn:
-    raise DataChainError(f"Error from server: {error_message}")
-
-
 def noop(_: str):
     pass

@@ -211,14 +207,14 @@ class DatasetRowsFetcher(NodesThreadPool):
             self.remote_ds_name, self.remote_ds_version
         )
         if not export_status_response.ok:
-
+            raise DataChainError(export_status_response.message)

         export_status = export_status_response.data["status"]  # type: ignore [index]

         if export_status == "failed":
-
+            raise DataChainError("Dataset export failed in Studio")
         if export_status == "removed":
-
+            raise DataChainError("Dataset export removed in Studio")

         self.last_status_check = time.time()

@@ -1101,6 +1097,31 @@ class Catalog:
     def get_dataset(self, name: str) -> DatasetRecord:
         return self.metastore.get_dataset(name)

+    def get_dataset_with_remote_fallback(
+        self, name: str, version: Optional[int] = None
+    ) -> DatasetRecord:
+        try:
+            ds = self.get_dataset(name)
+            if version and not ds.has_version(version):
+                raise DatasetVersionNotFoundError(
+                    f"Dataset {name} does not have version {version}"
+                )
+            return ds
+
+        except (DatasetNotFoundError, DatasetVersionNotFoundError):
+            print("Dataset not found in local catalog, trying to get from studio")
+
+            remote_ds_uri = f"{DATASET_PREFIX}{name}"
+            if version:
+                remote_ds_uri += f"@v{version}"
+
+            self.pull_dataset(
+                remote_ds_uri=remote_ds_uri,
+                local_ds_name=name,
+                local_ds_version=version,
+            )
+            return self.get_dataset(name)
+
     def get_dataset_with_version_uuid(self, uuid: str) -> DatasetRecord:
         """Returns dataset that contains version with specific uuid"""
         for dataset in self.ls_datasets():
@@ -1113,7 +1134,7 @@ class Catalog:

         info_response = studio_client.dataset_info(name)
         if not info_response.ok:
-
+            raise DataChainError(info_response.message)

         dataset_info = info_response.data
         assert isinstance(dataset_info, dict)
@@ -1209,6 +1230,8 @@ class Catalog:
         **kwargs,
     ) -> str:
         client_config = client_config or self.client_config
+        if client_config.get("anon"):
+            content_disposition = None
         client = Client.get_client(source, self.cache, **client_config)
         return client.url(
             path,
@@ -1407,7 +1430,7 @@ class Catalog:
             remote_ds_name, remote_ds_version.version
         )
         if not export_response.ok:
-
+            raise DataChainError(export_response.message)

         signed_urls = export_response.data
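The new get_dataset_with_remote_fallback method first resolves the dataset (and, optionally, a specific version) from the local catalog; on DatasetNotFoundError or DatasetVersionNotFoundError it pulls the dataset from Studio and retries the local lookup. A minimal sketch of how a caller might use it (obtaining the catalog via get_catalog is an assumption, not part of this diff):

    from datachain.catalog import get_catalog  # assumed helper for building a Catalog

    catalog = get_catalog()
    # Pulls "my-dataset" v2 from Studio if it is not already in the local catalog.
    ds = catalog.get_dataset_with_remote_fallback("my-dataset", version=2)
    print(ds.name, [v.version for v in ds.versions])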
datachain/cli/__init__.py
CHANGED
@@ -12,49 +12,115 @@ from datachain.error import DatasetNotFoundError
 from datachain.studio import list_datasets as list_datasets_studio


+def group_dataset_versions(datasets, latest_only=True):
+    grouped = {}
+    # Sort to ensure groupby works as expected
+    # (groupby expects consecutive items with the same key)
+    for name, version in sorted(datasets):
+        grouped.setdefault(name, []).append(version)
+
+    if latest_only:
+        # For each dataset name, pick the highest version.
+        return {name: max(versions) for name, versions in grouped.items()}
+    # For each dataset name, return a sorted list of unique versions.
+    return {name: sorted(set(versions)) for name, versions in grouped.items()}
+
+
 def list_datasets(
     catalog: "Catalog",
     studio: bool = False,
     local: bool = False,
     all: bool = True,
     team: Optional[str] = None,
+    latest_only: bool = True,
+    name: Optional[str] = None,
 ):
     token = Config().read().get("studio", {}).get("token")
     all, local, studio = determine_flavors(studio, local, all, token)
+    if name:
+        latest_only = False

-    local_datasets = set(list_datasets_local(catalog)) if all or local else set()
+    local_datasets = set(list_datasets_local(catalog, name)) if all or local else set()
     studio_datasets = (
-        set(list_datasets_studio(team=team
+        set(list_datasets_studio(team=team, name=name))
+        if (all or studio) and token
+        else set()
     )

+    # Group the datasets for both local and studio sources.
+    local_grouped = group_dataset_versions(local_datasets, latest_only)
+    studio_grouped = group_dataset_versions(studio_datasets, latest_only)
+
+    # Merge all dataset names from both sources.
+    all_dataset_names = sorted(set(local_grouped.keys()) | set(studio_grouped.keys()))
+
+    datasets = []
+    if latest_only:
+        # For each dataset name, get the latest version from each source (if available).
+        for n in all_dataset_names:
+            datasets.append((n, (local_grouped.get(n), studio_grouped.get(n))))
+    else:
+        # For each dataset name, merge all versions from both sources.
+        for n in all_dataset_names:
+            local_versions = local_grouped.get(n, [])
+            studio_versions = studio_grouped.get(n, [])
+
+            # If neither source has any versions, record it as (None, None).
+            if not local_versions and not studio_versions:
+                datasets.append((n, (None, None)))
+            else:
+                # For each unique version from either source, record its presence.
+                for version in sorted(set(local_versions) | set(studio_versions)):
+                    datasets.append(
+                        (
+                            n,
+                            (
+                                version if version in local_versions else None,
+                                version if version in studio_versions else None,
+                            ),
+                        )
+                    )
+
     rows = [
         _datasets_tabulate_row(
-            name=
-            version=version,
+            name=n,
             both=(all or (local and studio)) and token,
-
-
+            local_version=local_version,
+            studio_version=studio_version,
         )
-        for
+        for n, (local_version, studio_version) in datasets
     ]

     print(tabulate(rows, headers="keys"))


-def list_datasets_local(catalog: "Catalog"):
+def list_datasets_local(catalog: "Catalog", name: Optional[str] = None):
+    if name:
+        yield from list_datasets_local_versions(catalog, name)
+        return
+
     for d in catalog.ls_datasets():
         for v in d.versions:
             yield (d.name, v.version)


-def
+def list_datasets_local_versions(catalog: "Catalog", name: str):
+    ds = catalog.get_dataset(name)
+    for v in ds.versions:
+        yield (name, v.version)
+
+
+def _datasets_tabulate_row(name, both, local_version, studio_version):
     row = {
         "Name": name,
-        "Version": version,
     }
     if both:
-        row["Studio"] = "
-        row["Local"] = "
+        row["Studio"] = f"v{studio_version}" if studio_version else "\u2716"
+        row["Local"] = f"v{local_version}" if local_version else "\u2716"
+    else:
+        latest_version = local_version or studio_version
+        row["Latest Version"] = f"v{latest_version}" if latest_version else "\u2716"
+
     return row
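The new group_dataset_versions helper collapses (name, version) pairs into a mapping, keeping either only the highest version per dataset or a sorted list of unique versions. An illustrative example of the expected behaviour:

    pairs = [("cats", 1), ("cats", 3), ("dogs", 2), ("cats", 3)]

    group_dataset_versions(pairs)                      # {"cats": 3, "dogs": 2}
    group_dataset_versions(pairs, latest_only=False)   # {"cats": [1, 3], "dogs": [2]}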
datachain/cli/parser/__init__.py
CHANGED
@@ -1,5 +1,4 @@
 import argparse
-from argparse import ArgumentParser
 from importlib.metadata import PackageNotFoundError, version

 import shtab
@@ -10,12 +9,16 @@ from .job import add_jobs_parser
 from .studio import add_auth_parser
 from .utils import (
     FIND_COLUMNS,
+    CustomHelpFormatter,
     add_anon_arg,
     add_show_args,
     add_sources_arg,
     add_update_arg,
     find_columns_type,
 )
+from .utils import (
+    CustomArgumentParser as ArgumentParser,
+)


 def get_parser() -> ArgumentParser:  # noqa: PLR0915
@@ -28,10 +31,11 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     parser = ArgumentParser(
         description="DataChain: Wrangle unstructured AI data at scale.",
         prog="datachain",
+        formatter_class=CustomHelpFormatter,
     )
     parser.add_argument("-V", "--version", action="version", version=__version__)

-    parent_parser = ArgumentParser(add_help=False)
+    parent_parser = ArgumentParser(add_help=False, formatter_class=CustomHelpFormatter)
     parent_parser.add_argument(
         "-v", "--verbose", action="count", default=0, help="Be verbose"
     )
@@ -59,7 +63,10 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         help=f"Use `{parser.prog} command --help` for command-specific help",
     )
     parse_cp = subp.add_parser(
-        "cp",
+        "cp",
+        parents=[parent_parser],
+        description="Copy data files from the cloud.",
+        formatter_class=CustomHelpFormatter,
     )
     add_sources_arg(parse_cp).complete = shtab.DIR  # type: ignore[attr-defined]
     parse_cp.add_argument(
@@ -90,7 +97,10 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     add_update_arg(parse_cp)

     parse_clone = subp.add_parser(
-        "clone",
+        "clone",
+        parents=[parent_parser],
+        description="Copy data files from the cloud.",
+        formatter_class=CustomHelpFormatter,
     )
     add_sources_arg(parse_clone).complete = shtab.DIR  # type: ignore[attr-defined]
     parse_clone.add_argument(
@@ -134,6 +144,7 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         aliases=["ds"],
         parents=[parent_parser],
         description="Commands for managing datasets.",
+        formatter_class=CustomHelpFormatter,
     )
     add_anon_arg(datasets_parser)
     datasets_subparser = datasets_parser.add_subparsers(
@@ -145,6 +156,7 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         "pull",
         parents=[parent_parser],
         description="Pull specific dataset version from Studio.",
+        formatter_class=CustomHelpFormatter,
     )
     parse_pull.add_argument(
         "dataset",
@@ -188,7 +200,10 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     )

     parse_edit_dataset = datasets_subparser.add_parser(
-        "edit",
+        "edit",
+        parents=[parent_parser],
+        description="Edit dataset metadata.",
+        formatter_class=CustomHelpFormatter,
     )
     parse_edit_dataset.add_argument("name", type=str, help="Dataset name")
     parse_edit_dataset.add_argument(
@@ -234,7 +249,19 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     )

     datasets_ls_parser = datasets_subparser.add_parser(
-        "ls",
+        "ls",
+        parents=[parent_parser],
+        description="List datasets.",
+        formatter_class=CustomHelpFormatter,
+    )
+    datasets_ls_parser.add_argument(
+        "name", action="store", help="Name of the dataset to list", nargs="?"
+    )
+    datasets_ls_parser.add_argument(
+        "--versions",
+        action="store_true",
+        default=False,
+        help="List all the versions of each dataset",
     )
     datasets_ls_parser.add_argument(
         "--studio",
@@ -264,7 +291,11 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     )

     rm_dataset_parser = datasets_subparser.add_parser(
-        "rm",
+        "rm",
+        parents=[parent_parser],
+        description="Remove dataset.",
+        aliases=["remove"],
+        formatter_class=CustomHelpFormatter,
     )
     rm_dataset_parser.add_argument("name", type=str, help="Dataset name")
     rm_dataset_parser.add_argument(
@@ -308,7 +339,10 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     )

     parse_ls = subp.add_parser(
-        "ls",
+        "ls",
+        parents=[parent_parser],
+        description="List storage contents.",
+        formatter_class=CustomHelpFormatter,
     )
     add_anon_arg(parse_ls)
     add_update_arg(parse_ls)
@@ -348,7 +382,10 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     )

     parse_du = subp.add_parser(
-        "du",
+        "du",
+        parents=[parent_parser],
+        description="Display space usage.",
+        formatter_class=CustomHelpFormatter,
     )
     add_sources_arg(parse_du)
     add_anon_arg(parse_du)
@@ -380,7 +417,10 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     )

     parse_find = subp.add_parser(
-        "find",
+        "find",
+        parents=[parent_parser],
+        description="Search in a directory hierarchy.",
+        formatter_class=CustomHelpFormatter,
     )
     add_anon_arg(parse_find)
     add_update_arg(parse_find)
@@ -435,7 +475,10 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     )

     parse_index = subp.add_parser(
-        "index",
+        "index",
+        parents=[parent_parser],
+        description="Index storage location.",
+        formatter_class=CustomHelpFormatter,
     )
     add_anon_arg(parse_index)
     add_update_arg(parse_index)
@@ -445,6 +488,7 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         "show",
         parents=[parent_parser],
         description="Create a new dataset with a query script.",
+        formatter_class=CustomHelpFormatter,
     )
     show_parser.add_argument("name", type=str, help="Dataset name")
     show_parser.add_argument(
@@ -461,6 +505,7 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         "query",
         parents=[parent_parser],
         description="Create a new dataset with a query script.",
+        formatter_class=CustomHelpFormatter,
     )
     add_anon_arg(query_parser)
     query_parser.add_argument(
@@ -491,11 +536,15 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         "clear-cache",
         parents=[parent_parser],
         description="Clear the local file cache.",
+        formatter_class=CustomHelpFormatter,
     )
     add_anon_arg(parse_clear_cache)

     parse_gc = subp.add_parser(
-        "gc",
+        "gc",
+        parents=[parent_parser],
+        description="Garbage collect temporary tables.",
+        formatter_class=CustomHelpFormatter,
     )
     add_anon_arg(parse_gc)

@@ -510,6 +559,7 @@ def add_completion_parser(subparsers, parents):
         "completion",
         parents=parents,
         description="Output shell completion script.",
+        formatter_class=CustomHelpFormatter,
     )
     parser.add_argument(
         "-s",
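The datasets "ls" subcommand gains an optional dataset name and a --versions flag. A rough sketch of how the new arguments might surface after parsing (the invocation below is illustrative; the dest names are taken from this diff):

    from datachain.cli.parser import get_parser

    args = get_parser().parse_args(["datasets", "ls", "cats", "--versions"])
    # Expected: args.name == "cats" and args.versions is True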
datachain/cli/parser/job.py
CHANGED
@@ -1,8 +1,15 @@
+from datachain.cli.parser.utils import CustomHelpFormatter
+
+
 def add_jobs_parser(subparsers, parent_parser) -> None:
     jobs_help = "Manage jobs in Studio"
     jobs_description = "Commands to manage job execution in Studio."
     jobs_parser = subparsers.add_parser(
-        "job",
+        "job",
+        parents=[parent_parser],
+        description=jobs_description,
+        help=jobs_help,
+        formatter_class=CustomHelpFormatter,
     )
     jobs_subparser = jobs_parser.add_subparsers(
         dest="cmd",
@@ -17,10 +24,11 @@ def add_jobs_parser(subparsers, parent_parser) -> None:
         parents=[parent_parser],
         description=studio_run_description,
         help=studio_run_help,
+        formatter_class=CustomHelpFormatter,
     )

     studio_run_parser.add_argument(
-        "
+        "file",
         action="store",
         help="Query file to run",
     )
@@ -78,10 +86,11 @@ def add_jobs_parser(subparsers, parent_parser) -> None:
         parents=[parent_parser],
         description=studio_cancel_description,
         help=studio_cancel_help,
+        formatter_class=CustomHelpFormatter,
     )

     studio_cancel_parser.add_argument(
-        "
+        "id",
         action="store",
         help="Job ID to cancel",
     )
@@ -100,10 +109,11 @@ def add_jobs_parser(subparsers, parent_parser) -> None:
         parents=[parent_parser],
         description=studio_log_description,
         help=studio_log_help,
+        formatter_class=CustomHelpFormatter,
     )

     studio_log_parser.add_argument(
-        "
+        "id",
         action="store",
         help="Job ID to show logs for",
     )
datachain/cli/parser/studio.py
CHANGED
@@ -1,3 +1,6 @@
+from datachain.cli.parser.utils import CustomHelpFormatter
+
+
 def add_auth_parser(subparsers, parent_parser) -> None:
     from dvc_studio_client.auth import AVAILABLE_SCOPES

@@ -9,6 +12,7 @@ def add_auth_parser(subparsers, parent_parser) -> None:
         parents=[parent_parser],
         description=auth_description,
         help=auth_help,
+        formatter_class=CustomHelpFormatter,
     )
     auth_subparser = auth_parser.add_subparsers(
         dest="cmd",
@@ -27,6 +31,7 @@ def add_auth_parser(subparsers, parent_parser) -> None:
         parents=[parent_parser],
         description=auth_login_description,
         help=auth_login_help,
+        formatter_class=CustomHelpFormatter,
     )

     login_parser.add_argument(
@@ -69,6 +74,7 @@ def add_auth_parser(subparsers, parent_parser) -> None:
         parents=[parent_parser],
         description=auth_logout_description,
         help=auth_logout_help,
+        formatter_class=CustomHelpFormatter,
     )

     auth_team_help = "Set default team for Studio operations"
@@ -79,6 +85,7 @@ def add_auth_parser(subparsers, parent_parser) -> None:
         parents=[parent_parser],
         description=auth_team_description,
         help=auth_team_help,
+        formatter_class=CustomHelpFormatter,
     )
     team_parser.add_argument(
         "team_name",
@@ -100,4 +107,5 @@ def add_auth_parser(subparsers, parent_parser) -> None:
         parents=[parent_parser],
         description=auth_token_description,
         help=auth_token_help,
+        formatter_class=CustomHelpFormatter,
     )
datachain/cli/parser/utils.py
CHANGED
@@ -1,4 +1,4 @@
-from argparse import Action, ArgumentParser, ArgumentTypeError
+from argparse import Action, ArgumentParser, ArgumentTypeError, HelpFormatter
 from typing import Union

 from datachain.cli.utils import CommaSeparatedArgs
@@ -6,6 +6,25 @@ from datachain.cli.utils import CommaSeparatedArgs
 FIND_COLUMNS = ["du", "name", "path", "size", "type"]


+class CustomHelpFormatter(HelpFormatter):
+    def add_arguments(self, actions):
+        # Sort arguments to move --help and --version to the end
+        normal_actions = [
+            a for a in actions if a.dest not in ("help", "verbose", "quiet")
+        ]
+        special_actions = [a for a in actions if a.dest in ("help", "verbose", "quiet")]
+        super().add_arguments(normal_actions + special_actions)
+
+
+class CustomArgumentParser(ArgumentParser):
+    def error(self, message):
+        internal_commands = ["internal-run-udf", "internal-run-udf-worker"]
+
+        hidden_portion = "".join(f"'{cmd}', " for cmd in internal_commands)
+        message = message.replace(hidden_portion, "")
+        super().error(message)
+
+
 def find_columns_type(
     columns_str: str,
     default_colums_str: str = "path",
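Both helpers plug into the standard argparse machinery: CustomHelpFormatter reorders help output so that the help/verbose/quiet actions are listed last, and CustomArgumentParser strips the hidden internal commands from error messages. A minimal sketch of wiring them together (the parser and arguments below are illustrative only):

    from datachain.cli.parser.utils import CustomArgumentParser, CustomHelpFormatter

    parser = CustomArgumentParser(prog="example", formatter_class=CustomHelpFormatter)
    parser.add_argument("path", help="Input path")
    parser.add_argument("--output", help="Output path")
    parser.add_argument("-v", "--verbose", action="count", default=0, help="Be verbose")
    parser.print_help()  # --output is listed before -h/--help and -v/--verbose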
datachain/dataset.py
CHANGED
@@ -181,7 +181,7 @@ class DatasetVersion:

     @classmethod
     def parse(  # noqa: PLR0913
-        cls
+        cls,
         id: int,
         uuid: str,
         dataset_id: int,
@@ -288,7 +288,7 @@ class DatasetListVersion:

     @classmethod
     def parse(
-        cls
+        cls,
         id: int,
         uuid: str,
         dataset_id: int,
@@ -352,7 +352,7 @@ class DatasetRecord:

     @classmethod
     def parse(  # noqa: PLR0913
-        cls
+        cls,
         id: int,
         name: str,
         description: Optional[str],
@@ -567,7 +567,7 @@ class DatasetListRecord:

     @classmethod
     def parse(  # noqa: PLR0913
-        cls
+        cls,
         id: int,
         name: str,
         description: Optional[str],
@@ -628,6 +628,9 @@ class DatasetListRecord:
         self.versions.sort(key=lambda v: v.version)
         return self

+    def latest_version(self) -> DatasetListVersion:
+        return max(self.versions, key=lambda v: v.version)
+
     @property
     def is_bucket_listing(self) -> bool:
         """