datachain 0.8.3__py3-none-any.whl → 0.8.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/asyn.py +16 -6
- datachain/cache.py +32 -10
- datachain/catalog/catalog.py +17 -1
- datachain/cli/__init__.py +311 -0
- datachain/cli/commands/__init__.py +29 -0
- datachain/cli/commands/datasets.py +129 -0
- datachain/cli/commands/du.py +14 -0
- datachain/cli/commands/index.py +12 -0
- datachain/cli/commands/ls.py +169 -0
- datachain/cli/commands/misc.py +28 -0
- datachain/cli/commands/query.py +53 -0
- datachain/cli/commands/show.py +38 -0
- datachain/cli/parser/__init__.py +547 -0
- datachain/cli/parser/job.py +120 -0
- datachain/cli/parser/studio.py +126 -0
- datachain/cli/parser/utils.py +63 -0
- datachain/{cli_utils.py → cli/utils.py} +27 -1
- datachain/client/azure.py +6 -2
- datachain/client/fsspec.py +9 -3
- datachain/client/gcs.py +6 -2
- datachain/client/s3.py +16 -1
- datachain/data_storage/db_engine.py +9 -0
- datachain/data_storage/schema.py +4 -10
- datachain/data_storage/sqlite.py +7 -1
- datachain/data_storage/warehouse.py +6 -4
- datachain/{lib/diff.py → diff/__init__.py} +116 -12
- datachain/func/__init__.py +3 -2
- datachain/func/conditional.py +74 -0
- datachain/func/func.py +5 -1
- datachain/lib/arrow.py +7 -1
- datachain/lib/dc.py +8 -3
- datachain/lib/file.py +16 -5
- datachain/lib/hf.py +1 -1
- datachain/lib/listing.py +19 -1
- datachain/lib/pytorch.py +57 -13
- datachain/lib/signal_schema.py +89 -27
- datachain/lib/udf.py +82 -40
- datachain/listing.py +1 -0
- datachain/progress.py +20 -3
- datachain/query/dataset.py +122 -93
- datachain/query/dispatch.py +22 -16
- datachain/studio.py +58 -38
- datachain/utils.py +14 -3
- {datachain-0.8.3.dist-info → datachain-0.8.5.dist-info}/METADATA +9 -9
- {datachain-0.8.3.dist-info → datachain-0.8.5.dist-info}/RECORD +49 -37
- {datachain-0.8.3.dist-info → datachain-0.8.5.dist-info}/WHEEL +1 -1
- datachain/cli.py +0 -1475
- {datachain-0.8.3.dist-info → datachain-0.8.5.dist-info}/LICENSE +0 -0
- {datachain-0.8.3.dist-info → datachain-0.8.5.dist-info}/entry_points.txt +0 -0
- {datachain-0.8.3.dist-info → datachain-0.8.5.dist-info}/top_level.txt +0 -0
datachain/cli.py
DELETED
|
@@ -1,1475 +0,0 @@
|
|
|
1
|
-
import logging
|
|
2
|
-
import os
|
|
3
|
-
import shlex
|
|
4
|
-
import sys
|
|
5
|
-
import traceback
|
|
6
|
-
from argparse import Action, ArgumentParser, ArgumentTypeError, Namespace
|
|
7
|
-
from collections.abc import Iterable, Iterator, Sequence
|
|
8
|
-
from importlib.metadata import PackageNotFoundError, version
|
|
9
|
-
from itertools import chain
|
|
10
|
-
from multiprocessing import freeze_support
|
|
11
|
-
from typing import TYPE_CHECKING, Optional, Union
|
|
12
|
-
|
|
13
|
-
import shtab
|
|
14
|
-
from tabulate import tabulate
|
|
15
|
-
|
|
16
|
-
from datachain import Session, utils
|
|
17
|
-
from datachain.cli_utils import BooleanOptionalAction, CommaSeparatedArgs, KeyValueArgs
|
|
18
|
-
from datachain.config import Config
|
|
19
|
-
from datachain.error import DataChainError, DatasetNotFoundError
|
|
20
|
-
from datachain.lib.dc import DataChain
|
|
21
|
-
from datachain.studio import (
|
|
22
|
-
edit_studio_dataset,
|
|
23
|
-
list_datasets,
|
|
24
|
-
process_studio_cli_args,
|
|
25
|
-
remove_studio_dataset,
|
|
26
|
-
)
|
|
27
|
-
from datachain.telemetry import telemetry
|
|
28
|
-
|
|
29
|
-
if TYPE_CHECKING:
|
|
30
|
-
from datachain.catalog import Catalog
|
|
31
|
-
|
|
32
|
-
logger = logging.getLogger("datachain")
|
|
33
|
-
|
|
34
|
-
TTL_HUMAN = "4h"
|
|
35
|
-
TTL_INT = 4 * 60 * 60
|
|
36
|
-
FIND_COLUMNS = ["du", "name", "path", "size", "type"]
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
def human_time_type(value_str: str, can_be_none: bool = False) -> Optional[int]:
|
|
40
|
-
value = utils.human_time_to_int(value_str)
|
|
41
|
-
|
|
42
|
-
if value:
|
|
43
|
-
return value
|
|
44
|
-
if can_be_none:
|
|
45
|
-
return None
|
|
46
|
-
|
|
47
|
-
raise ArgumentTypeError(
|
|
48
|
-
"This option supports only a human-readable time interval like 12h or 4w."
|
|
49
|
-
)
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
def parse_find_column(column: str) -> str:
|
|
53
|
-
column_lower = column.strip().lower()
|
|
54
|
-
if column_lower in FIND_COLUMNS:
|
|
55
|
-
return column_lower
|
|
56
|
-
raise ArgumentTypeError(
|
|
57
|
-
f"Invalid column for find: '{column}' Options are: {','.join(FIND_COLUMNS)}"
|
|
58
|
-
)
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
def find_columns_type(
|
|
62
|
-
columns_str: str,
|
|
63
|
-
default_colums_str: str = "path",
|
|
64
|
-
) -> list[str]:
|
|
65
|
-
if not columns_str:
|
|
66
|
-
columns_str = default_colums_str
|
|
67
|
-
|
|
68
|
-
return [parse_find_column(c) for c in columns_str.split(",")]
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
def add_sources_arg(parser: ArgumentParser, nargs: Union[str, int] = "+") -> Action:
|
|
72
|
-
return parser.add_argument(
|
|
73
|
-
"sources",
|
|
74
|
-
type=str,
|
|
75
|
-
nargs=nargs,
|
|
76
|
-
help="Data sources - paths to cloud storage dirs",
|
|
77
|
-
)
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
def add_show_args(parser: ArgumentParser) -> None:
|
|
81
|
-
parser.add_argument(
|
|
82
|
-
"--limit",
|
|
83
|
-
action="store",
|
|
84
|
-
default=10,
|
|
85
|
-
type=int,
|
|
86
|
-
help="Number of rows to show",
|
|
87
|
-
)
|
|
88
|
-
parser.add_argument(
|
|
89
|
-
"--offset",
|
|
90
|
-
action="store",
|
|
91
|
-
default=0,
|
|
92
|
-
type=int,
|
|
93
|
-
help="Number of rows to offset",
|
|
94
|
-
)
|
|
95
|
-
parser.add_argument(
|
|
96
|
-
"--columns",
|
|
97
|
-
default=[],
|
|
98
|
-
action=CommaSeparatedArgs,
|
|
99
|
-
help="Columns to show",
|
|
100
|
-
)
|
|
101
|
-
parser.add_argument(
|
|
102
|
-
"--no-collapse",
|
|
103
|
-
action="store_true",
|
|
104
|
-
default=False,
|
|
105
|
-
help="Do not collapse the columns",
|
|
106
|
-
)
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
def add_studio_parser(subparsers, parent_parser) -> None:
|
|
110
|
-
studio_help = "Commands to authenticate DataChain with Iterative Studio"
|
|
111
|
-
studio_description = (
|
|
112
|
-
"Authenticate DataChain with Studio and set the token. "
|
|
113
|
-
"Once this token has been properly configured,\n"
|
|
114
|
-
"DataChain will utilize it for seamlessly sharing datasets\n"
|
|
115
|
-
"and using Studio features from CLI"
|
|
116
|
-
)
|
|
117
|
-
|
|
118
|
-
studio_parser = subparsers.add_parser(
|
|
119
|
-
"studio",
|
|
120
|
-
parents=[parent_parser],
|
|
121
|
-
description=studio_description,
|
|
122
|
-
help=studio_help,
|
|
123
|
-
)
|
|
124
|
-
studio_subparser = studio_parser.add_subparsers(
|
|
125
|
-
dest="cmd",
|
|
126
|
-
help="Use `DataChain studio CMD --help` to display command-specific help.",
|
|
127
|
-
required=True,
|
|
128
|
-
)
|
|
129
|
-
|
|
130
|
-
studio_login_help = "Authenticate DataChain with Studio host"
|
|
131
|
-
studio_login_description = (
|
|
132
|
-
"By default, this command authenticates the DataChain with Studio\n"
|
|
133
|
-
"using default scopes and assigns a random name as the token name."
|
|
134
|
-
)
|
|
135
|
-
login_parser = studio_subparser.add_parser(
|
|
136
|
-
"login",
|
|
137
|
-
parents=[parent_parser],
|
|
138
|
-
description=studio_login_description,
|
|
139
|
-
help=studio_login_help,
|
|
140
|
-
)
|
|
141
|
-
|
|
142
|
-
login_parser.add_argument(
|
|
143
|
-
"-H",
|
|
144
|
-
"--hostname",
|
|
145
|
-
action="store",
|
|
146
|
-
default=None,
|
|
147
|
-
help="The hostname of the Studio instance to authenticate with.",
|
|
148
|
-
)
|
|
149
|
-
login_parser.add_argument(
|
|
150
|
-
"-s",
|
|
151
|
-
"--scopes",
|
|
152
|
-
action="store",
|
|
153
|
-
default=None,
|
|
154
|
-
help="The scopes for the authentication token. ",
|
|
155
|
-
)
|
|
156
|
-
|
|
157
|
-
login_parser.add_argument(
|
|
158
|
-
"-n",
|
|
159
|
-
"--name",
|
|
160
|
-
action="store",
|
|
161
|
-
default=None,
|
|
162
|
-
help="The name of the authentication token. It will be used to\n"
|
|
163
|
-
"identify token shown in Studio profile.",
|
|
164
|
-
)
|
|
165
|
-
|
|
166
|
-
login_parser.add_argument(
|
|
167
|
-
"--no-open",
|
|
168
|
-
action="store_true",
|
|
169
|
-
default=False,
|
|
170
|
-
help="Use authentication flow based on user code.\n"
|
|
171
|
-
"You will be presented with user code to enter in browser.\n"
|
|
172
|
-
"DataChain will also use this if it cannot launch browser on your behalf.",
|
|
173
|
-
)
|
|
174
|
-
|
|
175
|
-
studio_logout_help = "Logout user from Studio"
|
|
176
|
-
studio_logout_description = "This removes the studio token from your global config."
|
|
177
|
-
|
|
178
|
-
studio_subparser.add_parser(
|
|
179
|
-
"logout",
|
|
180
|
-
parents=[parent_parser],
|
|
181
|
-
description=studio_logout_description,
|
|
182
|
-
help=studio_logout_help,
|
|
183
|
-
)
|
|
184
|
-
|
|
185
|
-
studio_team_help = "Set the default team for DataChain"
|
|
186
|
-
studio_team_description = (
|
|
187
|
-
"Set the default team for DataChain to use when interacting with Studio."
|
|
188
|
-
)
|
|
189
|
-
|
|
190
|
-
team_parser = studio_subparser.add_parser(
|
|
191
|
-
"team",
|
|
192
|
-
parents=[parent_parser],
|
|
193
|
-
description=studio_team_description,
|
|
194
|
-
help=studio_team_help,
|
|
195
|
-
)
|
|
196
|
-
team_parser.add_argument(
|
|
197
|
-
"team_name",
|
|
198
|
-
action="store",
|
|
199
|
-
help="The name of the team to set as the default.",
|
|
200
|
-
)
|
|
201
|
-
team_parser.add_argument(
|
|
202
|
-
"--global",
|
|
203
|
-
action="store_true",
|
|
204
|
-
default=False,
|
|
205
|
-
help="Set the team globally for all DataChain projects.",
|
|
206
|
-
)
|
|
207
|
-
|
|
208
|
-
studio_token_help = "View the token datachain uses to contact Studio" # noqa: S105 # nosec B105
|
|
209
|
-
|
|
210
|
-
studio_subparser.add_parser(
|
|
211
|
-
"token",
|
|
212
|
-
parents=[parent_parser],
|
|
213
|
-
description=studio_token_help,
|
|
214
|
-
help=studio_token_help,
|
|
215
|
-
)
|
|
216
|
-
|
|
217
|
-
studio_ls_dataset_help = "List the available datasets from Studio"
|
|
218
|
-
studio_ls_dataset_description = (
|
|
219
|
-
"This command lists all the datasets available in Studio.\n"
|
|
220
|
-
"It will show the dataset name and the number of versions available."
|
|
221
|
-
)
|
|
222
|
-
|
|
223
|
-
ls_dataset_parser = studio_subparser.add_parser(
|
|
224
|
-
"datasets",
|
|
225
|
-
parents=[parent_parser],
|
|
226
|
-
description=studio_ls_dataset_description,
|
|
227
|
-
help=studio_ls_dataset_help,
|
|
228
|
-
)
|
|
229
|
-
ls_dataset_parser.add_argument(
|
|
230
|
-
"--team",
|
|
231
|
-
action="store",
|
|
232
|
-
default=None,
|
|
233
|
-
help="The team to list datasets for. By default, it will use team from config.",
|
|
234
|
-
)
|
|
235
|
-
|
|
236
|
-
studio_run_help = "Run a job in Studio"
|
|
237
|
-
studio_run_description = "This command runs a job in Studio."
|
|
238
|
-
|
|
239
|
-
studio_run_parser = studio_subparser.add_parser(
|
|
240
|
-
"run",
|
|
241
|
-
parents=[parent_parser],
|
|
242
|
-
description=studio_run_description,
|
|
243
|
-
help=studio_run_help,
|
|
244
|
-
)
|
|
245
|
-
|
|
246
|
-
studio_run_parser.add_argument(
|
|
247
|
-
"query_file",
|
|
248
|
-
action="store",
|
|
249
|
-
help="The query file to run.",
|
|
250
|
-
)
|
|
251
|
-
|
|
252
|
-
studio_run_parser.add_argument(
|
|
253
|
-
"--team",
|
|
254
|
-
action="store",
|
|
255
|
-
default=None,
|
|
256
|
-
help="The team to run a job for. By default, it will use team from config.",
|
|
257
|
-
)
|
|
258
|
-
studio_run_parser.add_argument(
|
|
259
|
-
"--env-file",
|
|
260
|
-
action="store",
|
|
261
|
-
help="File containing environment variables to set for the job.",
|
|
262
|
-
)
|
|
263
|
-
|
|
264
|
-
studio_run_parser.add_argument(
|
|
265
|
-
"--env",
|
|
266
|
-
nargs="+",
|
|
267
|
-
help="Environment variable. Can be specified multiple times. Format: KEY=VALUE",
|
|
268
|
-
)
|
|
269
|
-
|
|
270
|
-
studio_run_parser.add_argument(
|
|
271
|
-
"--workers",
|
|
272
|
-
type=int,
|
|
273
|
-
help="Number of workers to use for the job.",
|
|
274
|
-
)
|
|
275
|
-
studio_run_parser.add_argument(
|
|
276
|
-
"--files",
|
|
277
|
-
nargs="+",
|
|
278
|
-
help="Files to include in the job.",
|
|
279
|
-
)
|
|
280
|
-
studio_run_parser.add_argument(
|
|
281
|
-
"--python-version",
|
|
282
|
-
action="store",
|
|
283
|
-
help="Python version to use for the job (e.g. '3.9', '3.10', '3.11').",
|
|
284
|
-
)
|
|
285
|
-
studio_run_parser.add_argument(
|
|
286
|
-
"--req-file",
|
|
287
|
-
action="store",
|
|
288
|
-
help="File containing Python package requirements.",
|
|
289
|
-
)
|
|
290
|
-
|
|
291
|
-
studio_run_parser.add_argument(
|
|
292
|
-
"--req",
|
|
293
|
-
nargs="+",
|
|
294
|
-
help="Python package requirement. Can be specified multiple times.",
|
|
295
|
-
)
|
|
296
|
-
|
|
297
|
-
studio_cancel_help = "Cancel a job in Studio"
|
|
298
|
-
studio_cancel_description = "This command cancels a job in Studio."
|
|
299
|
-
|
|
300
|
-
studio_cancel_parser = studio_subparser.add_parser(
|
|
301
|
-
"cancel",
|
|
302
|
-
parents=[parent_parser],
|
|
303
|
-
description=studio_cancel_description,
|
|
304
|
-
help=studio_cancel_help,
|
|
305
|
-
)
|
|
306
|
-
|
|
307
|
-
studio_cancel_parser.add_argument(
|
|
308
|
-
"job_id",
|
|
309
|
-
action="store",
|
|
310
|
-
help="The job ID to cancel.",
|
|
311
|
-
)
|
|
312
|
-
studio_cancel_parser.add_argument(
|
|
313
|
-
"--team",
|
|
314
|
-
action="store",
|
|
315
|
-
default=None,
|
|
316
|
-
help="The team to cancel a job for. By default, it will use team from config.",
|
|
317
|
-
)
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
def get_parser() -> ArgumentParser: # noqa: PLR0915
|
|
321
|
-
try:
|
|
322
|
-
__version__ = version("datachain")
|
|
323
|
-
except PackageNotFoundError:
|
|
324
|
-
# package is not installed
|
|
325
|
-
__version__ = "unknown"
|
|
326
|
-
|
|
327
|
-
parser = ArgumentParser(
|
|
328
|
-
description="DataChain: Wrangle unstructured AI data at scale", prog="datachain"
|
|
329
|
-
)
|
|
330
|
-
parser.add_argument("-V", "--version", action="version", version=__version__)
|
|
331
|
-
|
|
332
|
-
parent_parser = ArgumentParser(add_help=False)
|
|
333
|
-
parent_parser.add_argument(
|
|
334
|
-
"--aws-endpoint-url",
|
|
335
|
-
type=str,
|
|
336
|
-
help="AWS endpoint URL",
|
|
337
|
-
)
|
|
338
|
-
parent_parser.add_argument(
|
|
339
|
-
"--anon",
|
|
340
|
-
action="store_true",
|
|
341
|
-
help="AWS anon (aka awscli's --no-sign-request)",
|
|
342
|
-
)
|
|
343
|
-
parent_parser.add_argument(
|
|
344
|
-
"-u", "--update", action="count", default=0, help="Update cache"
|
|
345
|
-
)
|
|
346
|
-
parent_parser.add_argument(
|
|
347
|
-
"-v", "--verbose", action="count", default=0, help="Verbose"
|
|
348
|
-
)
|
|
349
|
-
parent_parser.add_argument(
|
|
350
|
-
"-q", "--quiet", action="count", default=0, help="Be quiet"
|
|
351
|
-
)
|
|
352
|
-
parent_parser.add_argument(
|
|
353
|
-
"--debug-sql",
|
|
354
|
-
action="store_true",
|
|
355
|
-
default=False,
|
|
356
|
-
help="Show All SQL Queries (very verbose output, for debugging only)",
|
|
357
|
-
)
|
|
358
|
-
parent_parser.add_argument(
|
|
359
|
-
"--pdb",
|
|
360
|
-
action="store_true",
|
|
361
|
-
default=False,
|
|
362
|
-
help="Drop into the pdb debugger on fatal exception",
|
|
363
|
-
)
|
|
364
|
-
|
|
365
|
-
subp = parser.add_subparsers(
|
|
366
|
-
title="Available Commands",
|
|
367
|
-
metavar="command",
|
|
368
|
-
dest="command",
|
|
369
|
-
help=f"Use `{parser.prog} command --help` for command-specific help.",
|
|
370
|
-
required=True,
|
|
371
|
-
)
|
|
372
|
-
parse_cp = subp.add_parser(
|
|
373
|
-
"cp", parents=[parent_parser], description="Copy data files from the cloud"
|
|
374
|
-
)
|
|
375
|
-
add_sources_arg(parse_cp).complete = shtab.DIR # type: ignore[attr-defined]
|
|
376
|
-
parse_cp.add_argument("output", type=str, help="Output")
|
|
377
|
-
parse_cp.add_argument(
|
|
378
|
-
"-f",
|
|
379
|
-
"--force",
|
|
380
|
-
default=False,
|
|
381
|
-
action="store_true",
|
|
382
|
-
help="Force creating outputs",
|
|
383
|
-
)
|
|
384
|
-
parse_cp.add_argument(
|
|
385
|
-
"-r",
|
|
386
|
-
"-R",
|
|
387
|
-
"--recursive",
|
|
388
|
-
default=False,
|
|
389
|
-
action="store_true",
|
|
390
|
-
help="Copy directories recursively",
|
|
391
|
-
)
|
|
392
|
-
parse_cp.add_argument(
|
|
393
|
-
"--no-glob",
|
|
394
|
-
default=False,
|
|
395
|
-
action="store_true",
|
|
396
|
-
help="Do not expand globs (such as * or ?)",
|
|
397
|
-
)
|
|
398
|
-
|
|
399
|
-
parse_clone = subp.add_parser(
|
|
400
|
-
"clone", parents=[parent_parser], description="Copy data files from the cloud"
|
|
401
|
-
)
|
|
402
|
-
add_sources_arg(parse_clone).complete = shtab.DIR # type: ignore[attr-defined]
|
|
403
|
-
parse_clone.add_argument("output", type=str, help="Output")
|
|
404
|
-
parse_clone.add_argument(
|
|
405
|
-
"-f",
|
|
406
|
-
"--force",
|
|
407
|
-
default=False,
|
|
408
|
-
action="store_true",
|
|
409
|
-
help="Force creating outputs",
|
|
410
|
-
)
|
|
411
|
-
parse_clone.add_argument(
|
|
412
|
-
"-r",
|
|
413
|
-
"-R",
|
|
414
|
-
"--recursive",
|
|
415
|
-
default=False,
|
|
416
|
-
action="store_true",
|
|
417
|
-
help="Copy directories recursively",
|
|
418
|
-
)
|
|
419
|
-
parse_clone.add_argument(
|
|
420
|
-
"--no-glob",
|
|
421
|
-
default=False,
|
|
422
|
-
action="store_true",
|
|
423
|
-
help="Do not expand globs (such as * or ?)",
|
|
424
|
-
)
|
|
425
|
-
parse_clone.add_argument(
|
|
426
|
-
"--no-cp",
|
|
427
|
-
default=False,
|
|
428
|
-
action="store_true",
|
|
429
|
-
help="Do not copy files, just create a dataset",
|
|
430
|
-
)
|
|
431
|
-
parse_clone.add_argument(
|
|
432
|
-
"--edatachain",
|
|
433
|
-
default=False,
|
|
434
|
-
action="store_true",
|
|
435
|
-
help="Create a .edatachain file",
|
|
436
|
-
)
|
|
437
|
-
parse_clone.add_argument(
|
|
438
|
-
"--edatachain-file",
|
|
439
|
-
help="Use a different filename for the resulting .edatachain file",
|
|
440
|
-
)
|
|
441
|
-
|
|
442
|
-
add_studio_parser(subp, parent_parser)
|
|
443
|
-
|
|
444
|
-
datasets_parser = subp.add_parser(
|
|
445
|
-
"datasets",
|
|
446
|
-
aliases=["ds"],
|
|
447
|
-
parents=[parent_parser],
|
|
448
|
-
description="Commands for managing datasers",
|
|
449
|
-
)
|
|
450
|
-
datasets_subparser = datasets_parser.add_subparsers(
|
|
451
|
-
dest="datasets_cmd",
|
|
452
|
-
help="Use `datachain datasets CMD --help` to display command specific help",
|
|
453
|
-
)
|
|
454
|
-
|
|
455
|
-
parse_pull = datasets_subparser.add_parser(
|
|
456
|
-
"pull",
|
|
457
|
-
parents=[parent_parser],
|
|
458
|
-
description="Pull specific dataset version from SaaS",
|
|
459
|
-
)
|
|
460
|
-
parse_pull.add_argument(
|
|
461
|
-
"dataset",
|
|
462
|
-
type=str,
|
|
463
|
-
help="Name and version of remote dataset created in SaaS",
|
|
464
|
-
)
|
|
465
|
-
parse_pull.add_argument("-o", "--output", type=str, help="Output")
|
|
466
|
-
parse_pull.add_argument(
|
|
467
|
-
"-f",
|
|
468
|
-
"--force",
|
|
469
|
-
default=False,
|
|
470
|
-
action="store_true",
|
|
471
|
-
help="Force creating outputs",
|
|
472
|
-
)
|
|
473
|
-
parse_pull.add_argument(
|
|
474
|
-
"-r",
|
|
475
|
-
"-R",
|
|
476
|
-
"--recursive",
|
|
477
|
-
default=False,
|
|
478
|
-
action="store_true",
|
|
479
|
-
help="Copy directories recursively",
|
|
480
|
-
)
|
|
481
|
-
parse_pull.add_argument(
|
|
482
|
-
"--cp",
|
|
483
|
-
default=False,
|
|
484
|
-
action="store_true",
|
|
485
|
-
help="Copy actual files after pulling remote dataset into local DB",
|
|
486
|
-
)
|
|
487
|
-
parse_pull.add_argument(
|
|
488
|
-
"--edatachain",
|
|
489
|
-
default=False,
|
|
490
|
-
action="store_true",
|
|
491
|
-
help="Create .edatachain file",
|
|
492
|
-
)
|
|
493
|
-
parse_pull.add_argument(
|
|
494
|
-
"--edatachain-file",
|
|
495
|
-
help="Use a different filename for the resulting .edatachain file",
|
|
496
|
-
)
|
|
497
|
-
parse_pull.add_argument(
|
|
498
|
-
"--local-name",
|
|
499
|
-
action="store",
|
|
500
|
-
default=None,
|
|
501
|
-
help="Name of the local dataset",
|
|
502
|
-
)
|
|
503
|
-
parse_pull.add_argument(
|
|
504
|
-
"--local-version",
|
|
505
|
-
action="store",
|
|
506
|
-
default=None,
|
|
507
|
-
help="Version of the local dataset",
|
|
508
|
-
)
|
|
509
|
-
|
|
510
|
-
parse_edit_dataset = datasets_subparser.add_parser(
|
|
511
|
-
"edit", parents=[parent_parser], description="Edit dataset metadata"
|
|
512
|
-
)
|
|
513
|
-
parse_edit_dataset.add_argument("name", type=str, help="Dataset name")
|
|
514
|
-
parse_edit_dataset.add_argument(
|
|
515
|
-
"--new-name",
|
|
516
|
-
action="store",
|
|
517
|
-
help="Dataset new name",
|
|
518
|
-
)
|
|
519
|
-
parse_edit_dataset.add_argument(
|
|
520
|
-
"--description",
|
|
521
|
-
action="store",
|
|
522
|
-
help="Dataset description",
|
|
523
|
-
)
|
|
524
|
-
parse_edit_dataset.add_argument(
|
|
525
|
-
"--labels",
|
|
526
|
-
nargs="+",
|
|
527
|
-
help="Dataset labels",
|
|
528
|
-
)
|
|
529
|
-
parse_edit_dataset.add_argument(
|
|
530
|
-
"--studio",
|
|
531
|
-
action="store_true",
|
|
532
|
-
default=False,
|
|
533
|
-
help="Edit dataset from Studio",
|
|
534
|
-
)
|
|
535
|
-
parse_edit_dataset.add_argument(
|
|
536
|
-
"-L",
|
|
537
|
-
"--local",
|
|
538
|
-
action="store_true",
|
|
539
|
-
default=False,
|
|
540
|
-
help="Edit local dataset only",
|
|
541
|
-
)
|
|
542
|
-
parse_edit_dataset.add_argument(
|
|
543
|
-
"-a",
|
|
544
|
-
"--all",
|
|
545
|
-
action="store_true",
|
|
546
|
-
default=True,
|
|
547
|
-
help="Edit both datasets from studio and local",
|
|
548
|
-
)
|
|
549
|
-
parse_edit_dataset.add_argument(
|
|
550
|
-
"--team",
|
|
551
|
-
action="store",
|
|
552
|
-
default=None,
|
|
553
|
-
help="The team to edit a dataset. By default, it will use team from config.",
|
|
554
|
-
)
|
|
555
|
-
|
|
556
|
-
datasets_parser = datasets_subparser.add_parser(
|
|
557
|
-
"ls", parents=[parent_parser], description="List datasets"
|
|
558
|
-
)
|
|
559
|
-
datasets_parser.add_argument(
|
|
560
|
-
"--studio",
|
|
561
|
-
action="store_true",
|
|
562
|
-
default=False,
|
|
563
|
-
help="List the files in the Studio",
|
|
564
|
-
)
|
|
565
|
-
datasets_parser.add_argument(
|
|
566
|
-
"-L",
|
|
567
|
-
"--local",
|
|
568
|
-
action="store_true",
|
|
569
|
-
default=False,
|
|
570
|
-
help="List local files only",
|
|
571
|
-
)
|
|
572
|
-
datasets_parser.add_argument(
|
|
573
|
-
"-a",
|
|
574
|
-
"--all",
|
|
575
|
-
action="store_true",
|
|
576
|
-
default=True,
|
|
577
|
-
help="List all files including hidden files",
|
|
578
|
-
)
|
|
579
|
-
datasets_parser.add_argument(
|
|
580
|
-
"--team",
|
|
581
|
-
action="store",
|
|
582
|
-
default=None,
|
|
583
|
-
help="The team to list datasets for. By default, it will use team from config.",
|
|
584
|
-
)
|
|
585
|
-
|
|
586
|
-
rm_dataset_parser = datasets_subparser.add_parser(
|
|
587
|
-
"rm", parents=[parent_parser], description="Removes dataset", aliases=["remove"]
|
|
588
|
-
)
|
|
589
|
-
rm_dataset_parser.add_argument("name", type=str, help="Dataset name")
|
|
590
|
-
rm_dataset_parser.add_argument(
|
|
591
|
-
"--version",
|
|
592
|
-
action="store",
|
|
593
|
-
default=None,
|
|
594
|
-
type=int,
|
|
595
|
-
help="Dataset version",
|
|
596
|
-
)
|
|
597
|
-
rm_dataset_parser.add_argument(
|
|
598
|
-
"--force",
|
|
599
|
-
default=False,
|
|
600
|
-
action=BooleanOptionalAction,
|
|
601
|
-
help="Force delete registered dataset with all of it's versions",
|
|
602
|
-
)
|
|
603
|
-
rm_dataset_parser.add_argument(
|
|
604
|
-
"--studio",
|
|
605
|
-
action="store_true",
|
|
606
|
-
default=False,
|
|
607
|
-
help="Remove dataset from Studio",
|
|
608
|
-
)
|
|
609
|
-
rm_dataset_parser.add_argument(
|
|
610
|
-
"-L",
|
|
611
|
-
"--local",
|
|
612
|
-
action="store_true",
|
|
613
|
-
default=False,
|
|
614
|
-
help="Remove local datasets only",
|
|
615
|
-
)
|
|
616
|
-
rm_dataset_parser.add_argument(
|
|
617
|
-
"-a",
|
|
618
|
-
"--all",
|
|
619
|
-
action="store_true",
|
|
620
|
-
default=True,
|
|
621
|
-
help="Remove both local and studio",
|
|
622
|
-
)
|
|
623
|
-
rm_dataset_parser.add_argument(
|
|
624
|
-
"--team",
|
|
625
|
-
action="store",
|
|
626
|
-
default=None,
|
|
627
|
-
help="The team to delete a dataset. By default, it will use team from config.",
|
|
628
|
-
)
|
|
629
|
-
|
|
630
|
-
dataset_stats_parser = datasets_subparser.add_parser(
|
|
631
|
-
"stats",
|
|
632
|
-
parents=[parent_parser],
|
|
633
|
-
description="Shows basic dataset stats",
|
|
634
|
-
)
|
|
635
|
-
dataset_stats_parser.add_argument("name", type=str, help="Dataset name")
|
|
636
|
-
dataset_stats_parser.add_argument(
|
|
637
|
-
"--version",
|
|
638
|
-
action="store",
|
|
639
|
-
default=None,
|
|
640
|
-
type=int,
|
|
641
|
-
help="Dataset version",
|
|
642
|
-
)
|
|
643
|
-
dataset_stats_parser.add_argument(
|
|
644
|
-
"-b",
|
|
645
|
-
"--bytes",
|
|
646
|
-
default=False,
|
|
647
|
-
action="store_true",
|
|
648
|
-
help="Display size in bytes instead of human-readable size",
|
|
649
|
-
)
|
|
650
|
-
dataset_stats_parser.add_argument(
|
|
651
|
-
"--si",
|
|
652
|
-
default=False,
|
|
653
|
-
action="store_true",
|
|
654
|
-
help="Display size using powers of 1000 not 1024",
|
|
655
|
-
)
|
|
656
|
-
|
|
657
|
-
parse_ls = subp.add_parser(
|
|
658
|
-
"ls", parents=[parent_parser], description="List storage contents"
|
|
659
|
-
)
|
|
660
|
-
add_sources_arg(parse_ls, nargs="*")
|
|
661
|
-
parse_ls.add_argument(
|
|
662
|
-
"-l",
|
|
663
|
-
"--long",
|
|
664
|
-
action="count",
|
|
665
|
-
default=0,
|
|
666
|
-
help="List files in the long format",
|
|
667
|
-
)
|
|
668
|
-
parse_ls.add_argument(
|
|
669
|
-
"--studio",
|
|
670
|
-
action="store_true",
|
|
671
|
-
default=False,
|
|
672
|
-
help="List the files in the Studio",
|
|
673
|
-
)
|
|
674
|
-
parse_ls.add_argument(
|
|
675
|
-
"-L",
|
|
676
|
-
"--local",
|
|
677
|
-
action="store_true",
|
|
678
|
-
default=False,
|
|
679
|
-
help="List local files only",
|
|
680
|
-
)
|
|
681
|
-
parse_ls.add_argument(
|
|
682
|
-
"-a",
|
|
683
|
-
"--all",
|
|
684
|
-
action="store_true",
|
|
685
|
-
default=True,
|
|
686
|
-
help="List all files including hidden files",
|
|
687
|
-
)
|
|
688
|
-
parse_ls.add_argument(
|
|
689
|
-
"--team",
|
|
690
|
-
action="store",
|
|
691
|
-
default=None,
|
|
692
|
-
help="The team to list datasets for. By default, it will use team from config.",
|
|
693
|
-
)
|
|
694
|
-
|
|
695
|
-
parse_du = subp.add_parser(
|
|
696
|
-
"du", parents=[parent_parser], description="Display space usage"
|
|
697
|
-
)
|
|
698
|
-
add_sources_arg(parse_du)
|
|
699
|
-
parse_du.add_argument(
|
|
700
|
-
"-b",
|
|
701
|
-
"--bytes",
|
|
702
|
-
default=False,
|
|
703
|
-
action="store_true",
|
|
704
|
-
help="Display sizes in bytes instead of human-readable sizes",
|
|
705
|
-
)
|
|
706
|
-
parse_du.add_argument(
|
|
707
|
-
"-d",
|
|
708
|
-
"--depth",
|
|
709
|
-
"--max-depth",
|
|
710
|
-
default=0,
|
|
711
|
-
type=int,
|
|
712
|
-
metavar="N",
|
|
713
|
-
help=(
|
|
714
|
-
"Display sizes for N directory depths below the given directory, "
|
|
715
|
-
"the default is 0 (summarize provided directory only)."
|
|
716
|
-
),
|
|
717
|
-
)
|
|
718
|
-
parse_du.add_argument(
|
|
719
|
-
"--si",
|
|
720
|
-
default=False,
|
|
721
|
-
action="store_true",
|
|
722
|
-
help="Display sizes using powers of 1000 not 1024",
|
|
723
|
-
)
|
|
724
|
-
|
|
725
|
-
parse_find = subp.add_parser(
|
|
726
|
-
"find", parents=[parent_parser], description="Search in a directory hierarchy"
|
|
727
|
-
)
|
|
728
|
-
add_sources_arg(parse_find)
|
|
729
|
-
parse_find.add_argument(
|
|
730
|
-
"--name",
|
|
731
|
-
type=str,
|
|
732
|
-
action="append",
|
|
733
|
-
help="Filename to match pattern.",
|
|
734
|
-
)
|
|
735
|
-
parse_find.add_argument(
|
|
736
|
-
"--iname",
|
|
737
|
-
type=str,
|
|
738
|
-
action="append",
|
|
739
|
-
help="Like -name but case insensitive.",
|
|
740
|
-
)
|
|
741
|
-
parse_find.add_argument(
|
|
742
|
-
"--path",
|
|
743
|
-
type=str,
|
|
744
|
-
action="append",
|
|
745
|
-
help="Path to match pattern.",
|
|
746
|
-
)
|
|
747
|
-
parse_find.add_argument(
|
|
748
|
-
"--ipath",
|
|
749
|
-
type=str,
|
|
750
|
-
action="append",
|
|
751
|
-
help="Like -path but case insensitive.",
|
|
752
|
-
)
|
|
753
|
-
parse_find.add_argument(
|
|
754
|
-
"--size",
|
|
755
|
-
type=str,
|
|
756
|
-
help=(
|
|
757
|
-
"Filter by size (+ is greater or equal, - is less or equal). "
|
|
758
|
-
"Specified size is in bytes, or use a suffix like K, M, G for "
|
|
759
|
-
"kilobytes, megabytes, gigabytes, etc."
|
|
760
|
-
),
|
|
761
|
-
)
|
|
762
|
-
parse_find.add_argument(
|
|
763
|
-
"--type",
|
|
764
|
-
type=str,
|
|
765
|
-
help='File type: "f" - regular, "d" - directory',
|
|
766
|
-
)
|
|
767
|
-
parse_find.add_argument(
|
|
768
|
-
"-c",
|
|
769
|
-
"--columns",
|
|
770
|
-
type=find_columns_type,
|
|
771
|
-
default=None,
|
|
772
|
-
help=(
|
|
773
|
-
"A comma-separated list of columns to print for each result. "
|
|
774
|
-
f"Options are: {','.join(FIND_COLUMNS)} (Default: path)"
|
|
775
|
-
),
|
|
776
|
-
)
|
|
777
|
-
|
|
778
|
-
parse_index = subp.add_parser(
|
|
779
|
-
"index", parents=[parent_parser], description="Index storage location"
|
|
780
|
-
)
|
|
781
|
-
add_sources_arg(parse_index)
|
|
782
|
-
|
|
783
|
-
show_parser = subp.add_parser(
|
|
784
|
-
"show",
|
|
785
|
-
parents=[parent_parser],
|
|
786
|
-
description="Create a new dataset with a query script",
|
|
787
|
-
)
|
|
788
|
-
show_parser.add_argument("name", type=str, help="Dataset name")
|
|
789
|
-
show_parser.add_argument(
|
|
790
|
-
"--version",
|
|
791
|
-
action="store",
|
|
792
|
-
default=None,
|
|
793
|
-
type=int,
|
|
794
|
-
help="Dataset version",
|
|
795
|
-
)
|
|
796
|
-
show_parser.add_argument("--schema", action="store_true", help="Show schema")
|
|
797
|
-
add_show_args(show_parser)
|
|
798
|
-
|
|
799
|
-
query_parser = subp.add_parser(
|
|
800
|
-
"query",
|
|
801
|
-
parents=[parent_parser],
|
|
802
|
-
description="Create a new dataset with a query script",
|
|
803
|
-
)
|
|
804
|
-
query_parser.add_argument(
|
|
805
|
-
"script", metavar="<script.py>", type=str, help="Filepath for script"
|
|
806
|
-
)
|
|
807
|
-
query_parser.add_argument(
|
|
808
|
-
"--parallel",
|
|
809
|
-
nargs="?",
|
|
810
|
-
type=int,
|
|
811
|
-
const=-1,
|
|
812
|
-
default=None,
|
|
813
|
-
metavar="N",
|
|
814
|
-
help=(
|
|
815
|
-
"Use multiprocessing to run any query script UDFs with N worker processes. "
|
|
816
|
-
"N defaults to the CPU count."
|
|
817
|
-
),
|
|
818
|
-
)
|
|
819
|
-
query_parser.add_argument(
|
|
820
|
-
"-p",
|
|
821
|
-
"--param",
|
|
822
|
-
metavar="param=value",
|
|
823
|
-
nargs=1,
|
|
824
|
-
action=KeyValueArgs,
|
|
825
|
-
help="Query parameters",
|
|
826
|
-
)
|
|
827
|
-
|
|
828
|
-
subp.add_parser(
|
|
829
|
-
"clear-cache", parents=[parent_parser], description="Clear the local file cache"
|
|
830
|
-
)
|
|
831
|
-
subp.add_parser(
|
|
832
|
-
"gc", parents=[parent_parser], description="Garbage collect temporary tables"
|
|
833
|
-
)
|
|
834
|
-
|
|
835
|
-
subp.add_parser("internal-run-udf", parents=[parent_parser])
|
|
836
|
-
subp.add_parser("internal-run-udf-worker", parents=[parent_parser])
|
|
837
|
-
add_completion_parser(subp, [parent_parser])
|
|
838
|
-
return parser
|
|
839
|
-
|
|
840
|
-
|
|
841
|
-
def add_completion_parser(subparsers, parents):
|
|
842
|
-
parser = subparsers.add_parser(
|
|
843
|
-
"completion",
|
|
844
|
-
parents=parents,
|
|
845
|
-
description="Output shell completion script",
|
|
846
|
-
)
|
|
847
|
-
parser.add_argument(
|
|
848
|
-
"-s",
|
|
849
|
-
"--shell",
|
|
850
|
-
help="Shell syntax for completions.",
|
|
851
|
-
default="bash",
|
|
852
|
-
choices=shtab.SUPPORTED_SHELLS,
|
|
853
|
-
)
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
def get_logging_level(args: Namespace) -> int:
    """Map CLI verbosity flags to a ``logging`` level.

    ``--quiet`` takes precedence over ``--verbose``; with neither flag the
    level is INFO.
    """
    if args.quiet:
        return logging.CRITICAL
    return logging.DEBUG if args.verbose else logging.INFO
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
def ls_urls(
    sources,
    catalog: "Catalog",
    long: bool = False,
    **kwargs,
) -> Iterator[tuple[str, Iterator[str]]]:
    """Yield ``(directory, entries)`` pairs from the flat listing, merging
    consecutive runs of the same directory into one chained entry iterator."""
    pending_dir = None
    pending_values: list = []
    for dir_name, values in _ls_urls_flat(sources, long, catalog, **kwargs):
        if pending_dir is None or dir_name == pending_dir:
            pending_values.append(values)
        else:
            # Directory changed: flush everything accumulated for the previous one.
            yield pending_dir, chain(*pending_values)
            pending_values = [values]
        pending_dir = dir_name
    if pending_dir is not None:
        yield pending_dir, chain(*pending_values)
|
|
881
|
-
|
|
882
|
-
|
|
883
|
-
def _node_data_to_ls_values(row, long_format=False):
    """Format one node row ``(name, dir_type[, last_modified])`` as an ls entry.

    Directories get a trailing slash; in long format the entry also carries
    the last-modified timestamp (omitted for directories).
    """
    from datachain.node import DirType, long_line_str

    node_name, dir_type = row[0], row[1]
    is_directory = dir_type == DirType.DIR
    entry = f"{node_name}/" if is_directory else node_name
    if not long_format:
        return entry
    return long_line_str(entry, None if is_directory else row[2])
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
def _ls_urls_flat(
    sources,
    long: bool,
    catalog: "Catalog",
    **kwargs,
) -> Iterator[tuple[str, Iterator[str]]]:
    """Yield ``(directory, entries)`` pairs for each source.

    Root URLs (e.g. a bare scheme) are expanded into their buckets via the
    storage client; everything else is listed through the catalog.

    Raises:
        FileNotFoundError: if a non-root source matches nothing.
    """
    from datachain.client import Client
    from datachain.node import long_line_str

    for src in sources:
        client_cls = Client.get_implementation(src)
        if client_cls.is_root_url(src):
            buckets = client_cls.ls_buckets(**catalog.client_config)
            if long:
                entries = (long_line_str(b.name, b.created) for b in buckets)
            else:
                entries = (b.name for b in buckets)
            yield src, entries
            continue

        fields = ["name", "dir_type"] + (["last_modified"] if long else [])
        matched = False
        for data_source, results in catalog.ls([src], fields=fields, **kwargs):
            matched = True
            yield data_source.dirname(), (
                _node_data_to_ls_values(r, long) for r in results
            )
        if not matched:
            raise FileNotFoundError(f"No such file or directory: {src}")
|
|
926
|
-
|
|
927
|
-
|
|
928
|
-
def ls_local(
    sources,
    long: bool = False,
    catalog: Optional["Catalog"] = None,
    client_config=None,
    **kwargs,
):
    """Print listings for local/indexed sources.

    With a single resolved source the entries are printed bare; with several,
    each listing is prefixed by a ``source:`` header and listings are
    separated by blank lines. With no sources at all, the known storage
    listings (URI plus version) are printed instead.
    """
    if catalog is None:
        from .catalog import get_catalog

        catalog = get_catalog(client_config=client_config)

    if not sources:
        listings_chain = DataChain.listings()
        for listing in listings_chain.collect("listing"):
            print(format_ls_entry(f"{listing.uri}@v{listing.version}"))
        return

    resolved = list(ls_urls(sources, catalog=catalog, long=long, **kwargs))
    if len(resolved) == 1:
        for _, entries in resolved:
            for entry in entries:
                print(format_ls_entry(entry))
        return

    for idx, (source, entries) in enumerate(resolved):
        if idx:
            # Blank line between successive directory listings.
            print()
        if source:
            print(f"{source}:")
        for entry in entries:
            print(format_ls_entry(entry))
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
def format_ls_entry(entry: str) -> str:
    """Shell-quote a listing entry, keeping a trailing slash on directories.

    An empty entry is treated like a directory and rendered as ``''/``.
    """
    is_dir_like = entry.endswith("/") or not entry
    if not is_dir_like:
        return shlex.quote(entry)
    return shlex.quote(entry[:-1]) + "/"
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
def ls_remote(
    paths: Iterable[str],
    long: bool = False,
    team: Optional[str] = None,
):
    """List paths through Studio, printing a ``path:`` header per path.

    A failed response prints an error message for that path and the path is
    skipped (its trailing newline doubles as the separator).
    """
    from datachain.node import long_line_str
    from datachain.remote.studio import StudioClient

    client = StudioClient(team=team)
    first = True
    for path, response in client.ls(paths):
        if not first:
            print()
        if not response.ok or response.data is None:
            # NOTE: ``first`` intentionally stays unchanged on this path,
            # matching the original control flow.
            print(f"{path}:\n Error: {response.message}\n")
            continue

        print(f"{path}:")
        for row in response.data:
            suffix = "/" if row["dir_type"] else ""
            if long:
                entry = long_line_str(row["name"] + suffix, row["last_modified"])
            else:
                entry = row["name"] + suffix
            print(format_ls_entry(entry))
        first = False
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
def ls(
    sources,
    long: bool = False,
    studio: bool = False,
    local: bool = False,
    all: bool = True,
    team: Optional[str] = None,
    **kwargs,
):
    """List sources locally and/or via Studio, based on flavor flags and login state."""
    token = Config().read().get("studio", {}).get("token")
    all, local, studio = _determine_flavors(studio, local, all, token)

    if local or all:
        ls_local(sources, long=long, **kwargs)

    if token and (studio or all):
        ls_remote(sources, long=long, team=team)
|
|
1019
|
-
|
|
1020
|
-
|
|
1021
|
-
def datasets(
    catalog: "Catalog",
    studio: bool = False,
    local: bool = False,
    all: bool = True,
    team: Optional[str] = None,
):
    """Print a table of datasets merged across the local catalog and Studio."""
    token = Config().read().get("studio", {}).get("token")
    all, local, studio = _determine_flavors(studio, local, all, token)

    local_datasets = set(list_datasets_local(catalog)) if all or local else set()
    studio_datasets = (
        set(list_datasets(team=team)) if (all or studio) and token else set()
    )

    # Presence columns only make sense when both sides are being shown.
    show_both = (all or (local and studio)) and token
    rows = []
    for name, version in local_datasets | studio_datasets:
        rows.append(
            _datasets_tabulate_row(
                name=name,
                version=version,
                both=show_both,
                local=(name, version) in local_datasets,
                studio=(name, version) in studio_datasets,
            )
        )

    print(tabulate(rows, headers="keys"))
|
|
1048
|
-
|
|
1049
|
-
|
|
1050
|
-
def list_datasets_local(catalog: "Catalog"):
    """Yield ``(name, version)`` for every version of every local dataset."""
    for dataset in catalog.ls_datasets():
        yield from ((dataset.name, v.version) for v in dataset.versions)
|
|
1054
|
-
|
|
1055
|
-
|
|
1056
|
-
def _datasets_tabulate_row(name, version, both, local, studio):
|
|
1057
|
-
row = {
|
|
1058
|
-
"Name": name,
|
|
1059
|
-
"Version": version,
|
|
1060
|
-
}
|
|
1061
|
-
if both:
|
|
1062
|
-
row["Studio"] = "\u2714" if studio else "\u2716"
|
|
1063
|
-
row["Local"] = "\u2714" if local else "\u2716"
|
|
1064
|
-
return row
|
|
1065
|
-
|
|
1066
|
-
|
|
1067
|
-
def rm_dataset(
    catalog: "Catalog",
    name: str,
    version: Optional[int] = None,
    force: Optional[bool] = False,
    studio: bool = False,
    local: bool = False,
    all: bool = True,
    team: Optional[str] = None,
):
    """Remove a dataset locally and/or from Studio, as selected by flavor flags."""
    token = Config().read().get("studio", {}).get("token")
    all, local, studio = _determine_flavors(studio, local, all, token)

    remove_local = all or local
    remove_remote = (all or studio) and token

    if remove_local:
        try:
            catalog.remove_dataset(name, version=version, force=force)
        except DatasetNotFoundError:
            # Not fatal: the dataset may only exist on Studio.
            print("Dataset not found in local", file=sys.stderr)

    if remove_remote:
        remove_studio_dataset(team, name, version, force)
|
|
1088
|
-
|
|
1089
|
-
|
|
1090
|
-
def edit_dataset(
    catalog: "Catalog",
    name: str,
    new_name: Optional[str] = None,
    description: Optional[str] = None,
    labels: Optional[list[str]] = None,
    studio: bool = False,
    local: bool = False,
    all: bool = True,
    team: Optional[str] = None,
):
    """Edit dataset metadata locally and/or on Studio, per the flavor flags."""
    token = Config().read().get("studio", {}).get("token")
    all, local, studio = _determine_flavors(studio, local, all, token)

    edit_local = all or local
    edit_remote = (all or studio) and token

    if edit_local:
        try:
            catalog.edit_dataset(name, new_name, description, labels)
        except DatasetNotFoundError:
            # Not fatal: the dataset may only exist on Studio.
            print("Dataset not found in local", file=sys.stderr)

    if edit_remote:
        edit_studio_dataset(team, name, new_name, description, labels)
|
|
1112
|
-
|
|
1113
|
-
|
|
1114
|
-
def dataset_stats(
    catalog: "Catalog",
    name: str,
    version: int,
    show_bytes=False,
    si=False,
):
    """Print object count and total size for a dataset version.

    Size is printed as raw bytes when ``show_bytes`` is set, otherwise in a
    human-readable form (SI units when ``si`` is set). Prints nothing when
    the catalog returns no stats.
    """
    stats = catalog.dataset_stats(name, version)
    if not stats:
        return
    print(f"Number of objects: {stats.num_objects}")
    if show_bytes:
        print(f"Total objects size: {stats.size}")
    else:
        print(f"Total objects size: {utils.sizeof_fmt(stats.size, si=si): >7}")
|
|
1129
|
-
|
|
1130
|
-
|
|
1131
|
-
def du(catalog: "Catalog", sources, show_bytes=False, si=False, **kwargs):
    """Print disk usage per path: raw bytes or human-readable size, then path."""
    for entry_path, entry_size in catalog.du(sources, **kwargs):
        size_repr = (
            f"{entry_size}"
            if show_bytes
            else f"{utils.sizeof_fmt(entry_size, si=si): >7}"
        )
        print(f"{size_repr} {entry_path}")
|
|
1137
|
-
|
|
1138
|
-
|
|
1139
|
-
def index(
    catalog: "Catalog",
    sources,
    **kwargs,
):
    """Thin CLI wrapper: index the given sources into the catalog."""
    catalog.index(sources, **kwargs)
|
|
1145
|
-
|
|
1146
|
-
|
|
1147
|
-
def show(
    catalog: "Catalog",
    name: str,
    version: Optional[int] = None,
    limit: int = 10,
    offset: int = 0,
    columns: Sequence[str] = (),
    no_collapse: bool = False,
    schema: bool = False,
) -> None:
    """Print records of a dataset version, optionally followed by its schema."""
    from datachain.lib.dc import DataChain
    from datachain.query.dataset import DatasetQuery
    from datachain.utils import show_records

    # Resolve the dataset up front so a missing name or version fails fast.
    dataset = catalog.get_dataset(name)
    dataset_version = dataset.get_version(version or dataset.latest_version)

    records = (
        DatasetQuery(name=name, version=version, catalog=catalog)
        .select(*columns)
        .limit(limit)
        .offset(offset)
        .to_db_records()
    )
    show_records(records, collapse_columns=not no_collapse)

    if schema and dataset_version.feature_schema:
        print("\nSchema:")
        session = Session.get(catalog=catalog)
        dc = DataChain.from_dataset(name=name, version=version, session=session)
        dc.print_schema()
|
|
1177
|
-
|
|
1178
|
-
|
|
1179
|
-
def query(
    catalog: "Catalog",
    script: str,
    parallel: Optional[int] = None,
    params: Optional[dict[str, str]] = None,
) -> None:
    """Run a query script, tracking it as a job in the metastore.

    The job is recorded before execution. On failure the job status is set
    to FAILED with the error message and stack, then the exception is
    re-raised; on success it is set to COMPLETE.
    """
    from datachain.data_storage import JobQueryType, JobStatus

    with open(script, encoding="utf-8") as f:
        script_content = f.read()

    if parallel is not None:
        # This also sets this environment variable for any subprocesses
        os.environ["DATACHAIN_SETTINGS_PARALLEL"] = str(parallel)

    job_id = catalog.metastore.create_job(
        name=os.path.basename(script),
        query=script_content,
        query_type=JobQueryType.PYTHON,
        python_version=f"{sys.version_info.major}.{sys.version_info.minor}",
        params=params,
    )

    try:
        catalog.query(
            script_content,
            python_executable=sys.executable,
            params=params,
            job_id=job_id,
        )
    except Exception as exc:
        catalog.metastore.set_job_status(
            job_id,
            JobStatus.FAILED,
            error_message=str(exc),
            error_stack=traceback.format_exc(),
        )
        raise
    catalog.metastore.set_job_status(job_id, JobStatus.COMPLETE)
|
|
1223
|
-
|
|
1224
|
-
|
|
1225
|
-
def clear_cache(catalog: "Catalog"):
    """Drop everything from the catalog's local file cache."""
    catalog.cache.clear()
|
|
1227
|
-
|
|
1228
|
-
|
|
1229
|
-
def garbage_collect(catalog: "Catalog"):
    """Drop the catalog's leftover temporary tables, reporting what was done."""
    temp_tables = catalog.get_temp_table_names()
    if temp_tables:
        print(f"Garbage collecting {len(temp_tables)} tables.")
        catalog.cleanup_tables(temp_tables)
    else:
        print("Nothing to clean up.")
|
|
1236
|
-
|
|
1237
|
-
|
|
1238
|
-
def completion(shell: str) -> str:
    """Return the shtab-generated completion script for the given shell."""
    return shtab.complete(get_parser(), shell=shell)
|
|
1243
|
-
|
|
1244
|
-
|
|
1245
|
-
def _determine_flavors(studio: bool, local: bool, all: bool, token: Optional[str]):
|
|
1246
|
-
if studio and not token:
|
|
1247
|
-
raise DataChainError(
|
|
1248
|
-
"Not logged in to Studio. Log in with 'datachain studio login'."
|
|
1249
|
-
)
|
|
1250
|
-
|
|
1251
|
-
if local or studio:
|
|
1252
|
-
all = False
|
|
1253
|
-
|
|
1254
|
-
all = all and not (local or studio)
|
|
1255
|
-
|
|
1256
|
-
return all, local, studio
|
|
1257
|
-
|
|
1258
|
-
|
|
1259
|
-
def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR0915
    """CLI entry point: parse arguments and dispatch to the chosen command.

    Returns a process exit code: 0 on success, 1 on error,
    141 (128 + SIGPIPE) when stdout's pipe is closed.
    """
    # Required for Windows multiprocessing support
    freeze_support()

    parser = get_parser()
    args = parser.parse_args(argv)

    # Internal UDF runner commands bypass catalog/logging setup entirely and
    # delegate to datachain.query.dispatch.
    if args.command == "internal-run-udf":
        from datachain.query.dispatch import udf_entrypoint

        return udf_entrypoint()

    if args.command == "internal-run-udf-worker":
        from datachain.query.dispatch import udf_worker_entrypoint

        return udf_worker_entrypoint()

    from .catalog import get_catalog

    logger.addHandler(logging.StreamHandler())
    logging_level = get_logging_level(args)
    logger.setLevel(logging_level)

    # Storage client options shared by every command below.
    client_config = {
        "aws_endpoint_url": args.aws_endpoint_url,
        "anon": args.anon,
    }

    if args.debug_sql:
        # This also sets this environment variable for any subprocesses
        os.environ["DEBUG_SHOW_SQL_QUERIES"] = "True"

    error = None
    try:
        catalog = get_catalog(client_config=client_config)
        if args.command == "cp":
            catalog.cp(
                args.sources,
                args.output,
                force=bool(args.force),
                update=bool(args.update),
                recursive=bool(args.recursive),
                edatachain_file=None,
                edatachain_only=False,
                no_edatachain_file=True,
                no_glob=args.no_glob,
            )
        elif args.command == "clone":
            catalog.clone(
                args.sources,
                args.output,
                force=bool(args.force),
                update=bool(args.update),
                recursive=bool(args.recursive),
                no_glob=args.no_glob,
                no_cp=args.no_cp,
                edatachain=args.edatachain,
                edatachain_file=args.edatachain_file,
            )
        # "datasets"/"ds" has its own nested sub-command dispatch.
        elif args.command in ("datasets", "ds"):
            if args.datasets_cmd == "pull":
                catalog.pull_dataset(
                    args.dataset,
                    args.output,
                    local_ds_name=args.local_name,
                    local_ds_version=args.local_version,
                    cp=args.cp,
                    force=bool(args.force),
                    edatachain=args.edatachain,
                    edatachain_file=args.edatachain_file,
                )
            elif args.datasets_cmd == "edit":
                edit_dataset(
                    catalog,
                    args.name,
                    new_name=args.new_name,
                    description=args.description,
                    labels=args.labels,
                    studio=args.studio,
                    local=args.local,
                    all=args.all,
                    team=args.team,
                )
            elif args.datasets_cmd == "ls":
                datasets(
                    catalog=catalog,
                    studio=args.studio,
                    local=args.local,
                    all=args.all,
                    team=args.team,
                )
            elif args.datasets_cmd in ("rm", "remove"):
                rm_dataset(
                    catalog,
                    args.name,
                    version=args.version,
                    force=args.force,
                    studio=args.studio,
                    local=args.local,
                    all=args.all,
                    team=args.team,
                )
            elif args.datasets_cmd == "stats":
                dataset_stats(
                    catalog,
                    args.name,
                    args.version,
                    show_bytes=args.bytes,
                    si=args.si,
                )
            else:
                raise Exception(f"Unexpected command {args.datasets_cmd}")
        elif args.command == "ls":
            ls(
                args.sources,
                long=bool(args.long),
                studio=args.studio,
                local=args.local,
                all=args.all,
                team=args.team,
                update=bool(args.update),
                client_config=client_config,
            )

        elif args.command == "show":
            show(
                catalog,
                args.name,
                args.version,
                limit=args.limit,
                offset=args.offset,
                columns=args.columns,
                no_collapse=args.no_collapse,
                schema=args.schema,
            )

        elif args.command == "du":
            du(
                catalog,
                args.sources,
                show_bytes=args.bytes,
                depth=args.depth,
                si=args.si,
                update=bool(args.update),
                client_config=client_config,
            )
        elif args.command == "find":
            results_found = False
            for result in catalog.find(
                args.sources,
                update=bool(args.update),
                names=args.name,
                inames=args.iname,
                paths=args.path,
                ipaths=args.ipath,
                size=args.size,
                typ=args.type,
                columns=args.columns,
            ):
                print(result)
                results_found = True
            if not results_found:
                print("No results")
        elif args.command == "index":
            index(
                catalog,
                args.sources,
                update=bool(args.update),
            )
        elif args.command == "completion":
            print(completion(args.shell))
        elif args.command == "query":
            query(
                catalog,
                args.script,
                parallel=args.parallel,
                params=args.param,
            )
        elif args.command == "clear-cache":
            clear_cache(catalog)
        elif args.command == "gc":
            garbage_collect(catalog)
        elif args.command == "studio":
            process_studio_cli_args(args)
        else:
            print(f"invalid command: {args.command}", file=sys.stderr)
            return 1
        return 0
    except BrokenPipeError as exc:
        # Python flushes standard streams on exit; redirect remaining output
        # to devnull to avoid another BrokenPipeError at shutdown
        # See: https://docs.python.org/3/library/signal.html#note-on-sigpipe
        error = str(exc)
        devnull = os.open(os.devnull, os.O_WRONLY)
        os.dup2(devnull, sys.stdout.fileno())
        return 141  # 128 + 13 (SIGPIPE)
    except (KeyboardInterrupt, Exception) as exc:
        error = str(exc)
        if isinstance(exc, KeyboardInterrupt):
            msg = "Operation cancelled by the user"
        else:
            msg = str(exc)
        print("Error:", msg, file=sys.stderr)
        # Full traceback only when running at DEBUG verbosity.
        if logging_level <= logging.DEBUG:
            traceback.print_exception(
                type(exc),
                exc,
                exc.__traceback__,
                file=sys.stderr,
            )
        if args.pdb:
            import pdb  # noqa: T100

            pdb.post_mortem()
        return 1
    finally:
        # Telemetry records every invocation, including failures (error set above).
        telemetry.send_cli_call(args.command, error=error)
|