datachain 0.8.10__py3-none-any.whl → 0.8.11__py3-none-any.whl

This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.

Note: this release of datachain has been flagged as potentially problematic.

datachain/cache.py CHANGED
@@ -22,15 +22,15 @@ def try_scandir(path):
         pass
 
 
-def get_temp_cache(tmp_dir: str, prefix: Optional[str] = None) -> "DataChainCache":
+def get_temp_cache(tmp_dir: str, prefix: Optional[str] = None) -> "Cache":
     cache_dir = mkdtemp(prefix=prefix, dir=tmp_dir)
-    return DataChainCache(cache_dir, tmp_dir=tmp_dir)
+    return Cache(cache_dir, tmp_dir=tmp_dir)
 
 
 @contextmanager
 def temporary_cache(
     tmp_dir: str, prefix: Optional[str] = None, delete: bool = True
-) -> Iterator["DataChainCache"]:
+) -> Iterator["Cache"]:
     cache = get_temp_cache(tmp_dir, prefix=prefix)
     try:
         yield cache

@@ -39,7 +39,7 @@ def temporary_cache(
             cache.destroy()
 
 
-class DataChainCache:
+class Cache:
     def __init__(self, cache_dir: str, tmp_dir: str):
         self.odb = LocalHashFileDB(
             LocalFileSystem(),
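The cache class is renamed from DataChainCache to Cache while the helper functions keep their names and signatures. A minimal usage sketch of the renamed API, assuming datachain is installed; the temporary directory and prefix are made up for illustration:

    from tempfile import mkdtemp

    from datachain.cache import Cache, get_temp_cache, temporary_cache

    tmp_dir = mkdtemp()  # stand-in for the catalog's tmp directory

    cache = get_temp_cache(tmp_dir, prefix="job-")
    assert isinstance(cache, Cache)  # the class previously named DataChainCache

    # temporary_cache destroys its cache on exit when delete=True (the default)
    with temporary_cache(tmp_dir, prefix="job-") as tmp_cache:
        assert isinstance(tmp_cache, Cache)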
@@ -3,7 +3,6 @@ from .catalog import (
     QUERY_SCRIPT_CANCELED_EXIT_CODE,
     QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE,
     Catalog,
-    parse_edatachain_file,
 )
 from .loader import get_catalog
 

@@ -13,5 +12,4 @@ __all__ = [
     "QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE",
     "Catalog",
     "get_catalog",
-    "parse_edatachain_file",
 ]

@@ -4,6 +4,7 @@ import logging
 import os
 import os.path
 import posixpath
+import signal
 import subprocess
 import sys
 import time

@@ -26,11 +27,10 @@ from uuid import uuid4
 
 import requests
 import sqlalchemy as sa
-import yaml
 from sqlalchemy import Column
 from tqdm.auto import tqdm
 
-from datachain.cache import DataChainCache
+from datachain.cache import Cache
 from datachain.client import Client
 from datachain.dataset import (
     DATASET_PREFIX,

@@ -57,7 +57,7 @@ from datachain.node import DirType, Node, NodeWithPath
 from datachain.nodes_thread_pool import NodesThreadPool
 from datachain.remote.studio import StudioClient
 from datachain.sql.types import DateTime, SQLType
-from datachain.utils import DataChainDir, datachain_paths_join
+from datachain.utils import DataChainDir
 
 from .datasource import DataSource
 

@@ -73,7 +73,6 @@ if TYPE_CHECKING:
 logger = logging.getLogger("datachain")
 
 DEFAULT_DATASET_DIR = "dataset"
-DATASET_FILE_SUFFIX = ".edatachain"
 
 TTL_INT = 4 * 60 * 60
 

@@ -99,6 +98,47 @@ def noop(_: str):
     pass
 
 
+class TerminationSignal(RuntimeError):  # noqa: N818
+    def __init__(self, signal):
+        self.signal = signal
+        super().__init__("Received termination signal", signal)
+
+    def __repr__(self):
+        return f"{self.__class__.__name__}({self.signal})"
+
+
+if sys.platform == "win32":
+    SIGINT = signal.CTRL_C_EVENT
+else:
+    SIGINT = signal.SIGINT
+
+
+def shutdown_process(
+    proc: subprocess.Popen,
+    interrupt_timeout: Optional[int] = None,
+    terminate_timeout: Optional[int] = None,
+) -> int:
+    """Shut down the process gracefully with SIGINT -> SIGTERM -> SIGKILL."""
+
+    logger.info("sending interrupt signal to the process %s", proc.pid)
+    proc.send_signal(SIGINT)
+
+    logger.info("waiting for the process %s to finish", proc.pid)
+    try:
+        return proc.wait(interrupt_timeout)
+    except subprocess.TimeoutExpired:
+        logger.info(
+            "timed out waiting, sending terminate signal to the process %s", proc.pid
+        )
+        proc.terminate()
+        try:
+            return proc.wait(terminate_timeout)
+        except subprocess.TimeoutExpired:
+            logger.info("timed out waiting, killing the process %s", proc.pid)
+            proc.kill()
+            return proc.wait()
+
+
 def _process_stream(stream: "IO[bytes]", callback: Callable[[str], None]) -> None:
     buffer = b""
     while byt := stream.read(1):  # Read one byte at a time
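The new shutdown_process helper escalates from an interrupt to terminate to kill, waiting up to the given timeout at each step. A hedged usage sketch; the import path and the child command are assumptions for illustration, not taken from the diff:

    import subprocess

    # Assumed module path for the helper shown in the hunk above.
    from datachain.catalog.catalog import shutdown_process

    # A long-running child process, made up purely for illustration.
    proc = subprocess.Popen(["python", "-c", "import time; time.sleep(60)"])

    # Give the child 5s to react to the interrupt, another 5s after terminate,
    # and only then kill it outright.
    return_code = shutdown_process(proc, interrupt_timeout=5, terminate_timeout=5)
    print("child exited with", return_code)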
@@ -247,7 +287,6 @@ class NodeGroup:
     # The source path within the bucket
     # (not including the bucket name or s3:// prefix)
     source_path: str = ""
-    is_edatachain: bool = False
     dataset_name: Optional[str] = None
     dataset_version: Optional[int] = None
     instantiated_nodes: Optional[list[NodeWithPath]] = None

@@ -272,55 +311,11 @@ class NodeGroup:
         self.client.fetch_nodes(self.iternodes(recursive), shared_progress_bar=pbar)
 
 
-def check_output_dataset_file(
-    output: str,
-    force: bool = False,
-    dataset_filename: Optional[str] = None,
-    skip_check_edatachain: bool = False,
-) -> str:
-    """
-    Checks the dataset filename for existence or if it should be force-overwritten.
-    """
-    dataset_file = (
-        dataset_filename if dataset_filename else output + DATASET_FILE_SUFFIX
-    )
-    if not skip_check_edatachain and os.path.exists(dataset_file):
-        if force:
-            os.remove(dataset_file)
-        else:
-            raise RuntimeError(f"Output dataset file already exists: {dataset_file}")
-    return dataset_file
-
-
-def parse_edatachain_file(filename: str) -> list[dict[str, Any]]:
-    with open(filename, encoding="utf-8") as f:
-        contents = yaml.safe_load(f)
-
-    if not isinstance(contents, list):
-        contents = [contents]
-
-    for entry in contents:
-        if not isinstance(entry, dict):
-            raise TypeError(
-                "Failed parsing EDataChain file, "
-                "each data source entry must be a dictionary"
-            )
-        if "data-source" not in entry or "files" not in entry:
-            raise ValueError(
-                "Failed parsing EDataChain file, "
-                "each data source entry must contain the "
-                '"data-source" and "files" keys'
-            )
-
-    return contents
-
-
 def prepare_output_for_cp(
     node_groups: list[NodeGroup],
     output: str,
     force: bool = False,
-    edatachain_only: bool = False,
-    no_edatachain_file: bool = False,
+    no_cp: bool = False,
 ) -> tuple[bool, Optional[str]]:
     total_node_count = 0
     for node_group in node_groups:

@@ -333,7 +328,7 @@ def prepare_output_for_cp(
     always_copy_dir_contents = False
     copy_to_filename = None
 
-    if edatachain_only:
+    if no_cp:
         return always_copy_dir_contents, copy_to_filename
 
     if not os.path.isdir(output):

@@ -358,10 +353,6 @@ def prepare_output_for_cp(
             copy_to_filename = output
         else:
             raise FileNotFoundError(f"Is not a directory: {output}")
-
-    if copy_to_filename and not no_edatachain_file:
-        raise RuntimeError("File to file cp not supported with .edatachain files!")
-
     return always_copy_dir_contents, copy_to_filename
 
 

@@ -465,8 +456,6 @@ def instantiate_node_groups(
             copy_to_filename,
             recursive,
             copy_dir_contents,
-            source_path,
-            node_group.is_edatachain,
             node_group.is_dataset,
         )
         if not virtual_only:

@@ -484,24 +473,6 @@ def instantiate_node_groups(
     instantiate_progress_bar.close()
 
 
-def compute_metafile_data(node_groups) -> list[dict[str, Any]]:
-    metafile_data = []
-    for node_group in node_groups:
-        if not node_group.sources:
-            continue
-        listing: Listing = node_group.listing
-        metafile_group = {"data-source": {"uri": listing.uri}, "files": []}
-        for node in node_group.instantiated_nodes:
-            if not node.n.is_dir:
-                metafile_group["files"].append(  # type: ignore [attr-defined]
-                    node.get_metafile_data()
-                )
-        if metafile_group["files"]:
-            metafile_data.append(metafile_group)
-
-    return metafile_data
-
-
 def find_column_to_str(  # noqa: PLR0911
     row: tuple[Any, ...], field_lookup: dict[str, int], src: DataSource, column: str
 ) -> str:

@@ -536,7 +507,7 @@ def find_column_to_str(  # noqa: PLR0911
     return ""
 
 
-def clone_catalog_with_cache(catalog: "Catalog", cache: "DataChainCache") -> "Catalog":
+def clone_catalog_with_cache(catalog: "Catalog", cache: "Cache") -> "Catalog":
     clone = catalog.copy()
     clone.cache = cache
     return clone

@@ -559,7 +530,7 @@ class Catalog:
         datachain_dir.init()
         self.metastore = metastore
         self._warehouse = warehouse
-        self.cache = DataChainCache(datachain_dir.cache, datachain_dir.tmp)
+        self.cache = Cache(datachain_dir.cache, datachain_dir.tmp)
         self.client_config = client_config if client_config is not None else {}
         self._init_params = {
             "cache_dir": cache_dir,

@@ -703,22 +674,8 @@ class Catalog:
         enlisted_sources: list[tuple[bool, bool, Any]] = []
         client_config = client_config or self.client_config
         for src in sources:  # Opt: parallel
-            if src.endswith(DATASET_FILE_SUFFIX) and os.path.isfile(src):
-                # TODO: Also allow using EDataChain files from cloud locations?
-                edatachain_data = parse_edatachain_file(src)
-                indexed_sources = []
-                for ds in edatachain_data:
-                    listing, _, source_path = self.enlist_source(
-                        ds["data-source"]["uri"],
-                        update,
-                        client_config=client_config,
-                    )
-                    paths = datachain_paths_join(
-                        source_path, (f["name"] for f in ds["files"])
-                    )
-                    indexed_sources.append((listing, source_path, paths))
-                enlisted_sources.append((True, False, indexed_sources))
-            elif src.startswith("ds://"):
+            listing: Optional[Listing]
+            if src.startswith("ds://"):
                 ds_name, ds_version = parse_dataset_uri(src)
                 dataset = self.get_dataset(ds_name)
                 if not ds_version:

@@ -796,7 +753,6 @@ class Catalog:
                         listing.client,
                         dsrc,
                         source_path,
-                        is_edatachain=True,
                     )
                 )
             else:

@@ -1360,8 +1316,6 @@ class Catalog:
         local_ds_version: Optional[int] = None,
         cp: bool = False,
         force: bool = False,
-        edatachain: bool = False,
-        edatachain_file: Optional[str] = None,
         *,
         client_config=None,
     ) -> None:

@@ -1373,8 +1327,6 @@ class Catalog:
                 [ds_uri],
                 output,
                 force=force,
-                no_edatachain_file=not edatachain,
-                edatachain_file=edatachain_file,
                 client_config=client_config,
             )
             print(f"Dataset {ds_uri} instantiated locally to {output}")

@@ -1541,8 +1493,6 @@ class Catalog:
         recursive: bool = False,
         no_glob: bool = False,
         no_cp: bool = False,
-        edatachain: bool = False,
-        edatachain_file: Optional[str] = None,
         *,
         client_config=None,
     ) -> None:

@@ -1551,9 +1501,8 @@ class Catalog:
         them into the dataset folder.
         It also adds those files to a dataset in database, which is
         created if doesn't exist yet
-        Optionally, it creates a .edatachain file
         """
-        if not no_cp or edatachain:
+        if not no_cp:
             self.cp(
                 sources,
                 output,

@@ -1561,9 +1510,7 @@ class Catalog:
                 update=update,
                 recursive=recursive,
                 no_glob=no_glob,
-                edatachain_only=no_cp,
-                no_edatachain_file=not edatachain,
-                edatachain_file=edatachain_file,
+                no_cp=no_cp,
                 client_config=client_config,
             )
         else:

@@ -1588,6 +1535,8 @@ class Catalog:
         output_hook: Callable[[str], None] = noop,
         params: Optional[dict[str, str]] = None,
         job_id: Optional[str] = None,
+        interrupt_timeout: Optional[int] = None,
+        terminate_timeout: Optional[int] = None,
     ) -> None:
         cmd = [python_executable, "-c", query_script]
         env = dict(env or os.environ)

@@ -1601,13 +1550,48 @@ class Catalog:
         if capture_output:
             popen_kwargs = {"stdout": subprocess.PIPE, "stderr": subprocess.STDOUT}
 
+        def raise_termination_signal(sig: int, _: Any) -> NoReturn:
+            raise TerminationSignal(sig)
+
+        thread: Optional[Thread] = None
         with subprocess.Popen(cmd, env=env, **popen_kwargs) as proc:  # noqa: S603
-            if capture_output:
-                args = (proc.stdout, output_hook)
-                thread = Thread(target=_process_stream, args=args, daemon=True)
-                thread.start()
-                thread.join()  # wait for the reader thread
+            logger.info("Starting process %s", proc.pid)
+
+            orig_sigint_handler = signal.getsignal(signal.SIGINT)
+            # ignore SIGINT in the main process.
+            # In the terminal, SIGINTs are received by all the processes in
+            # the foreground process group, so the script will receive the signal too.
+            # (If we forward the signal to the child, it will receive it twice.)
+            signal.signal(signal.SIGINT, signal.SIG_IGN)
 
+            orig_sigterm_handler = signal.getsignal(signal.SIGTERM)
+            signal.signal(signal.SIGTERM, raise_termination_signal)
+            try:
+                if capture_output:
+                    args = (proc.stdout, output_hook)
+                    thread = Thread(target=_process_stream, args=args, daemon=True)
+                    thread.start()
+
+                proc.wait()
+            except TerminationSignal as exc:
+                signal.signal(signal.SIGTERM, orig_sigterm_handler)
+                signal.signal(signal.SIGINT, orig_sigint_handler)
+                logging.info("Shutting down process %s, received %r", proc.pid, exc)
+                # Rather than forwarding the signal to the child, we try to shut it down
+                # gracefully. This is because we consider the script to be interactive
+                # and special, so we give it time to cleanup before exiting.
+                shutdown_process(proc, interrupt_timeout, terminate_timeout)
+                if proc.returncode:
+                    raise QueryScriptCancelError(
+                        "Query script was canceled by user", return_code=proc.returncode
+                    ) from exc
+            finally:
+                signal.signal(signal.SIGTERM, orig_sigterm_handler)
+                signal.signal(signal.SIGINT, orig_sigint_handler)
+                if thread:
+                    thread.join()  # wait for the reader thread
+
+        logging.info("Process %s exited with return code %s", proc.pid, proc.returncode)
         if proc.returncode == QUERY_SCRIPT_CANCELED_EXIT_CODE:
             raise QueryScriptCancelError(
                 "Query script was canceled by user",
@@ -1626,17 +1610,14 @@ class Catalog:
         force: bool = False,
         update: bool = False,
         recursive: bool = False,
-        edatachain_file: Optional[str] = None,
-        edatachain_only: bool = False,
-        no_edatachain_file: bool = False,
+        no_cp: bool = False,
         no_glob: bool = False,
         *,
-        client_config=None,
-    ) -> list[dict[str, Any]]:
+        client_config: Optional["dict"] = None,
+    ) -> None:
         """
         This function copies files from cloud sources to local destination directory
         If cloud source is not indexed, or has expired index, it runs indexing
-        It also creates .edatachain file by default, if not specified differently
         """
         client_config = client_config or self.client_config
         node_groups = self.enlist_sources_grouped(

@@ -1647,17 +1628,11 @@ class Catalog:
         )
 
         always_copy_dir_contents, copy_to_filename = prepare_output_for_cp(
-            node_groups, output, force, edatachain_only, no_edatachain_file
+            node_groups, output, force, no_cp
         )
-        dataset_file = check_output_dataset_file(
-            output, force, edatachain_file, no_edatachain_file
-        )
-
         total_size, total_files = collect_nodes_for_cp(node_groups, recursive)
-
-        if total_files == 0:
-            # Nothing selected to cp
-            return []
+        if not total_files:
+            return
 
         desc_max_len = max(len(output) + 16, 19)
         bar_format = (

@@ -1667,7 +1642,7 @@ class Catalog:
             "[{elapsed}<{remaining}, {rate_fmt:>8}]"
         )
 
-        if not edatachain_only:
+        if not no_cp:
             with get_download_bar(bar_format, total_size) as pbar:
                 for node_group in node_groups:
                     node_group.download(recursive=recursive, pbar=pbar)

@@ -1679,21 +1654,10 @@ class Catalog:
             total_files,
             force,
             recursive,
-            edatachain_only,
+            no_cp,
             always_copy_dir_contents,
             copy_to_filename,
         )
-        if no_edatachain_file:
-            return []
-
-        metafile_data = compute_metafile_data(node_groups)
-        if metafile_data:
-            # Don't write the metafile if nothing was copied
-            print(f"Creating '{dataset_file}'")
-            with open(dataset_file, "w", encoding="utf-8") as fd:
-                yaml.dump(metafile_data, fd, sort_keys=False)
-
-        return metafile_data
 
     def du(
         self,
datachain/cli/__init__.py CHANGED
@@ -47,9 +47,13 @@ def main(argv: Optional[list[str]] = None) -> int:
     logging_level = get_logging_level(args)
     logger.setLevel(logging_level)
 
-    client_config = {
-        "anon": args.anon,
-    }
+    client_config = (
+        {
+            "anon": args.anon,
+        }
+        if getattr(args, "anon", False)
+        else {}
+    )
 
     if args.debug_sql:
         # This also sets this environment variable for any subprocesses

@@ -107,9 +111,6 @@ def handle_cp_command(args, catalog):
         force=bool(args.force),
         update=bool(args.update),
         recursive=bool(args.recursive),
-        edatachain_file=None,
-        edatachain_only=False,
-        no_edatachain_file=True,
         no_glob=args.no_glob,
     )
 

@@ -8,7 +8,14 @@ from datachain.cli.utils import BooleanOptionalAction, KeyValueArgs
 
 from .job import add_jobs_parser
 from .studio import add_auth_parser
-from .utils import FIND_COLUMNS, add_show_args, add_sources_arg, find_columns_type
+from .utils import (
+    FIND_COLUMNS,
+    add_anon_arg,
+    add_show_args,
+    add_sources_arg,
+    add_update_arg,
+    find_columns_type,
+)
 
 
 def get_parser() -> ArgumentParser:  # noqa: PLR0915

@@ -32,19 +39,6 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         "-q", "--quiet", action="count", default=0, help="Be quiet"
     )
 
-    parent_parser.add_argument(
-        "--anon",
-        action="store_true",
-        help="Use anonymous access to storage",
-    )
-    parent_parser.add_argument(
-        "-u",
-        "--update",
-        action="count",
-        default=0,
-        help="Update cached list of files for the sources",
-    )
-
     parent_parser.add_argument(
         "--debug-sql",
         action="store_true",

@@ -92,6 +86,8 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         action="store_true",
         help="Do not expand globs (such as * or ?)",
     )
+    add_anon_arg(parse_cp)
+    add_update_arg(parse_cp)
 
     parse_clone = subp.add_parser(
         "clone", parents=[parent_parser], description="Copy data files from the cloud."

@@ -127,6 +123,8 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         action="store_true",
         help="Do not copy files, just create a dataset",
     )
+    add_anon_arg(parse_clone)
+    add_update_arg(parse_clone)
 
     add_auth_parser(subp, parent_parser)
     add_jobs_parser(subp, parent_parser)

@@ -137,6 +135,7 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         parents=[parent_parser],
         description="Commands for managing datasets.",
     )
+    add_anon_arg(datasets_parser)
     datasets_subparser = datasets_parser.add_subparsers(
         dest="datasets_cmd",
         help="Use `datachain dataset CMD --help` to display command-specific help",

@@ -336,6 +335,8 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     parse_ls = subp.add_parser(
         "ls", parents=[parent_parser], description="List storage contents."
     )
+    add_anon_arg(parse_ls)
+    add_update_arg(parse_ls)
     add_sources_arg(parse_ls, nargs="*")
     parse_ls.add_argument(
         "-l",

@@ -375,6 +376,8 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         "du", parents=[parent_parser], description="Display space usage."
     )
     add_sources_arg(parse_du)
+    add_anon_arg(parse_du)
+    add_update_arg(parse_du)
     parse_du.add_argument(
         "-b",
         "--bytes",

@@ -404,6 +407,8 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     parse_find = subp.add_parser(
         "find", parents=[parent_parser], description="Search in a directory hierarchy."
     )
+    add_anon_arg(parse_find)
+    add_update_arg(parse_find)
     add_sources_arg(parse_find)
     parse_find.add_argument(
         "--name",

@@ -457,6 +462,8 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     parse_index = subp.add_parser(
         "index", parents=[parent_parser], description="Index storage location."
     )
+    add_anon_arg(parse_index)
+    add_update_arg(parse_index)
     add_sources_arg(parse_index)
 
     show_parser = subp.add_parser(

@@ -480,6 +487,7 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         parents=[parent_parser],
         description="Create a new dataset with a query script.",
     )
+    add_anon_arg(query_parser)
     query_parser.add_argument(
         "script", metavar="<script.py>", type=str, help="Filepath for script"
     )

@@ -504,14 +512,17 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         help="Query parameters",
     )
 
-    subp.add_parser(
+    parse_clear_cache = subp.add_parser(
         "clear-cache",
         parents=[parent_parser],
         description="Clear the local file cache.",
     )
-    subp.add_parser(
+    add_anon_arg(parse_clear_cache)
+
+    parse_gc = subp.add_parser(
         "gc", parents=[parent_parser], description="Garbage collect temporary tables."
     )
+    add_anon_arg(parse_gc)
 
     subp.add_parser("internal-run-udf", parents=[parent_parser])
     subp.add_parser("internal-run-udf-worker", parents=[parent_parser])

@@ -1,9 +1,8 @@
 def add_auth_parser(subparsers, parent_parser) -> None:
+    from dvc_studio_client.auth import AVAILABLE_SCOPES
+
     auth_help = "Manage Studio authentication"
-    auth_description = (
-        "Manage authentication and settings for Studio. "
-        "Configure tokens for sharing datasets and using Studio features."
-    )
+    auth_description = "Manage authentication and settings for Studio. "
 
     auth_parser = subparsers.add_parser(
         "auth",

@@ -19,8 +18,10 @@ def add_auth_parser(subparsers, parent_parser) -> None:
     auth_login_help = "Authenticate with Studio"
     auth_login_description = (
         "Authenticate with Studio using default scopes. "
-        "A random name will be assigned as the token name if not specified."
+        "A random name will be assigned if the token name is not specified."
     )
+
+    allowed_scopes = ", ".join(AVAILABLE_SCOPES)
     login_parser = auth_subparser.add_parser(
         "login",
         parents=[parent_parser],

@@ -40,7 +41,7 @@ def add_auth_parser(subparsers, parent_parser) -> None:
         "--scopes",
         action="store",
         default=None,
-        help="Authentication token scopes",
+        help=f"Authentication token scopes. Allowed scopes: {allowed_scopes}",
     )
 
     login_parser.add_argument(

@@ -34,6 +34,24 @@ def add_sources_arg(parser: ArgumentParser, nargs: Union[str, int] = "+") -> Act
     )
 
 
+def add_anon_arg(parser: ArgumentParser) -> None:
+    parser.add_argument(
+        "--anon",
+        action="store_true",
+        help="Use anonymous access to storage",
+    )
+
+
+def add_update_arg(parser: ArgumentParser) -> None:
+    parser.add_argument(
+        "-u",
+        "--update",
+        action="count",
+        default=0,
+        help="Update cached list of files for the sources",
+    )
+
+
 def add_show_args(parser: ArgumentParser) -> None:
     parser.add_argument(
         "--limit",