datachain 0.8.9__py3-none-any.whl → 0.8.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datachain might be problematic.
- datachain/cache.py +4 -4
- datachain/catalog/__init__.py +0 -2
- datachain/catalog/catalog.py +102 -138
- datachain/cli/__init__.py +9 -9
- datachain/cli/parser/__init__.py +36 -20
- datachain/cli/parser/job.py +1 -1
- datachain/cli/parser/studio.py +35 -34
- datachain/cli/parser/utils.py +19 -1
- datachain/cli/utils.py +1 -1
- datachain/client/fsspec.py +11 -8
- datachain/client/local.py +4 -4
- datachain/data_storage/schema.py +1 -1
- datachain/data_storage/sqlite.py +38 -7
- datachain/data_storage/warehouse.py +2 -2
- datachain/dataset.py +1 -1
- datachain/error.py +12 -0
- datachain/func/__init__.py +2 -1
- datachain/func/conditional.py +67 -23
- datachain/func/func.py +17 -5
- datachain/lib/convert/python_to_sql.py +15 -3
- datachain/lib/dc.py +27 -5
- datachain/lib/file.py +16 -0
- datachain/lib/listing.py +30 -12
- datachain/lib/pytorch.py +1 -1
- datachain/lib/udf.py +1 -1
- datachain/listing.py +1 -13
- datachain/node.py +0 -15
- datachain/nodes_fetcher.py +2 -2
- datachain/query/dataset.py +8 -4
- datachain/remote/studio.py +3 -3
- datachain/sql/sqlite/base.py +35 -14
- datachain/studio.py +8 -8
- {datachain-0.8.9.dist-info → datachain-0.8.11.dist-info}/METADATA +3 -7
- {datachain-0.8.9.dist-info → datachain-0.8.11.dist-info}/RECORD +38 -38
- {datachain-0.8.9.dist-info → datachain-0.8.11.dist-info}/LICENSE +0 -0
- {datachain-0.8.9.dist-info → datachain-0.8.11.dist-info}/WHEEL +0 -0
- {datachain-0.8.9.dist-info → datachain-0.8.11.dist-info}/entry_points.txt +0 -0
- {datachain-0.8.9.dist-info → datachain-0.8.11.dist-info}/top_level.txt +0 -0
datachain/cache.py
CHANGED
@@ -22,15 +22,15 @@ def try_scandir(path):
         pass
 
 
-def get_temp_cache(tmp_dir: str, prefix: Optional[str] = None) -> "
+def get_temp_cache(tmp_dir: str, prefix: Optional[str] = None) -> "Cache":
     cache_dir = mkdtemp(prefix=prefix, dir=tmp_dir)
-    return
+    return Cache(cache_dir, tmp_dir=tmp_dir)
 
 
 @contextmanager
 def temporary_cache(
     tmp_dir: str, prefix: Optional[str] = None, delete: bool = True
-) -> Iterator["
+) -> Iterator["Cache"]:
     cache = get_temp_cache(tmp_dir, prefix=prefix)
     try:
         yield cache
@@ -39,7 +39,7 @@ def temporary_cache(
         cache.destroy()
 
 
-class
+class Cache:
     def __init__(self, cache_dir: str, tmp_dir: str):
         self.odb = LocalHashFileDB(
             LocalFileSystem(),
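A minimal usage sketch of the renamed cache helpers shown above; the scratch directory and the "demo-" prefix are illustrative only:

import tempfile

from datachain.cache import get_temp_cache, temporary_cache

tmp_dir = tempfile.mkdtemp()  # illustrative scratch directory

# Standalone temporary cache, cleaned up explicitly:
cache = get_temp_cache(tmp_dir, prefix="demo-")
cache.destroy()

# Or scoped via the context manager, which destroys the cache on exit:
with temporary_cache(tmp_dir, prefix="demo-") as cache:
    ...  # use the cache (e.g. cache.odb) while it exists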
datachain/catalog/__init__.py
CHANGED
@@ -3,7 +3,6 @@ from .catalog import (
     QUERY_SCRIPT_CANCELED_EXIT_CODE,
     QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE,
     Catalog,
-    parse_edatachain_file,
 )
 from .loader import get_catalog
 
@@ -13,5 +12,4 @@ __all__ = [
     "QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE",
     "Catalog",
     "get_catalog",
-    "parse_edatachain_file",
 ]
datachain/catalog/catalog.py
CHANGED
@@ -4,6 +4,7 @@ import logging
 import os
 import os.path
 import posixpath
+import signal
 import subprocess
 import sys
 import time
@@ -26,11 +27,10 @@ from uuid import uuid4
 
 import requests
 import sqlalchemy as sa
-import yaml
 from sqlalchemy import Column
 from tqdm.auto import tqdm
 
-from datachain.cache import
+from datachain.cache import Cache
 from datachain.client import Client
 from datachain.dataset import (
     DATASET_PREFIX,
@@ -57,7 +57,7 @@ from datachain.node import DirType, Node, NodeWithPath
 from datachain.nodes_thread_pool import NodesThreadPool
 from datachain.remote.studio import StudioClient
 from datachain.sql.types import DateTime, SQLType
-from datachain.utils import DataChainDir
+from datachain.utils import DataChainDir
 
 from .datasource import DataSource
 
@@ -73,7 +73,6 @@ if TYPE_CHECKING:
 logger = logging.getLogger("datachain")
 
 DEFAULT_DATASET_DIR = "dataset"
-DATASET_FILE_SUFFIX = ".edatachain"
 
 TTL_INT = 4 * 60 * 60
 
@@ -99,6 +98,47 @@ def noop(_: str):
     pass
 
 
+class TerminationSignal(RuntimeError):  # noqa: N818
+    def __init__(self, signal):
+        self.signal = signal
+        super().__init__("Received termination signal", signal)
+
+    def __repr__(self):
+        return f"{self.__class__.__name__}({self.signal})"
+
+
+if sys.platform == "win32":
+    SIGINT = signal.CTRL_C_EVENT
+else:
+    SIGINT = signal.SIGINT
+
+
+def shutdown_process(
+    proc: subprocess.Popen,
+    interrupt_timeout: Optional[int] = None,
+    terminate_timeout: Optional[int] = None,
+) -> int:
+    """Shut down the process gracefully with SIGINT -> SIGTERM -> SIGKILL."""
+
+    logger.info("sending interrupt signal to the process %s", proc.pid)
+    proc.send_signal(SIGINT)
+
+    logger.info("waiting for the process %s to finish", proc.pid)
+    try:
+        return proc.wait(interrupt_timeout)
+    except subprocess.TimeoutExpired:
+        logger.info(
+            "timed out waiting, sending terminate signal to the process %s", proc.pid
+        )
+        proc.terminate()
+        try:
+            return proc.wait(terminate_timeout)
+        except subprocess.TimeoutExpired:
+            logger.info("timed out waiting, killing the process %s", proc.pid)
+            proc.kill()
+            return proc.wait()
+
+
 def _process_stream(stream: "IO[bytes]", callback: Callable[[str], None]) -> None:
     buffer = b""
     while byt := stream.read(1):  # Read one byte at a time
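The new shutdown_process helper escalates SIGINT -> SIGTERM -> SIGKILL, waiting between steps. A standalone sketch of the same escalation applied to a throwaway child process; the sleeping child and the 5-second timeouts are illustrative, not datachain defaults:

import signal
import subprocess
import sys

SIGINT = signal.CTRL_C_EVENT if sys.platform == "win32" else signal.SIGINT

def shutdown(proc: subprocess.Popen, interrupt_timeout=5, terminate_timeout=5) -> int:
    proc.send_signal(SIGINT)          # 1) ask politely with SIGINT / CTRL_C_EVENT
    try:
        return proc.wait(interrupt_timeout)
    except subprocess.TimeoutExpired:
        proc.terminate()              # 2) escalate to SIGTERM
        try:
            return proc.wait(terminate_timeout)
        except subprocess.TimeoutExpired:
            proc.kill()               # 3) last resort: SIGKILL
            return proc.wait()

child = subprocess.Popen([sys.executable, "-c", "import time; time.sleep(60)"])
print("child exited with", shutdown(child))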
@@ -247,7 +287,6 @@ class NodeGroup:
     # The source path within the bucket
     # (not including the bucket name or s3:// prefix)
     source_path: str = ""
-    is_edatachain: bool = False
     dataset_name: Optional[str] = None
     dataset_version: Optional[int] = None
     instantiated_nodes: Optional[list[NodeWithPath]] = None
@@ -272,55 +311,11 @@ class NodeGroup:
         self.client.fetch_nodes(self.iternodes(recursive), shared_progress_bar=pbar)
 
 
-def check_output_dataset_file(
-    output: str,
-    force: bool = False,
-    dataset_filename: Optional[str] = None,
-    skip_check_edatachain: bool = False,
-) -> str:
-    """
-    Checks the dataset filename for existence or if it should be force-overwritten.
-    """
-    dataset_file = (
-        dataset_filename if dataset_filename else output + DATASET_FILE_SUFFIX
-    )
-    if not skip_check_edatachain and os.path.exists(dataset_file):
-        if force:
-            os.remove(dataset_file)
-        else:
-            raise RuntimeError(f"Output dataset file already exists: {dataset_file}")
-    return dataset_file
-
-
-def parse_edatachain_file(filename: str) -> list[dict[str, Any]]:
-    with open(filename, encoding="utf-8") as f:
-        contents = yaml.safe_load(f)
-
-    if not isinstance(contents, list):
-        contents = [contents]
-
-    for entry in contents:
-        if not isinstance(entry, dict):
-            raise TypeError(
-                "Failed parsing EDataChain file, "
-                "each data source entry must be a dictionary"
-            )
-        if "data-source" not in entry or "files" not in entry:
-            raise ValueError(
-                "Failed parsing EDataChain file, "
-                "each data source entry must contain the "
-                '"data-source" and "files" keys'
-            )
-
-    return contents
-
-
 def prepare_output_for_cp(
     node_groups: list[NodeGroup],
     output: str,
     force: bool = False,
-
-    no_edatachain_file: bool = False,
+    no_cp: bool = False,
 ) -> tuple[bool, Optional[str]]:
     total_node_count = 0
     for node_group in node_groups:
@@ -333,7 +328,7 @@ def prepare_output_for_cp(
     always_copy_dir_contents = False
     copy_to_filename = None
 
-    if
+    if no_cp:
         return always_copy_dir_contents, copy_to_filename
 
     if not os.path.isdir(output):
@@ -358,10 +353,6 @@ def prepare_output_for_cp(
         copy_to_filename = output
     else:
         raise FileNotFoundError(f"Is not a directory: {output}")
-
-    if copy_to_filename and not no_edatachain_file:
-        raise RuntimeError("File to file cp not supported with .edatachain files!")
-
     return always_copy_dir_contents, copy_to_filename
 
 
@@ -465,8 +456,6 @@ def instantiate_node_groups(
             copy_to_filename,
             recursive,
             copy_dir_contents,
-            source_path,
-            node_group.is_edatachain,
            node_group.is_dataset,
         )
         if not virtual_only:
@@ -484,24 +473,6 @@ def instantiate_node_groups(
     instantiate_progress_bar.close()
 
 
-def compute_metafile_data(node_groups) -> list[dict[str, Any]]:
-    metafile_data = []
-    for node_group in node_groups:
-        if not node_group.sources:
-            continue
-        listing: Listing = node_group.listing
-        metafile_group = {"data-source": {"uri": listing.uri}, "files": []}
-        for node in node_group.instantiated_nodes:
-            if not node.n.is_dir:
-                metafile_group["files"].append(  # type: ignore [attr-defined]
-                    node.get_metafile_data()
-                )
-        if metafile_group["files"]:
-            metafile_data.append(metafile_group)
-
-    return metafile_data
-
-
 def find_column_to_str(  # noqa: PLR0911
     row: tuple[Any, ...], field_lookup: dict[str, int], src: DataSource, column: str
 ) -> str:
@@ -536,7 +507,7 @@ def find_column_to_str(  # noqa: PLR0911
     return ""
 
 
-def clone_catalog_with_cache(catalog: "Catalog", cache: "
+def clone_catalog_with_cache(catalog: "Catalog", cache: "Cache") -> "Catalog":
     clone = catalog.copy()
     clone.cache = cache
     return clone
@@ -559,7 +530,7 @@ class Catalog:
         datachain_dir.init()
         self.metastore = metastore
         self._warehouse = warehouse
-        self.cache =
+        self.cache = Cache(datachain_dir.cache, datachain_dir.tmp)
         self.client_config = client_config if client_config is not None else {}
         self._init_params = {
             "cache_dir": cache_dir,
@@ -703,22 +674,8 @@ class Catalog:
         enlisted_sources: list[tuple[bool, bool, Any]] = []
         client_config = client_config or self.client_config
         for src in sources:  # Opt: parallel
-
-
-                edatachain_data = parse_edatachain_file(src)
-                indexed_sources = []
-                for ds in edatachain_data:
-                    listing, _, source_path = self.enlist_source(
-                        ds["data-source"]["uri"],
-                        update,
-                        client_config=client_config,
-                    )
-                    paths = datachain_paths_join(
-                        source_path, (f["name"] for f in ds["files"])
-                    )
-                    indexed_sources.append((listing, source_path, paths))
-                enlisted_sources.append((True, False, indexed_sources))
-            elif src.startswith("ds://"):
+            listing: Optional[Listing]
+            if src.startswith("ds://"):
                 ds_name, ds_version = parse_dataset_uri(src)
                 dataset = self.get_dataset(ds_name)
                 if not ds_version:
@@ -796,7 +753,6 @@ class Catalog:
                     listing.client,
                     dsrc,
                     source_path,
-                    is_edatachain=True,
                 )
             )
         else:
@@ -1360,8 +1316,6 @@ class Catalog:
         local_ds_version: Optional[int] = None,
         cp: bool = False,
         force: bool = False,
-        edatachain: bool = False,
-        edatachain_file: Optional[str] = None,
         *,
         client_config=None,
     ) -> None:
@@ -1373,8 +1327,6 @@ class Catalog:
                 [ds_uri],
                 output,
                 force=force,
-                no_edatachain_file=not edatachain,
-                edatachain_file=edatachain_file,
                 client_config=client_config,
             )
             print(f"Dataset {ds_uri} instantiated locally to {output}")
@@ -1541,8 +1493,6 @@ class Catalog:
         recursive: bool = False,
         no_glob: bool = False,
         no_cp: bool = False,
-        edatachain: bool = False,
-        edatachain_file: Optional[str] = None,
         *,
         client_config=None,
     ) -> None:
@@ -1551,9 +1501,8 @@ class Catalog:
         them into the dataset folder.
         It also adds those files to a dataset in database, which is
         created if doesn't exist yet
-        Optionally, it creates a .edatachain file
         """
-        if not no_cp
+        if not no_cp:
             self.cp(
                 sources,
                 output,
@@ -1561,9 +1510,7 @@ class Catalog:
                 update=update,
                 recursive=recursive,
                 no_glob=no_glob,
-
-                no_edatachain_file=not edatachain,
-                edatachain_file=edatachain_file,
+                no_cp=no_cp,
                 client_config=client_config,
             )
         else:
@@ -1588,6 +1535,8 @@ class Catalog:
         output_hook: Callable[[str], None] = noop,
         params: Optional[dict[str, str]] = None,
         job_id: Optional[str] = None,
+        interrupt_timeout: Optional[int] = None,
+        terminate_timeout: Optional[int] = None,
     ) -> None:
         cmd = [python_executable, "-c", query_script]
         env = dict(env or os.environ)
@@ -1601,13 +1550,48 @@ class Catalog:
         if capture_output:
             popen_kwargs = {"stdout": subprocess.PIPE, "stderr": subprocess.STDOUT}
 
+        def raise_termination_signal(sig: int, _: Any) -> NoReturn:
+            raise TerminationSignal(sig)
+
+        thread: Optional[Thread] = None
         with subprocess.Popen(cmd, env=env, **popen_kwargs) as proc:  # noqa: S603
-
-
-
-
-
+            logger.info("Starting process %s", proc.pid)
+
+            orig_sigint_handler = signal.getsignal(signal.SIGINT)
+            # ignore SIGINT in the main process.
+            # In the terminal, SIGINTs are received by all the processes in
+            # the foreground process group, so the script will receive the signal too.
+            # (If we forward the signal to the child, it will receive it twice.)
+            signal.signal(signal.SIGINT, signal.SIG_IGN)
 
+            orig_sigterm_handler = signal.getsignal(signal.SIGTERM)
+            signal.signal(signal.SIGTERM, raise_termination_signal)
+            try:
+                if capture_output:
+                    args = (proc.stdout, output_hook)
+                    thread = Thread(target=_process_stream, args=args, daemon=True)
+                    thread.start()
+
+                proc.wait()
+            except TerminationSignal as exc:
+                signal.signal(signal.SIGTERM, orig_sigterm_handler)
+                signal.signal(signal.SIGINT, orig_sigint_handler)
+                logging.info("Shutting down process %s, received %r", proc.pid, exc)
+                # Rather than forwarding the signal to the child, we try to shut it down
+                # gracefully. This is because we consider the script to be interactive
+                # and special, so we give it time to cleanup before exiting.
+                shutdown_process(proc, interrupt_timeout, terminate_timeout)
+                if proc.returncode:
+                    raise QueryScriptCancelError(
+                        "Query script was canceled by user", return_code=proc.returncode
+                    ) from exc
+            finally:
+                signal.signal(signal.SIGTERM, orig_sigterm_handler)
+                signal.signal(signal.SIGINT, orig_sigint_handler)
+                if thread:
+                    thread.join()  # wait for the reader thread
+
+        logging.info("Process %s exited with return code %s", proc.pid, proc.returncode)
         if proc.returncode == QUERY_SCRIPT_CANCELED_EXIT_CODE:
             raise QueryScriptCancelError(
                 "Query script was canceled by user",
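The run_query changes above follow a save/swap/restore pattern: the parent ignores SIGINT (a terminal Ctrl+C already reaches the child through the foreground process group), turns SIGTERM into a TerminationSignal, and restores both handlers in finally. A stripped-down sketch of just the SIGINT part; the sleeping child is illustrative:

import signal
import subprocess
import sys

child_code = "import time; time.sleep(30)"

orig_sigint = signal.getsignal(signal.SIGINT)
signal.signal(signal.SIGINT, signal.SIG_IGN)  # parent ignores Ctrl+C
try:
    with subprocess.Popen([sys.executable, "-c", child_code]) as proc:
        proc.wait()  # Ctrl+C interrupts only the child; the parent keeps waiting
        print("child exited with", proc.returncode)
finally:
    signal.signal(signal.SIGINT, orig_sigint)  # always restore the original handler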
@@ -1626,17 +1610,14 @@ class Catalog:
         force: bool = False,
         update: bool = False,
         recursive: bool = False,
-
-        edatachain_only: bool = False,
-        no_edatachain_file: bool = False,
+        no_cp: bool = False,
         no_glob: bool = False,
         *,
-        client_config=None,
-    ) ->
+        client_config: Optional["dict"] = None,
+    ) -> None:
         """
         This function copies files from cloud sources to local destination directory
         If cloud source is not indexed, or has expired index, it runs indexing
-        It also creates .edatachain file by default, if not specified differently
         """
         client_config = client_config or self.client_config
         node_groups = self.enlist_sources_grouped(
@@ -1647,17 +1628,11 @@ class Catalog:
         )
 
         always_copy_dir_contents, copy_to_filename = prepare_output_for_cp(
-            node_groups, output, force,
+            node_groups, output, force, no_cp
         )
-        dataset_file = check_output_dataset_file(
-            output, force, edatachain_file, no_edatachain_file
-        )
-
         total_size, total_files = collect_nodes_for_cp(node_groups, recursive)
-
-
-            # Nothing selected to cp
-            return []
+        if not total_files:
+            return
 
         desc_max_len = max(len(output) + 16, 19)
         bar_format = (
@@ -1667,7 +1642,7 @@ class Catalog:
             "[{elapsed}<{remaining}, {rate_fmt:>8}]"
         )
 
-        if not
+        if not no_cp:
             with get_download_bar(bar_format, total_size) as pbar:
                 for node_group in node_groups:
                     node_group.download(recursive=recursive, pbar=pbar)
@@ -1679,21 +1654,10 @@ class Catalog:
             total_files,
             force,
             recursive,
-
+            no_cp,
             always_copy_dir_contents,
             copy_to_filename,
         )
-        if no_edatachain_file:
-            return []
-
-        metafile_data = compute_metafile_data(node_groups)
-        if metafile_data:
-            # Don't write the metafile if nothing was copied
-            print(f"Creating '{dataset_file}'")
-            with open(dataset_file, "w", encoding="utf-8") as fd:
-                yaml.dump(metafile_data, fd, sort_keys=False)
-
-        return metafile_data
 
     def du(
         self,
datachain/cli/__init__.py
CHANGED
@@ -47,10 +47,13 @@ def main(argv: Optional[list[str]] = None) -> int:
     logging_level = get_logging_level(args)
     logger.setLevel(logging_level)
 
-    client_config =
-
-
-
+    client_config = (
+        {
+            "anon": args.anon,
+        }
+        if getattr(args, "anon", False)
+        else {}
+    )
 
     if args.debug_sql:
         # This also sets this environment variable for any subprocesses
@@ -73,7 +76,7 @@ def main(argv: Optional[list[str]] = None) -> int:
 
 def handle_command(args, catalog, client_config) -> int:
     """Handle the different CLI commands."""
-    from datachain.studio import
+    from datachain.studio import process_auth_cli_args, process_jobs_args
 
     command_handlers = {
         "cp": lambda: handle_cp_command(args, catalog),
@@ -89,7 +92,7 @@ def handle_command(args, catalog, client_config) -> int:
         "query": lambda: handle_query_command(args, catalog),
         "clear-cache": lambda: clear_cache(catalog),
         "gc": lambda: garbage_collect(catalog),
-        "
+        "auth": lambda: process_auth_cli_args(args),
         "job": lambda: process_jobs_args(args),
     }
 
@@ -108,9 +111,6 @@ def handle_cp_command(args, catalog):
         force=bool(args.force),
         update=bool(args.update),
        recursive=bool(args.recursive),
-        edatachain_file=None,
-        edatachain_only=False,
-        no_edatachain_file=True,
         no_glob=args.no_glob,
     )
 
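Because --anon is now registered on individual subcommands instead of the shared parent parser, main guards the attribute lookup with getattr. A small illustration of the same guard; build_client_config is a hypothetical helper, not part of the datachain CLI:

from argparse import Namespace

def build_client_config(args: Namespace) -> dict:
    # Only populated when the parsed command actually defines --anon.
    return {"anon": args.anon} if getattr(args, "anon", False) else {}

print(build_client_config(Namespace(anon=True)))  # {'anon': True}
print(build_client_config(Namespace()))           # {} for commands without --anon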
datachain/cli/parser/__init__.py
CHANGED
@@ -7,8 +7,15 @@ import shtab
 from datachain.cli.utils import BooleanOptionalAction, KeyValueArgs
 
 from .job import add_jobs_parser
-from .studio import
-from .utils import
+from .studio import add_auth_parser
+from .utils import (
+    FIND_COLUMNS,
+    add_anon_arg,
+    add_show_args,
+    add_sources_arg,
+    add_update_arg,
+    find_columns_type,
+)
 
 
 def get_parser() -> ArgumentParser:  # noqa: PLR0915
@@ -25,25 +32,13 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     parser.add_argument("-V", "--version", action="version", version=__version__)
 
     parent_parser = ArgumentParser(add_help=False)
-    parent_parser.add_argument(
-        "--aws-endpoint-url",
-        type=str,
-        help="AWS endpoint URL",
-    )
-    parent_parser.add_argument(
-        "--anon",
-        action="store_true",
-        help="anon flag for remote storage (like awscli's --no-sign-request)",
-    )
-    parent_parser.add_argument(
-        "-u", "--update", action="count", default=0, help="Update cache"
-    )
     parent_parser.add_argument(
         "-v", "--verbose", action="count", default=0, help="Be verbose"
     )
     parent_parser.add_argument(
         "-q", "--quiet", action="count", default=0, help="Be quiet"
     )
+
     parent_parser.add_argument(
         "--debug-sql",
         action="store_true",
@@ -67,7 +62,9 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         "cp", parents=[parent_parser], description="Copy data files from the cloud."
     )
     add_sources_arg(parse_cp).complete = shtab.DIR  # type: ignore[attr-defined]
-    parse_cp.add_argument(
+    parse_cp.add_argument(
+        "output", type=str, help="Path to a directory or file to put data to"
+    )
     parse_cp.add_argument(
         "-f",
         "--force",
@@ -89,12 +86,16 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         action="store_true",
         help="Do not expand globs (such as * or ?)",
     )
+    add_anon_arg(parse_cp)
+    add_update_arg(parse_cp)
 
     parse_clone = subp.add_parser(
         "clone", parents=[parent_parser], description="Copy data files from the cloud."
     )
     add_sources_arg(parse_clone).complete = shtab.DIR  # type: ignore[attr-defined]
-    parse_clone.add_argument(
+    parse_clone.add_argument(
+        "output", type=str, help="Path to a directory or file to put data to"
+    )
     parse_clone.add_argument(
         "-f",
         "--force",
@@ -122,8 +123,10 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         action="store_true",
         help="Do not copy files, just create a dataset",
     )
+    add_anon_arg(parse_clone)
+    add_update_arg(parse_clone)
 
-
+    add_auth_parser(subp, parent_parser)
     add_jobs_parser(subp, parent_parser)
 
     datasets_parser = subp.add_parser(
@@ -132,6 +135,7 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         parents=[parent_parser],
         description="Commands for managing datasets.",
     )
+    add_anon_arg(datasets_parser)
     datasets_subparser = datasets_parser.add_subparsers(
         dest="datasets_cmd",
         help="Use `datachain dataset CMD --help` to display command-specific help",
@@ -331,6 +335,8 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     parse_ls = subp.add_parser(
         "ls", parents=[parent_parser], description="List storage contents."
     )
+    add_anon_arg(parse_ls)
+    add_update_arg(parse_ls)
     add_sources_arg(parse_ls, nargs="*")
     parse_ls.add_argument(
         "-l",
@@ -370,6 +376,8 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         "du", parents=[parent_parser], description="Display space usage."
     )
     add_sources_arg(parse_du)
+    add_anon_arg(parse_du)
+    add_update_arg(parse_du)
     parse_du.add_argument(
         "-b",
         "--bytes",
@@ -399,6 +407,8 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     parse_find = subp.add_parser(
         "find", parents=[parent_parser], description="Search in a directory hierarchy."
     )
+    add_anon_arg(parse_find)
+    add_update_arg(parse_find)
     add_sources_arg(parse_find)
     parse_find.add_argument(
         "--name",
@@ -452,6 +462,8 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     parse_index = subp.add_parser(
         "index", parents=[parent_parser], description="Index storage location."
     )
+    add_anon_arg(parse_index)
+    add_update_arg(parse_index)
     add_sources_arg(parse_index)
 
     show_parser = subp.add_parser(
@@ -475,6 +487,7 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         parents=[parent_parser],
         description="Create a new dataset with a query script.",
     )
+    add_anon_arg(query_parser)
     query_parser.add_argument(
         "script", metavar="<script.py>", type=str, help="Filepath for script"
     )
@@ -499,14 +512,17 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         help="Query parameters",
     )
 
-    subp.add_parser(
+    parse_clear_cache = subp.add_parser(
         "clear-cache",
         parents=[parent_parser],
         description="Clear the local file cache.",
     )
-
+    add_anon_arg(parse_clear_cache)
+
+    parse_gc = subp.add_parser(
         "gc", parents=[parent_parser], description="Garbage collect temporary tables."
     )
+    add_anon_arg(parse_gc)
 
     subp.add_parser("internal-run-udf", parents=[parent_parser])
     subp.add_parser("internal-run-udf-worker", parents=[parent_parser])
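The per-command --anon and --update flags are now added through add_anon_arg and add_update_arg from .utils, whose bodies are not part of this diff. A plausible sketch, assuming the helpers simply re-register the flags that were removed from parent_parser (flag names and help strings come from the removed lines above; everything else is an assumption):

from argparse import ArgumentParser

def add_anon_arg(parser: ArgumentParser) -> None:
    # Assumed body: same flag that previously lived on parent_parser.
    parser.add_argument(
        "--anon",
        action="store_true",
        help="anon flag for remote storage (like awscli's --no-sign-request)",
    )

def add_update_arg(parser: ArgumentParser) -> None:
    # Assumed body: same cache-refresh counter that previously lived on parent_parser.
    parser.add_argument(
        "-u", "--update", action="count", default=0, help="Update cache"
    )

# Usage mirrors the calls added throughout get_parser():
parse_ls = ArgumentParser(prog="datachain ls")
add_anon_arg(parse_ls)
add_update_arg(parse_ls)
print(parse_ls.parse_args(["--anon", "-u"]))  # Namespace(anon=True, update=1)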
datachain/cli/parser/job.py
CHANGED
@@ -6,7 +6,7 @@ def add_jobs_parser(subparsers, parent_parser) -> None:
     )
     jobs_subparser = jobs_parser.add_subparsers(
         dest="cmd",
-        help="Use `datachain
+        help="Use `datachain auth CMD --help` to display command-specific help",
     )
 
     studio_run_help = "Run a job in Studio"