datachain 0.8.2__py3-none-any.whl → 0.8.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datachain might be problematic.
- datachain/cache.py +4 -2
- datachain/catalog/catalog.py +100 -54
- datachain/catalog/datasource.py +4 -6
- datachain/cli/__init__.py +311 -0
- datachain/cli/commands/__init__.py +29 -0
- datachain/cli/commands/datasets.py +129 -0
- datachain/cli/commands/du.py +14 -0
- datachain/cli/commands/index.py +12 -0
- datachain/cli/commands/ls.py +169 -0
- datachain/cli/commands/misc.py +28 -0
- datachain/cli/commands/query.py +53 -0
- datachain/cli/commands/show.py +38 -0
- datachain/cli/parser/__init__.py +547 -0
- datachain/cli/parser/job.py +120 -0
- datachain/cli/parser/studio.py +126 -0
- datachain/cli/parser/utils.py +63 -0
- datachain/{cli_utils.py → cli/utils.py} +27 -1
- datachain/client/azure.py +21 -1
- datachain/client/fsspec.py +45 -13
- datachain/client/gcs.py +10 -2
- datachain/client/local.py +4 -4
- datachain/client/s3.py +10 -0
- datachain/dataset.py +1 -0
- datachain/func/__init__.py +2 -2
- datachain/func/conditional.py +52 -0
- datachain/func/func.py +5 -1
- datachain/lib/arrow.py +4 -0
- datachain/lib/dc.py +18 -3
- datachain/lib/file.py +1 -1
- datachain/lib/listing.py +36 -3
- datachain/lib/signal_schema.py +89 -27
- datachain/listing.py +1 -5
- datachain/node.py +27 -1
- datachain/progress.py +2 -2
- datachain/query/session.py +1 -1
- datachain/studio.py +58 -38
- datachain/utils.py +1 -1
- {datachain-0.8.2.dist-info → datachain-0.8.4.dist-info}/METADATA +6 -6
- {datachain-0.8.2.dist-info → datachain-0.8.4.dist-info}/RECORD +43 -31
- {datachain-0.8.2.dist-info → datachain-0.8.4.dist-info}/WHEEL +1 -1
- datachain/cli.py +0 -1475
- {datachain-0.8.2.dist-info → datachain-0.8.4.dist-info}/LICENSE +0 -0
- {datachain-0.8.2.dist-info → datachain-0.8.4.dist-info}/entry_points.txt +0 -0
- {datachain-0.8.2.dist-info → datachain-0.8.4.dist-info}/top_level.txt +0 -0
datachain/cache.py
CHANGED
@@ -61,14 +61,16 @@ class DataChainCache:
         tmp_info = odb_fs.join(self.odb.tmp_dir, tmp_fname())  # type: ignore[arg-type]
         size = file.size
         if size < 0:
-            size = await client.get_size(from_path)
+            size = await client.get_size(from_path, version_id=file.version)
         cb = callback or TqdmCallback(
             tqdm_kwargs={"desc": odb_fs.name(from_path), "bytes": True},
             tqdm_cls=Tqdm,
             size=size,
         )
         try:
-            await client.get_file(
+            await client.get_file(
+                from_path, tmp_info, callback=cb, version_id=file.version
+            )
         finally:
             if not callback:
                 cb.close()
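The change above threads the file's object version through both the size lookup and the download, so versioned buckets fetch exactly the object that was indexed. A minimal caller-side sketch of the same pattern, reusing the `get_size`/`get_file` signatures visible in the diff; the helper name and the `file` attributes used here are illustrative, not library API:

```python
async def download_exact_version(client, file, tmp_path):
    # Size the specific version (a versioned bucket may hold newer data
    # under the same key), then download that same version.
    size = file.size
    if size < 0:
        size = await client.get_size(file.path, version_id=file.version)
    await client.get_file(file.path, tmp_path, version_id=file.version)
    return size
```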
datachain/catalog/catalog.py
CHANGED
@@ -240,7 +240,8 @@ class DatasetRowsFetcher(NodesThreadPool):
 class NodeGroup:
     """Class for a group of nodes from the same source"""

-    listing: "Listing"
+    listing: Optional["Listing"]
+    client: "Client"
     sources: list[DataSource]

     # The source path within the bucket
@@ -268,9 +269,7 @@ class NodeGroup:
         Download this node group to cache.
         """
         if self.sources:
-            self.
-                self.iternodes(recursive), shared_progress_bar=pbar
-            )
+            self.client.fetch_nodes(self.iternodes(recursive), shared_progress_bar=pbar)


 def check_output_dataset_file(
@@ -375,7 +374,7 @@ def collect_nodes_for_cp(

     # Collect all sources to process
     for node_group in node_groups:
-        listing: Listing = node_group.listing
+        listing: Optional[Listing] = node_group.listing
         valid_sources: list[DataSource] = []
         for dsrc in node_group.sources:
             if dsrc.is_single_object():
@@ -383,6 +382,7 @@ def collect_nodes_for_cp(
                 total_files += 1
                 valid_sources.append(dsrc)
             else:
+                assert listing
                 node = dsrc.node
                 if not recursive:
                     print(f"{node.full_path} is a directory (not copied).")
@@ -433,37 +433,51 @@ def instantiate_node_groups(
     )

     output_dir = output
+    output_file = None
     if copy_to_filename:
         output_dir = os.path.dirname(output)
         if not output_dir:
             output_dir = "."
+        output_file = os.path.basename(output)

     # Instantiate these nodes
     for node_group in node_groups:
         if not node_group.sources:
             continue
-        listing: Listing = node_group.listing
+        listing: Optional[Listing] = node_group.listing
         source_path: str = node_group.source_path

         copy_dir_contents = always_copy_dir_contents or source_path.endswith("/")
-            node_group.sources
-            listing.
+        if not listing:
+            source = node_group.sources[0]
+            client = source.client
+            node = NodeWithPath(source.node, [output_file or source.node.path])
+            instantiated_nodes = [node]
+            if not virtual_only:
+                node.instantiate(
+                    client, output_dir, instantiate_progress_bar, force=force
+                )
+        else:
+            instantiated_nodes = listing.collect_nodes_to_instantiate(
+                node_group.sources,
+                copy_to_filename,
+                recursive,
+                copy_dir_contents,
+                source_path,
+                node_group.is_edatachain,
+                node_group.is_dataset,
             )
+            if not virtual_only:
+                listing.instantiate_nodes(
+                    instantiated_nodes,
+                    output_dir,
+                    total_files,
+                    force=force,
+                    shared_progress_bar=instantiate_progress_bar,
+                )
+
         node_group.instantiated_nodes = instantiated_nodes
+
     if instantiate_progress_bar:
         instantiate_progress_bar.close()
@@ -592,7 +606,7 @@ class Catalog:
         client_config=None,
         object_name="file",
         skip_indexing=False,
-    ) -> tuple["Listing", str]:
+    ) -> tuple[Optional["Listing"], "Client", str]:
         from datachain.lib.dc import DataChain
         from datachain.listing import Listing
@@ -603,16 +617,19 @@ class Catalog:
         list_ds_name, list_uri, list_path, _ = get_listing(
             source, self.session, update=update
         )
+        lst = None
+        client = Client.get_client(list_uri, self.cache, **self.client_config)
+
+        if list_ds_name:
+            lst = Listing(
+                self.metastore.clone(),
+                self.warehouse.clone(),
+                client,
+                dataset_name=list_ds_name,
+                object_name=object_name,
+            )

-        lst
-            self.metastore.clone(),
-            self.warehouse.clone(),
-            Client.get_client(list_uri, self.cache, **self.client_config),
-            dataset_name=list_ds_name,
-            object_name=object_name,
-        )
-
-        return lst, list_path
+        return lst, client, list_path

     def _remove_dataset_rows_and_warehouse_info(
         self, dataset: DatasetRecord, version: int, **kwargs
@@ -635,13 +652,13 @@ class Catalog:
     ) -> Optional[list["DataSource"]]:
         enlisted_sources = []
         for src in sources:  # Opt: parallel
-            listing, file_path = self.enlist_source(
+            listing, client, file_path = self.enlist_source(
                 src,
                 update,
                 client_config=client_config or self.client_config,
                 skip_indexing=skip_indexing,
             )
-            enlisted_sources.append((listing, file_path))
+            enlisted_sources.append((listing, client, file_path))

         if only_index:
             # sometimes we don't really need listing result (e.g on indexing process)
@@ -649,10 +666,16 @@ class Catalog:
             return None

         dsrc_all: list[DataSource] = []
-        for listing, file_path in enlisted_sources:
+        for listing, client, file_path in enlisted_sources:
+            if not listing:
+                nodes = [Node.from_file(client.get_file_info(file_path))]
+                dir_only = False
+            else:
+                nodes = listing.expand_path(file_path)
+                dir_only = file_path.endswith("/")
+            dsrc_all.extend(
+                DataSource(listing, client, node, dir_only) for node in nodes
+            )
         return dsrc_all

     def enlist_sources_grouped(
@@ -667,7 +690,7 @@ class Catalog:

         def _row_to_node(d: dict[str, Any]) -> Node:
             del d["file__source"]
-            return Node.
+            return Node.from_row(d)

         enlisted_sources: list[tuple[bool, bool, Any]] = []
         client_config = client_config or self.client_config
@@ -677,7 +700,7 @@ class Catalog:
                 edatachain_data = parse_edatachain_file(src)
                 indexed_sources = []
                 for ds in edatachain_data:
-                    listing, source_path = self.enlist_source(
+                    listing, _, source_path = self.enlist_source(
                         ds["data-source"]["uri"],
                         update,
                         client_config=client_config,
@@ -701,6 +724,7 @@ class Catalog:
                 client = self.get_client(source, **client_config)
                 uri = client.uri
                 dataset_name, _, _, _ = get_listing(uri, self.session)
+                assert dataset_name
                 listing = Listing(
                     self.metastore.clone(),
                     self.warehouse.clone(),
@@ -713,6 +737,7 @@ class Catalog:
                 indexed_sources.append(
                     (
                         listing,
+                        client,
                         source,
                         [_row_to_node(r) for r in rows],
                         ds_name,
@@ -722,25 +747,28 @@ class Catalog:

                 enlisted_sources.append((False, True, indexed_sources))
             else:
-                listing, source_path = self.enlist_source(
+                listing, client, source_path = self.enlist_source(
                     src, update, client_config=client_config
                 )
-                enlisted_sources.append((False, False, (listing, source_path)))
+                enlisted_sources.append((False, False, (listing, client, source_path)))

         node_groups = []
         for is_datachain, is_dataset, payload in enlisted_sources:  # Opt: parallel
             if is_dataset:
                 for (
                     listing,
+                    client,
                     source_path,
                     nodes,
                     dataset_name,
                     dataset_version,
                 ) in payload:
+                    assert listing
+                    dsrc = [DataSource(listing, client, node) for node in nodes]
                     node_groups.append(
                         NodeGroup(
                             listing,
+                            client,
                             dsrc,
                             source_path,
                             dataset_name=dataset_name,
@@ -749,18 +777,30 @@ class Catalog:
                     )
             elif is_datachain:
                 for listing, source_path, paths in payload:
+                    assert listing
+                    dsrc = [
+                        DataSource(listing, listing.client, listing.resolve_path(p))
+                        for p in paths
+                    ]
                     node_groups.append(
-                        NodeGroup(
+                        NodeGroup(
+                            listing,
+                            listing.client,
+                            dsrc,
+                            source_path,
+                            is_edatachain=True,
+                        )
                     )
             else:
-                listing, source_path = payload
+                listing, client, source_path = payload
+                if not listing:
+                    nodes = [Node.from_file(client.get_file_info(source_path))]
+                    as_container = False
+                else:
+                    as_container = source_path.endswith("/")
+                    nodes = listing.expand_path(source_path, use_glob=not no_glob)
+                dsrc = [DataSource(listing, client, n, as_container) for n in nodes]
+                node_groups.append(NodeGroup(listing, client, dsrc, source_path))

         return node_groups

@@ -1196,10 +1236,16 @@ class Catalog:

         return q.to_db_records()

-    def signed_url(
+    def signed_url(
+        self,
+        source: str,
+        path: str,
+        version_id: Optional[str] = None,
+        client_config=None,
+    ) -> str:
         client_config = client_config or self.client_config
         client = Client.get_client(source, self.cache, **client_config)
-        return client.url(path)
+        return client.url(path, version_id=version_id)

     def export_dataset_table(
         self,
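The thread running through these hunks is that `enlist_source` now returns a `(listing, client, path)` triple, and `listing` may be `None` when a source has no listing dataset, so callers branch on it and carry the `Client` explicitly. A minimal sketch of that caller-side pattern, built only from calls visible in the diff (`get_file_info`, `expand_path`, `Node.from_file`, `DataSource`); the helper name is illustrative:

```python
from datachain.catalog.datasource import DataSource
from datachain.node import Node


def collect_sources(catalog, uri, update=False):
    # enlist_source() now also hands back the Client, even when no
    # listing dataset exists for this source.
    listing, client, path = catalog.enlist_source(uri, update)

    if listing is None:
        # Single object with no listing: build the node straight from the client.
        nodes = [Node.from_file(client.get_file_info(path))]
        dir_only = False
    else:
        # Listing available: expand globs/directories through the warehouse.
        nodes = listing.expand_path(path)
        dir_only = path.endswith("/")

    # DataSource now carries the client alongside the (possibly None) listing.
    return [DataSource(listing, client, node, dir_only) for node in nodes]
```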
datachain/catalog/datasource.py
CHANGED
@@ -4,21 +4,19 @@ from datachain.node import DirType, NodeWithPath


 class DataSource:
-    def __init__(self, listing, node, as_container=False):
+    def __init__(self, listing, client, node, as_container=False):
         self.listing = listing
+        self.client = client
         self.node = node
         self.as_container = (
             as_container  # Indicates whether a .tar file is handled as a container
         )

-    def get_full_path(self):
-        return self.get_node_full_path(self.node)
-
     def get_node_full_path(self, node):
-        return self.
+        return self.client.get_full_path(node.full_path)

     def get_node_full_path_from_path(self, full_path):
-        return self.
+        return self.client.get_full_path(full_path)

     def is_single_object(self):
         return self.node.dir_type == DirType.FILE or (
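A brief usage sketch of the reworked class: the `Client` is now passed in explicitly and full paths are resolved through it rather than through the listing. The `listing=None` case mirrors the single-object handling in the catalog changes above; the function and variable names here are illustrative:

```python
from datachain.catalog.datasource import DataSource


def full_paths(client, listing, nodes):
    # DataSource now takes the client explicitly; listing may be None for
    # single objects, as in the catalog changes above.
    sources = [DataSource(listing, client, node) for node in nodes]
    # Full paths are resolved through the client, not the listing.
    return [dsrc.get_node_full_path(dsrc.node) for dsrc in sources]
```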
datachain/cli/__init__.py
ADDED
@@ -0,0 +1,311 @@
+import logging
+import os
+import sys
+import traceback
+from multiprocessing import freeze_support
+from typing import Optional
+
+from datachain.cli.utils import get_logging_level
+from datachain.telemetry import telemetry
+
+from .commands import (
+    clear_cache,
+    completion,
+    dataset_stats,
+    du,
+    edit_dataset,
+    garbage_collect,
+    index,
+    list_datasets,
+    ls,
+    query,
+    rm_dataset,
+    show,
+)
+from .parser import get_parser
+
+logger = logging.getLogger("datachain")
+
+
+def main(argv: Optional[list[str]] = None) -> int:
+    from datachain.catalog import get_catalog
+
+    # Required for Windows multiprocessing support
+    freeze_support()
+
+    datachain_parser = get_parser()
+    args = datachain_parser.parse_args(argv)
+
+    if args.command in ("internal-run-udf", "internal-run-udf-worker"):
+        return handle_udf(args.command)
+
+    logger.addHandler(logging.StreamHandler())
+    logging_level = get_logging_level(args)
+    logger.setLevel(logging_level)
+
+    client_config = {
+        "aws_endpoint_url": args.aws_endpoint_url,
+        "anon": args.anon,
+    }
+
+    if args.debug_sql:
+        # This also sets this environment variable for any subprocesses
+        os.environ["DEBUG_SHOW_SQL_QUERIES"] = "True"
+
+    error = None
+
+    try:
+        catalog = get_catalog(client_config=client_config)
+        return handle_command(args, catalog, client_config)
+    except BrokenPipeError as exc:
+        error, return_code = handle_broken_pipe_error(exc)
+        return return_code
+    except (KeyboardInterrupt, Exception) as exc:
+        error, return_code = handle_general_exception(exc, args, logging_level)
+        return return_code
+    finally:
+        telemetry.send_cli_call(args.command, error=error)
+
+
+def handle_command(args, catalog, client_config) -> int:
+    """Handle the different CLI commands."""
+    from datachain.studio import process_jobs_args, process_studio_cli_args
+
+    command_handlers = {
+        "cp": lambda: handle_cp_command(args, catalog),
+        "clone": lambda: handle_clone_command(args, catalog),
+        "dataset": lambda: handle_dataset_command(args, catalog),
+        "ds": lambda: handle_dataset_command(args, catalog),
+        "ls": lambda: handle_ls_command(args, client_config),
+        "show": lambda: handle_show_command(args, catalog),
+        "du": lambda: handle_du_command(args, catalog, client_config),
+        "find": lambda: handle_find_command(args, catalog),
+        "index": lambda: handle_index_command(args, catalog),
+        "completion": lambda: handle_completion_command(args),
+        "query": lambda: handle_query_command(args, catalog),
+        "clear-cache": lambda: clear_cache(catalog),
+        "gc": lambda: garbage_collect(catalog),
+        "studio": lambda: process_studio_cli_args(args),
+        "job": lambda: process_jobs_args(args),
+    }
+
+    handler = command_handlers.get(args.command)
+    if handler:
+        handler()
+        return 0
+    print(f"invalid command: {args.command}", file=sys.stderr)
+    return 1
+
+
+def handle_cp_command(args, catalog):
+    catalog.cp(
+        args.sources,
+        args.output,
+        force=bool(args.force),
+        update=bool(args.update),
+        recursive=bool(args.recursive),
+        edatachain_file=None,
+        edatachain_only=False,
+        no_edatachain_file=True,
+        no_glob=args.no_glob,
+    )
+
+
+def handle_clone_command(args, catalog):
+    catalog.clone(
+        args.sources,
+        args.output,
+        force=bool(args.force),
+        update=bool(args.update),
+        recursive=bool(args.recursive),
+        no_glob=args.no_glob,
+        no_cp=args.no_cp,
+        edatachain=args.edatachain,
+        edatachain_file=args.edatachain_file,
+    )
+
+
+def handle_dataset_command(args, catalog):
+    dataset_commands = {
+        "pull": lambda: catalog.pull_dataset(
+            args.dataset,
+            args.output,
+            local_ds_name=args.local_name,
+            local_ds_version=args.local_version,
+            cp=args.cp,
+            force=bool(args.force),
+            edatachain=args.edatachain,
+            edatachain_file=args.edatachain_file,
+        ),
+        "edit": lambda: edit_dataset(
+            catalog,
+            args.name,
+            new_name=args.new_name,
+            description=args.description,
+            labels=args.labels,
+            studio=args.studio,
+            local=args.local,
+            all=args.all,
+            team=args.team,
+        ),
+        "ls": lambda: list_datasets(
+            catalog=catalog,
+            studio=args.studio,
+            local=args.local,
+            all=args.all,
+            team=args.team,
+        ),
+        "rm": lambda: rm_dataset(
+            catalog,
+            args.name,
+            version=args.version,
+            force=args.force,
+            studio=args.studio,
+            local=args.local,
+            all=args.all,
+            team=args.team,
+        ),
+        "remove": lambda: rm_dataset(
+            catalog,
+            args.name,
+            version=args.version,
+            force=args.force,
+            studio=args.studio,
+            local=args.local,
+            all=args.all,
+            team=args.team,
+        ),
+        "stats": lambda: dataset_stats(
+            catalog,
+            args.name,
+            args.version,
+            show_bytes=args.bytes,
+            si=args.si,
+        ),
+    }
+
+    handler = dataset_commands.get(args.datasets_cmd)
+    if handler:
+        return handler()
+    raise Exception(f"Unexpected command {args.datasets_cmd}")
+
+
+def handle_ls_command(args, client_config):
+    ls(
+        args.sources,
+        long=bool(args.long),
+        studio=args.studio,
+        local=args.local,
+        all=args.all,
+        team=args.team,
+        update=bool(args.update),
+        client_config=client_config,
+    )
+
+
+def handle_show_command(args, catalog):
+    show(
+        catalog,
+        args.name,
+        args.version,
+        limit=args.limit,
+        offset=args.offset,
+        columns=args.columns,
+        no_collapse=args.no_collapse,
+        schema=args.schema,
+    )
+
+
+def handle_du_command(args, catalog, client_config):
+    du(
+        catalog,
+        args.sources,
+        show_bytes=args.bytes,
+        depth=args.depth,
+        si=args.si,
+        update=bool(args.update),
+        client_config=client_config,
+    )
+
+
+def handle_find_command(args, catalog):
+    results_found = False
+    for result in catalog.find(
+        args.sources,
+        update=bool(args.update),
+        names=args.name,
+        inames=args.iname,
+        paths=args.path,
+        ipaths=args.ipath,
+        size=args.size,
+        typ=args.type,
+        columns=args.columns,
+    ):
+        print(result)
+        results_found = True
+    if not results_found:
+        print("No results")
+
+
+def handle_index_command(args, catalog):
+    index(
+        catalog,
+        args.sources,
+        update=bool(args.update),
+    )
+
+
+def handle_completion_command(args):
+    print(completion(args.shell))
+
+
+def handle_query_command(args, catalog):
+    query(
+        catalog,
+        args.script,
+        parallel=args.parallel,
+        params=args.param,
+    )
+
+
+def handle_broken_pipe_error(exc):
+    # Python flushes standard streams on exit; redirect remaining output
+    # to devnull to avoid another BrokenPipeError at shutdown
+    # See: https://docs.python.org/3/library/signal.html#note-on-sigpipe
+    error = str(exc)
+    devnull = os.open(os.devnull, os.O_WRONLY)
+    os.dup2(devnull, sys.stdout.fileno())
+    return error, 141  # 128 + 13 (SIGPIPE)
+
+
+def handle_general_exception(exc, args, logging_level):
+    error = str(exc)
+    if isinstance(exc, KeyboardInterrupt):
+        msg = "Operation cancelled by the user"
+    else:
+        msg = str(exc)
+    print("Error:", msg, file=sys.stderr)
+    if logging_level <= logging.DEBUG:
+        traceback.print_exception(
+            type(exc),
+            exc,
+            exc.__traceback__,
+            file=sys.stderr,
+        )
+    if args.pdb:
+        import pdb  # noqa: T100
+
+        pdb.post_mortem()
+    return error, 1
+
+
+def handle_udf(command):
+    if command == "internal-run-udf":
+        from datachain.query.dispatch import udf_entrypoint
+
+        return udf_entrypoint()
+
+    if command == "internal-run-udf-worker":
+        from datachain.query.dispatch import udf_worker_entrypoint
+
+        return udf_worker_entrypoint()
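The new package replaces the monolithic `datachain/cli.py` with a `main()` that parses arguments once and dispatches through a dict of handlers. A sketch of calling it programmatically; the exact flags accepted depend on `datachain/cli/parser`, which is not shown in this diff, so the argument list here is an assumption:

```python
import sys

from datachain.cli import main

if __name__ == "__main__":
    # Equivalent to running the `datachain` console script; returns an
    # exit code (0 on success, 1 on error, 141 on a broken pipe).
    sys.exit(main(["ls", "s3://my-bucket/"]))  # assumed argv; see parser/
```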
datachain/cli/commands/__init__.py
ADDED
@@ -0,0 +1,29 @@
+from .datasets import (
+    dataset_stats,
+    edit_dataset,
+    list_datasets,
+    list_datasets_local,
+    rm_dataset,
+)
+from .du import du
+from .index import index
+from .ls import ls
+from .misc import clear_cache, completion, garbage_collect
+from .query import query
+from .show import show
+
+__all__ = [
+    "clear_cache",
+    "completion",
+    "dataset_stats",
+    "du",
+    "edit_dataset",
+    "garbage_collect",
+    "index",
+    "list_datasets",
+    "list_datasets_local",
+    "ls",
+    "query",
+    "rm_dataset",
+    "show",
+]
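With the re-exports above, command implementations can be imported from the package root rather than from their individual modules. A brief sketch; the keyword arguments mirror the ones wired up in `handle_dataset_command()` earlier in this diff, while the argument values and the no-argument `get_catalog()` call are illustrative assumptions:

```python
from datachain.catalog import get_catalog
from datachain.cli.commands import list_datasets

# Roughly what `datachain dataset ls --local` drives under the hood.
list_datasets(catalog=get_catalog(), studio=False, local=True, all=False, team=None)
```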