datachain 0.8.10__py3-none-any.whl → 0.8.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic.
- datachain/cache.py +4 -4
- datachain/catalog/__init__.py +0 -2
- datachain/catalog/catalog.py +103 -158
- datachain/cli/__init__.py +7 -14
- datachain/cli/commands/__init__.py +0 -2
- datachain/cli/commands/datasets.py +0 -19
- datachain/cli/parser/__init__.py +27 -41
- datachain/cli/parser/studio.py +7 -6
- datachain/cli/parser/utils.py +18 -0
- datachain/client/fsspec.py +11 -8
- datachain/client/local.py +4 -4
- datachain/data_storage/schema.py +1 -1
- datachain/dataset.py +1 -7
- datachain/error.py +12 -0
- datachain/func/__init__.py +2 -1
- datachain/func/conditional.py +77 -26
- datachain/func/func.py +17 -6
- datachain/lib/dc.py +24 -4
- datachain/lib/file.py +16 -0
- datachain/lib/listing.py +30 -12
- datachain/lib/pytorch.py +1 -1
- datachain/lib/udf.py +1 -1
- datachain/listing.py +1 -13
- datachain/node.py +0 -15
- datachain/nodes_fetcher.py +2 -2
- datachain/remote/studio.py +2 -14
- datachain/studio.py +1 -1
- {datachain-0.8.10.dist-info → datachain-0.8.12.dist-info}/METADATA +3 -7
- {datachain-0.8.10.dist-info → datachain-0.8.12.dist-info}/RECORD +33 -33
- {datachain-0.8.10.dist-info → datachain-0.8.12.dist-info}/LICENSE +0 -0
- {datachain-0.8.10.dist-info → datachain-0.8.12.dist-info}/WHEEL +0 -0
- {datachain-0.8.10.dist-info → datachain-0.8.12.dist-info}/entry_points.txt +0 -0
- {datachain-0.8.10.dist-info → datachain-0.8.12.dist-info}/top_level.txt +0 -0
datachain/cache.py
CHANGED
@@ -22,15 +22,15 @@ def try_scandir(path):
         pass


-def get_temp_cache(tmp_dir: str, prefix: Optional[str] = None) -> "
+def get_temp_cache(tmp_dir: str, prefix: Optional[str] = None) -> "Cache":
     cache_dir = mkdtemp(prefix=prefix, dir=tmp_dir)
-    return
+    return Cache(cache_dir, tmp_dir=tmp_dir)


 @contextmanager
 def temporary_cache(
     tmp_dir: str, prefix: Optional[str] = None, delete: bool = True
-) -> Iterator["
+) -> Iterator["Cache"]:
     cache = get_temp_cache(tmp_dir, prefix=prefix)
     try:
         yield cache
@@ -39,7 +39,7 @@ def temporary_cache(
         cache.destroy()


-class
+class Cache:
     def __init__(self, cache_dir: str, tmp_dir: str):
         self.odb = LocalHashFileDB(
             LocalFileSystem(),
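These hunks rename the cache class returned by get_temp_cache and yielded by temporary_cache; the context-manager shape is unchanged. A minimal usage sketch based only on the signatures shown above (the temp directory below is illustrative):

    import tempfile

    tmp_dir = tempfile.mkdtemp()  # illustrative scratch location
    with temporary_cache(tmp_dir, prefix="session-") as cache:
        ...  # work with the Cache instance; it is destroyed on exit per the delete flag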
datachain/catalog/__init__.py
CHANGED
@@ -3,7 +3,6 @@ from .catalog import (
     QUERY_SCRIPT_CANCELED_EXIT_CODE,
     QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE,
     Catalog,
-    parse_edatachain_file,
 )
 from .loader import get_catalog

@@ -13,5 +12,4 @@ __all__ = [
     "QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE",
     "Catalog",
     "get_catalog",
-    "parse_edatachain_file",
 ]
datachain/catalog/catalog.py
CHANGED
@@ -4,6 +4,7 @@ import logging
 import os
 import os.path
 import posixpath
+import signal
 import subprocess
 import sys
 import time
@@ -26,11 +27,10 @@ from uuid import uuid4

 import requests
 import sqlalchemy as sa
-import yaml
 from sqlalchemy import Column
 from tqdm.auto import tqdm

-from datachain.cache import
+from datachain.cache import Cache
 from datachain.client import Client
 from datachain.dataset import (
     DATASET_PREFIX,
@@ -38,7 +38,6 @@ from datachain.dataset import (
     DatasetDependency,
     DatasetListRecord,
     DatasetRecord,
-    DatasetStats,
     DatasetStatus,
     StorageURI,
     create_dataset_uri,
@@ -57,7 +56,7 @@ from datachain.node import DirType, Node, NodeWithPath
 from datachain.nodes_thread_pool import NodesThreadPool
 from datachain.remote.studio import StudioClient
 from datachain.sql.types import DateTime, SQLType
-from datachain.utils import DataChainDir
+from datachain.utils import DataChainDir

 from .datasource import DataSource

@@ -73,7 +72,6 @@ if TYPE_CHECKING:
 logger = logging.getLogger("datachain")

 DEFAULT_DATASET_DIR = "dataset"
-DATASET_FILE_SUFFIX = ".edatachain"

 TTL_INT = 4 * 60 * 60

@@ -99,6 +97,47 @@ def noop(_: str):
     pass


+class TerminationSignal(RuntimeError):  # noqa: N818
+    def __init__(self, signal):
+        self.signal = signal
+        super().__init__("Received termination signal", signal)
+
+    def __repr__(self):
+        return f"{self.__class__.__name__}({self.signal})"
+
+
+if sys.platform == "win32":
+    SIGINT = signal.CTRL_C_EVENT
+else:
+    SIGINT = signal.SIGINT
+
+
+def shutdown_process(
+    proc: subprocess.Popen,
+    interrupt_timeout: Optional[int] = None,
+    terminate_timeout: Optional[int] = None,
+) -> int:
+    """Shut down the process gracefully with SIGINT -> SIGTERM -> SIGKILL."""
+
+    logger.info("sending interrupt signal to the process %s", proc.pid)
+    proc.send_signal(SIGINT)
+
+    logger.info("waiting for the process %s to finish", proc.pid)
+    try:
+        return proc.wait(interrupt_timeout)
+    except subprocess.TimeoutExpired:
+        logger.info(
+            "timed out waiting, sending terminate signal to the process %s", proc.pid
+        )
+        proc.terminate()
+        try:
+            return proc.wait(terminate_timeout)
+        except subprocess.TimeoutExpired:
+            logger.info("timed out waiting, killing the process %s", proc.pid)
+            proc.kill()
+            return proc.wait()
+
+
 def _process_stream(stream: "IO[bytes]", callback: Callable[[str], None]) -> None:
     buffer = b""
     while byt := stream.read(1):  # Read one byte at a time
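The new shutdown_process helper escalates SIGINT -> terminate() -> kill(), waiting between steps. A minimal calling sketch, assuming a long-running child process; the timeout values are illustrative, not datachain defaults:

    import subprocess

    proc = subprocess.Popen(["python", "-c", "import time; time.sleep(600)"])
    # SIGINT first, wait up to 10s; then SIGTERM, wait up to 5s; finally SIGKILL.
    returncode = shutdown_process(proc, interrupt_timeout=10, terminate_timeout=5)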
@@ -247,7 +286,6 @@ class NodeGroup:
     # The source path within the bucket
     # (not including the bucket name or s3:// prefix)
     source_path: str = ""
-    is_edatachain: bool = False
     dataset_name: Optional[str] = None
     dataset_version: Optional[int] = None
     instantiated_nodes: Optional[list[NodeWithPath]] = None
@@ -272,55 +310,11 @@ class NodeGroup:
         self.client.fetch_nodes(self.iternodes(recursive), shared_progress_bar=pbar)


-def check_output_dataset_file(
-    output: str,
-    force: bool = False,
-    dataset_filename: Optional[str] = None,
-    skip_check_edatachain: bool = False,
-) -> str:
-    """
-    Checks the dataset filename for existence or if it should be force-overwritten.
-    """
-    dataset_file = (
-        dataset_filename if dataset_filename else output + DATASET_FILE_SUFFIX
-    )
-    if not skip_check_edatachain and os.path.exists(dataset_file):
-        if force:
-            os.remove(dataset_file)
-        else:
-            raise RuntimeError(f"Output dataset file already exists: {dataset_file}")
-    return dataset_file
-
-
-def parse_edatachain_file(filename: str) -> list[dict[str, Any]]:
-    with open(filename, encoding="utf-8") as f:
-        contents = yaml.safe_load(f)
-
-    if not isinstance(contents, list):
-        contents = [contents]
-
-    for entry in contents:
-        if not isinstance(entry, dict):
-            raise TypeError(
-                "Failed parsing EDataChain file, "
-                "each data source entry must be a dictionary"
-            )
-        if "data-source" not in entry or "files" not in entry:
-            raise ValueError(
-                "Failed parsing EDataChain file, "
-                "each data source entry must contain the "
-                '"data-source" and "files" keys'
-            )
-
-    return contents
-
-
 def prepare_output_for_cp(
     node_groups: list[NodeGroup],
     output: str,
     force: bool = False,
-
-    no_edatachain_file: bool = False,
+    no_cp: bool = False,
 ) -> tuple[bool, Optional[str]]:
     total_node_count = 0
     for node_group in node_groups:
@@ -333,7 +327,7 @@ def prepare_output_for_cp(
     always_copy_dir_contents = False
     copy_to_filename = None

-    if
+    if no_cp:
         return always_copy_dir_contents, copy_to_filename

     if not os.path.isdir(output):
@@ -358,10 +352,6 @@ def prepare_output_for_cp(
             copy_to_filename = output
         else:
             raise FileNotFoundError(f"Is not a directory: {output}")
-
-    if copy_to_filename and not no_edatachain_file:
-        raise RuntimeError("File to file cp not supported with .edatachain files!")
-
     return always_copy_dir_contents, copy_to_filename


@@ -465,8 +455,6 @@ def instantiate_node_groups(
         copy_to_filename,
         recursive,
         copy_dir_contents,
-        source_path,
-        node_group.is_edatachain,
         node_group.is_dataset,
     )
     if not virtual_only:
@@ -484,24 +472,6 @@ def instantiate_node_groups(
     instantiate_progress_bar.close()


-def compute_metafile_data(node_groups) -> list[dict[str, Any]]:
-    metafile_data = []
-    for node_group in node_groups:
-        if not node_group.sources:
-            continue
-        listing: Listing = node_group.listing
-        metafile_group = {"data-source": {"uri": listing.uri}, "files": []}
-        for node in node_group.instantiated_nodes:
-            if not node.n.is_dir:
-                metafile_group["files"].append(  # type: ignore [attr-defined]
-                    node.get_metafile_data()
-                )
-        if metafile_group["files"]:
-            metafile_data.append(metafile_group)
-
-    return metafile_data
-
-
 def find_column_to_str(  # noqa: PLR0911
     row: tuple[Any, ...], field_lookup: dict[str, int], src: DataSource, column: str
 ) -> str:
@@ -536,7 +506,7 @@ def find_column_to_str(  # noqa: PLR0911
     return ""


-def clone_catalog_with_cache(catalog: "Catalog", cache: "
+def clone_catalog_with_cache(catalog: "Catalog", cache: "Cache") -> "Catalog":
     clone = catalog.copy()
     clone.cache = cache
     return clone
@@ -559,7 +529,7 @@ class Catalog:
         datachain_dir.init()
         self.metastore = metastore
         self._warehouse = warehouse
-        self.cache =
+        self.cache = Cache(datachain_dir.cache, datachain_dir.tmp)
         self.client_config = client_config if client_config is not None else {}
         self._init_params = {
             "cache_dir": cache_dir,
@@ -703,22 +673,8 @@ class Catalog:
        enlisted_sources: list[tuple[bool, bool, Any]] = []
        client_config = client_config or self.client_config
        for src in sources:  # Opt: parallel
-
-
-                edatachain_data = parse_edatachain_file(src)
-                indexed_sources = []
-                for ds in edatachain_data:
-                    listing, _, source_path = self.enlist_source(
-                        ds["data-source"]["uri"],
-                        update,
-                        client_config=client_config,
-                    )
-                    paths = datachain_paths_join(
-                        source_path, (f["name"] for f in ds["files"])
-                    )
-                    indexed_sources.append((listing, source_path, paths))
-                enlisted_sources.append((True, False, indexed_sources))
-            elif src.startswith("ds://"):
+            listing: Optional[Listing]
+            if src.startswith("ds://"):
                 ds_name, ds_version = parse_dataset_uri(src)
                 dataset = self.get_dataset(ds_name)
                 if not ds_version:
@@ -796,7 +752,6 @@ class Catalog:
                         listing.client,
                         dsrc,
                         source_path,
-                        is_edatachain=True,
                     )
                 )
             else:
@@ -1279,17 +1234,6 @@ class Catalog:
         dataset = self.get_dataset(name)
         return self.warehouse.dataset_table_export_file_names(dataset, version)

-    def dataset_stats(self, name: str, version: Optional[int]) -> DatasetStats:
-        """
-        Returns tuple with dataset stats: total number of rows and total dataset size.
-        """
-        dataset = self.get_dataset(name)
-        dataset_version = dataset.get_version(version or dataset.latest_version)
-        return DatasetStats(
-            num_objects=dataset_version.num_objects,
-            size=dataset_version.size,
-        )
-
     def remove_dataset(
         self,
         name: str,
@@ -1360,8 +1304,6 @@ class Catalog:
         local_ds_version: Optional[int] = None,
         cp: bool = False,
         force: bool = False,
-        edatachain: bool = False,
-        edatachain_file: Optional[str] = None,
         *,
         client_config=None,
     ) -> None:
@@ -1373,8 +1315,6 @@ class Catalog:
                 [ds_uri],
                 output,
                 force=force,
-                no_edatachain_file=not edatachain,
-                edatachain_file=edatachain_file,
                 client_config=client_config,
             )
             print(f"Dataset {ds_uri} instantiated locally to {output}")
@@ -1439,19 +1379,12 @@ class Catalog:
         except DatasetNotFoundError:
             pass

-        stats_response = studio_client.dataset_stats(
-            remote_ds_name, remote_ds_version.version
-        )
-        if not stats_response.ok:
-            raise_remote_error(stats_response.message)
-        ds_stats = stats_response.data
-
         dataset_save_progress_bar = tqdm(
             desc=f"Saving dataset {remote_ds_uri} locally: ",
             unit=" rows",
             unit_scale=True,
             unit_divisor=1000,
-            total=
+            total=remote_ds_version.num_objects,  # type: ignore [union-attr]
             leave=False,
         )

@@ -1541,8 +1474,6 @@ class Catalog:
         recursive: bool = False,
         no_glob: bool = False,
         no_cp: bool = False,
-        edatachain: bool = False,
-        edatachain_file: Optional[str] = None,
         *,
         client_config=None,
     ) -> None:
@@ -1551,9 +1482,8 @@ class Catalog:
         them into the dataset folder.
         It also adds those files to a dataset in database, which is
         created if doesn't exist yet
-        Optionally, it creates a .edatachain file
         """
-        if not no_cp
+        if not no_cp:
             self.cp(
                 sources,
                 output,
@@ -1561,9 +1491,7 @@ class Catalog:
                 update=update,
                 recursive=recursive,
                 no_glob=no_glob,
-
-                no_edatachain_file=not edatachain,
-                edatachain_file=edatachain_file,
+                no_cp=no_cp,
                 client_config=client_config,
             )
         else:
@@ -1588,6 +1516,8 @@ class Catalog:
         output_hook: Callable[[str], None] = noop,
         params: Optional[dict[str, str]] = None,
         job_id: Optional[str] = None,
+        interrupt_timeout: Optional[int] = None,
+        terminate_timeout: Optional[int] = None,
     ) -> None:
         cmd = [python_executable, "-c", query_script]
         env = dict(env or os.environ)
@@ -1601,13 +1531,48 @@ class Catalog:
         if capture_output:
             popen_kwargs = {"stdout": subprocess.PIPE, "stderr": subprocess.STDOUT}

+        def raise_termination_signal(sig: int, _: Any) -> NoReturn:
+            raise TerminationSignal(sig)
+
+        thread: Optional[Thread] = None
         with subprocess.Popen(cmd, env=env, **popen_kwargs) as proc:  # noqa: S603
-
-
-
-
-
+            logger.info("Starting process %s", proc.pid)
+
+            orig_sigint_handler = signal.getsignal(signal.SIGINT)
+            # ignore SIGINT in the main process.
+            # In the terminal, SIGINTs are received by all the processes in
+            # the foreground process group, so the script will receive the signal too.
+            # (If we forward the signal to the child, it will receive it twice.)
+            signal.signal(signal.SIGINT, signal.SIG_IGN)

+            orig_sigterm_handler = signal.getsignal(signal.SIGTERM)
+            signal.signal(signal.SIGTERM, raise_termination_signal)
+            try:
+                if capture_output:
+                    args = (proc.stdout, output_hook)
+                    thread = Thread(target=_process_stream, args=args, daemon=True)
+                    thread.start()
+
+                proc.wait()
+            except TerminationSignal as exc:
+                signal.signal(signal.SIGTERM, orig_sigterm_handler)
+                signal.signal(signal.SIGINT, orig_sigint_handler)
+                logging.info("Shutting down process %s, received %r", proc.pid, exc)
+                # Rather than forwarding the signal to the child, we try to shut it down
+                # gracefully. This is because we consider the script to be interactive
+                # and special, so we give it time to cleanup before exiting.
+                shutdown_process(proc, interrupt_timeout, terminate_timeout)
+                if proc.returncode:
+                    raise QueryScriptCancelError(
+                        "Query script was canceled by user", return_code=proc.returncode
+                    ) from exc
+            finally:
+                signal.signal(signal.SIGTERM, orig_sigterm_handler)
+                signal.signal(signal.SIGINT, orig_sigint_handler)
+                if thread:
+                    thread.join()  # wait for the reader thread
+
+        logging.info("Process %s exited with return code %s", proc.pid, proc.returncode)
         if proc.returncode == QUERY_SCRIPT_CANCELED_EXIT_CODE:
             raise QueryScriptCancelError(
                 "Query script was canceled by user",
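The run_query changes above use a handler-swap pattern: record the original handlers, ignore SIGINT in the parent, turn SIGTERM into an exception, and always restore the originals in finally. A stripped-down sketch of the same pattern outside datachain (all names here are illustrative, not datachain API):

    import signal

    class Terminated(RuntimeError):
        pass

    def _raise_terminated(sig, _frame):
        raise Terminated(sig)

    orig_int = signal.getsignal(signal.SIGINT)
    orig_term = signal.getsignal(signal.SIGTERM)
    signal.signal(signal.SIGINT, signal.SIG_IGN)      # Ctrl-C is delivered to the child in the same group
    signal.signal(signal.SIGTERM, _raise_terminated)  # turn SIGTERM into an exception
    try:
        ...  # wait on the child process here
    except Terminated:
        ...  # shut the child down gracefully
    finally:
        signal.signal(signal.SIGINT, orig_int)        # always restore the original handlers
        signal.signal(signal.SIGTERM, orig_term)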
@@ -1626,17 +1591,14 @@ class Catalog:
         force: bool = False,
         update: bool = False,
         recursive: bool = False,
-
-        edatachain_only: bool = False,
-        no_edatachain_file: bool = False,
+        no_cp: bool = False,
         no_glob: bool = False,
         *,
-        client_config=None,
-    ) ->
+        client_config: Optional["dict"] = None,
+    ) -> None:
         """
         This function copies files from cloud sources to local destination directory
         If cloud source is not indexed, or has expired index, it runs indexing
-        It also creates .edatachain file by default, if not specified differently
         """
         client_config = client_config or self.client_config
         node_groups = self.enlist_sources_grouped(
@@ -1647,17 +1609,11 @@ class Catalog:
         )

         always_copy_dir_contents, copy_to_filename = prepare_output_for_cp(
-            node_groups, output, force,
-        )
-        dataset_file = check_output_dataset_file(
-            output, force, edatachain_file, no_edatachain_file
+            node_groups, output, force, no_cp
         )
-
         total_size, total_files = collect_nodes_for_cp(node_groups, recursive)
-
-
-            # Nothing selected to cp
-            return []
+        if not total_files:
+            return

         desc_max_len = max(len(output) + 16, 19)
         bar_format = (
@@ -1667,7 +1623,7 @@ class Catalog:
             "[{elapsed}<{remaining}, {rate_fmt:>8}]"
         )

-        if not
+        if not no_cp:
             with get_download_bar(bar_format, total_size) as pbar:
                 for node_group in node_groups:
                     node_group.download(recursive=recursive, pbar=pbar)
@@ -1679,21 +1635,10 @@ class Catalog:
             total_files,
             force,
             recursive,
-
+            no_cp,
             always_copy_dir_contents,
             copy_to_filename,
         )
-        if no_edatachain_file:
-            return []
-
-        metafile_data = compute_metafile_data(node_groups)
-        if metafile_data:
-            # Don't write the metafile if nothing was copied
-            print(f"Creating '{dataset_file}'")
-            with open(dataset_file, "w", encoding="utf-8") as fd:
-                yaml.dump(metafile_data, fd, sort_keys=False)
-
-        return metafile_data

     def du(
         self,
datachain/cli/__init__.py
CHANGED
@@ -11,7 +11,6 @@ from datachain.telemetry import telemetry
 from .commands import (
     clear_cache,
     completion,
-    dataset_stats,
     du,
     edit_dataset,
     garbage_collect,
@@ -47,9 +46,13 @@ def main(argv: Optional[list[str]] = None) -> int:
     logging_level = get_logging_level(args)
     logger.setLevel(logging_level)

-    client_config =
-
-
+    client_config = (
+        {
+            "anon": args.anon,
+        }
+        if getattr(args, "anon", False)
+        else {}
+    )

     if args.debug_sql:
         # This also sets this environment variable for any subprocesses
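With this change the CLI only passes an anon entry when the flag is actually set. A small sketch of the resulting configuration (the Namespace below stands in for parsed CLI args and is hypothetical):

    from argparse import Namespace

    args = Namespace(anon=True)
    client_config = {"anon": args.anon} if getattr(args, "anon", False) else {}
    # -> {"anon": True}; with anon=False (or missing) the config stays {}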
@@ -107,9 +110,6 @@ def handle_cp_command(args, catalog):
         force=bool(args.force),
         update=bool(args.update),
         recursive=bool(args.recursive),
-        edatachain_file=None,
-        edatachain_only=False,
-        no_edatachain_file=True,
         no_glob=args.no_glob,
     )

@@ -181,13 +181,6 @@ def handle_dataset_command(args, catalog):
             all=args.all,
             team=args.team,
         ),
-        "stats": lambda: dataset_stats(
-            catalog,
-            args.name,
-            args.version,
-            show_bytes=args.bytes,
-            si=args.si,
-        ),
     }

     handler = dataset_commands.get(args.datasets_cmd)
datachain/cli/commands/__init__.py
CHANGED
@@ -1,5 +1,4 @@
 from .datasets import (
-    dataset_stats,
     edit_dataset,
     list_datasets,
     list_datasets_local,
@@ -15,7 +14,6 @@ from .show import show
 __all__ = [
     "clear_cache",
     "completion",
-    "dataset_stats",
     "du",
     "edit_dataset",
     "garbage_collect",
datachain/cli/commands/datasets.py
CHANGED
@@ -3,8 +3,6 @@ from typing import TYPE_CHECKING, Optional

 from tabulate import tabulate

-from datachain import utils
-
 if TYPE_CHECKING:
     from datachain.catalog import Catalog

@@ -109,20 +107,3 @@ def edit_dataset(

     if (all or studio) and token:
         edit_studio_dataset(team, name, new_name, description, labels)
-
-
-def dataset_stats(
-    catalog: "Catalog",
-    name: str,
-    version: int,
-    show_bytes=False,
-    si=False,
-):
-    stats = catalog.dataset_stats(name, version)
-
-    if stats:
-        print(f"Number of objects: {stats.num_objects}")
-        if show_bytes:
-            print(f"Total objects size: {stats.size}")
-        else:
-            print(f"Total objects size: {utils.sizeof_fmt(stats.size, si=si): >7}")