datachain 0.30.5__py3-none-any.whl → 0.39.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. datachain/__init__.py +4 -0
  2. datachain/asyn.py +11 -12
  3. datachain/cache.py +5 -5
  4. datachain/catalog/__init__.py +0 -2
  5. datachain/catalog/catalog.py +276 -354
  6. datachain/catalog/dependency.py +164 -0
  7. datachain/catalog/loader.py +8 -3
  8. datachain/checkpoint.py +43 -0
  9. datachain/cli/__init__.py +10 -17
  10. datachain/cli/commands/__init__.py +1 -8
  11. datachain/cli/commands/datasets.py +42 -27
  12. datachain/cli/commands/ls.py +15 -15
  13. datachain/cli/commands/show.py +2 -2
  14. datachain/cli/parser/__init__.py +3 -43
  15. datachain/cli/parser/job.py +1 -1
  16. datachain/cli/parser/utils.py +1 -2
  17. datachain/cli/utils.py +2 -15
  18. datachain/client/azure.py +2 -2
  19. datachain/client/fsspec.py +34 -23
  20. datachain/client/gcs.py +3 -3
  21. datachain/client/http.py +157 -0
  22. datachain/client/local.py +11 -7
  23. datachain/client/s3.py +3 -3
  24. datachain/config.py +4 -8
  25. datachain/data_storage/db_engine.py +12 -6
  26. datachain/data_storage/job.py +2 -0
  27. datachain/data_storage/metastore.py +716 -137
  28. datachain/data_storage/schema.py +20 -27
  29. datachain/data_storage/serializer.py +105 -15
  30. datachain/data_storage/sqlite.py +114 -114
  31. datachain/data_storage/warehouse.py +140 -48
  32. datachain/dataset.py +109 -89
  33. datachain/delta.py +117 -42
  34. datachain/diff/__init__.py +25 -33
  35. datachain/error.py +24 -0
  36. datachain/func/aggregate.py +9 -11
  37. datachain/func/array.py +12 -12
  38. datachain/func/base.py +7 -4
  39. datachain/func/conditional.py +9 -13
  40. datachain/func/func.py +63 -45
  41. datachain/func/numeric.py +5 -7
  42. datachain/func/string.py +2 -2
  43. datachain/hash_utils.py +123 -0
  44. datachain/job.py +11 -7
  45. datachain/json.py +138 -0
  46. datachain/lib/arrow.py +18 -15
  47. datachain/lib/audio.py +60 -59
  48. datachain/lib/clip.py +14 -13
  49. datachain/lib/convert/python_to_sql.py +6 -10
  50. datachain/lib/convert/values_to_tuples.py +151 -53
  51. datachain/lib/data_model.py +23 -19
  52. datachain/lib/dataset_info.py +7 -7
  53. datachain/lib/dc/__init__.py +2 -1
  54. datachain/lib/dc/csv.py +22 -26
  55. datachain/lib/dc/database.py +37 -34
  56. datachain/lib/dc/datachain.py +518 -324
  57. datachain/lib/dc/datasets.py +38 -30
  58. datachain/lib/dc/hf.py +16 -20
  59. datachain/lib/dc/json.py +17 -18
  60. datachain/lib/dc/listings.py +5 -8
  61. datachain/lib/dc/pandas.py +3 -6
  62. datachain/lib/dc/parquet.py +33 -21
  63. datachain/lib/dc/records.py +9 -13
  64. datachain/lib/dc/storage.py +103 -65
  65. datachain/lib/dc/storage_pattern.py +251 -0
  66. datachain/lib/dc/utils.py +17 -14
  67. datachain/lib/dc/values.py +3 -6
  68. datachain/lib/file.py +187 -50
  69. datachain/lib/hf.py +7 -5
  70. datachain/lib/image.py +13 -13
  71. datachain/lib/listing.py +5 -5
  72. datachain/lib/listing_info.py +1 -2
  73. datachain/lib/meta_formats.py +2 -3
  74. datachain/lib/model_store.py +20 -8
  75. datachain/lib/namespaces.py +59 -7
  76. datachain/lib/projects.py +51 -9
  77. datachain/lib/pytorch.py +31 -23
  78. datachain/lib/settings.py +188 -85
  79. datachain/lib/signal_schema.py +302 -64
  80. datachain/lib/text.py +8 -7
  81. datachain/lib/udf.py +103 -63
  82. datachain/lib/udf_signature.py +59 -34
  83. datachain/lib/utils.py +20 -0
  84. datachain/lib/video.py +3 -4
  85. datachain/lib/webdataset.py +31 -36
  86. datachain/lib/webdataset_laion.py +15 -16
  87. datachain/listing.py +12 -5
  88. datachain/model/bbox.py +3 -1
  89. datachain/namespace.py +22 -3
  90. datachain/node.py +6 -6
  91. datachain/nodes_thread_pool.py +0 -1
  92. datachain/plugins.py +24 -0
  93. datachain/project.py +4 -4
  94. datachain/query/batch.py +10 -12
  95. datachain/query/dataset.py +376 -194
  96. datachain/query/dispatch.py +112 -84
  97. datachain/query/metrics.py +3 -4
  98. datachain/query/params.py +2 -3
  99. datachain/query/queue.py +2 -1
  100. datachain/query/schema.py +7 -6
  101. datachain/query/session.py +190 -33
  102. datachain/query/udf.py +9 -6
  103. datachain/remote/studio.py +90 -53
  104. datachain/script_meta.py +12 -12
  105. datachain/sql/sqlite/base.py +37 -25
  106. datachain/sql/sqlite/types.py +1 -1
  107. datachain/sql/types.py +36 -5
  108. datachain/studio.py +49 -40
  109. datachain/toolkit/split.py +31 -10
  110. datachain/utils.py +39 -48
  111. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/METADATA +26 -38
  112. datachain-0.39.0.dist-info/RECORD +173 -0
  113. datachain/cli/commands/query.py +0 -54
  114. datachain/query/utils.py +0 -36
  115. datachain-0.30.5.dist-info/RECORD +0 -168
  116. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/WHEEL +0 -0
  117. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
  118. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
  119. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
@@ -1,28 +1,16 @@
  import io
- import json
  import logging
  import os
  import os.path
  import posixpath
- import signal
- import subprocess
- import sys
  import time
  import traceback
- from collections.abc import Iterable, Iterator, Mapping, Sequence
+ from collections.abc import Callable, Iterable, Iterator, Sequence
+ from contextlib import contextmanager, suppress
  from copy import copy
  from dataclasses import dataclass
  from functools import cached_property, reduce
- from threading import Thread
- from typing import (
-     IO,
-     TYPE_CHECKING,
-     Any,
-     Callable,
-     NoReturn,
-     Optional,
-     Union,
- )
+ from typing import TYPE_CHECKING, Any
  from uuid import uuid4

  import sqlalchemy as sa
@@ -43,6 +31,7 @@ from datachain.dataset import (
      create_dataset_uri,
      parse_dataset_name,
      parse_dataset_uri,
+     parse_schema,
  )
  from datachain.error import (
      DataChainError,
@@ -51,8 +40,6 @@ from datachain.error import (
      DatasetVersionNotFoundError,
      NamespaceNotFoundError,
      ProjectNotFoundError,
-     QueryScriptCancelError,
-     QueryScriptRunError,
  )
  from datachain.lib.listing import get_listing
  from datachain.node import DirType, Node, NodeWithPath
@@ -62,12 +49,10 @@ from datachain.sql.types import DateTime, SQLType
  from datachain.utils import DataChainDir

  from .datasource import DataSource
+ from .dependency import build_dependency_hierarchy, populate_nested_dependencies

  if TYPE_CHECKING:
-     from datachain.data_storage import (
-         AbstractMetastore,
-         AbstractWarehouse,
-     )
+     from datachain.data_storage import AbstractMetastore, AbstractWarehouse
      from datachain.dataset import DatasetListVersion
      from datachain.job import Job
      from datachain.lib.listing_info import ListingInfo
@@ -81,8 +66,6 @@ TTL_INT = 4 * 60 * 60

  INDEX_INTERNAL_ERROR_MESSAGE = "Internal error on indexing"
  DATASET_INTERNAL_ERROR_MESSAGE = "Internal error on creating dataset"
- # exit code we use if last statement in query script is not instance of DatasetQuery
- QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE = 10
  # exit code we use if query script was canceled
  QUERY_SCRIPT_CANCELED_EXIT_CODE = 11
  QUERY_SCRIPT_SIGTERM_EXIT_CODE = -15  # if query script was terminated by SIGTERM
@@ -94,71 +77,11 @@ PULL_DATASET_SLEEP_INTERVAL = 0.1 # sleep time while waiting for chunk to be av
  PULL_DATASET_CHECK_STATUS_INTERVAL = 20  # interval to check export status in Studio


- def noop(_: str):
-     pass
-
-
- class TerminationSignal(RuntimeError):  # noqa: N818
-     def __init__(self, signal):
-         self.signal = signal
-         super().__init__("Received termination signal", signal)
-
-     def __repr__(self):
-         return f"{self.__class__.__name__}({self.signal})"
-
-
- if sys.platform == "win32":
-     SIGINT = signal.CTRL_C_EVENT
- else:
-     SIGINT = signal.SIGINT
-
-
  def is_namespace_local(namespace_name) -> bool:
      """Checks if namespace is from local environment, i.e. is `local`"""
      return namespace_name == "local"


- def shutdown_process(
-     proc: subprocess.Popen,
-     interrupt_timeout: Optional[int] = None,
-     terminate_timeout: Optional[int] = None,
- ) -> int:
-     """Shut down the process gracefully with SIGINT -> SIGTERM -> SIGKILL."""
-
-     logger.info("sending interrupt signal to the process %s", proc.pid)
-     proc.send_signal(SIGINT)
-
-     logger.info("waiting for the process %s to finish", proc.pid)
-     try:
-         return proc.wait(interrupt_timeout)
-     except subprocess.TimeoutExpired:
-         logger.info(
-             "timed out waiting, sending terminate signal to the process %s", proc.pid
-         )
-         proc.terminate()
-         try:
-             return proc.wait(terminate_timeout)
-         except subprocess.TimeoutExpired:
-             logger.info("timed out waiting, killing the process %s", proc.pid)
-             proc.kill()
-             return proc.wait()
-
-
- def _process_stream(stream: "IO[bytes]", callback: Callable[[str], None]) -> None:
-     buffer = b""
-     while byt := stream.read(1):  # Read one byte at a time
-         buffer += byt
-
-         if byt in (b"\n", b"\r"):  # Check for newline or carriage return
-             line = buffer.decode("utf-8")
-             callback(line)
-             buffer = b""  # Clear buffer for next line
-
-     if buffer:  # Handle any remaining data in the buffer
-         line = buffer.decode("utf-8")
-         callback(line)
-
-
  class DatasetRowsFetcher(NodesThreadPool):
      def __init__(
          self,
@@ -168,7 +91,7 @@ class DatasetRowsFetcher(NodesThreadPool):
          remote_ds_version: str,
          local_ds: DatasetRecord,
          local_ds_version: str,
-         schema: dict[str, Union[SQLType, type[SQLType]]],
+         schema: dict[str, SQLType | type[SQLType]],
          max_threads: int = PULL_DATASET_MAX_THREADS,
          progress_bar=None,
      ):
@@ -183,7 +106,7 @@ class DatasetRowsFetcher(NodesThreadPool):
          self.local_ds = local_ds
          self.local_ds_version = local_ds_version
          self.schema = schema
-         self.last_status_check: Optional[float] = None
+         self.last_status_check: float | None = None
          self.studio_client = StudioClient()
          self.progress_bar = progress_bar

@@ -287,16 +210,16 @@ class DatasetRowsFetcher(NodesThreadPool):
  class NodeGroup:
      """Class for a group of nodes from the same source"""

-     listing: Optional["Listing"]
-     client: "Client"
+     listing: "Listing | None"
+     client: Client
      sources: list[DataSource]

      # The source path within the bucket
      # (not including the bucket name or s3:// prefix)
      source_path: str = ""
-     dataset_name: Optional[str] = None
-     dataset_version: Optional[str] = None
-     instantiated_nodes: Optional[list[NodeWithPath]] = None
+     dataset_name: str | None = None
+     dataset_version: str | None = None
+     instantiated_nodes: list[NodeWithPath] | None = None

      @property
      def is_dataset(self) -> bool:
@@ -317,13 +240,23 @@ class NodeGroup:
          if self.sources:
              self.client.fetch_nodes(self.iternodes(recursive), shared_progress_bar=pbar)

+     def close(self) -> None:
+         if self.listing:
+             self.listing.close()
+
+     def __enter__(self) -> "NodeGroup":
+         return self
+
+     def __exit__(self, exc_type, exc_value, traceback) -> None:
+         self.close()
+

  def prepare_output_for_cp(
      node_groups: list[NodeGroup],
      output: str,
      force: bool = False,
      no_cp: bool = False,
- ) -> tuple[bool, Optional[str]]:
+ ) -> tuple[bool, str | None]:
      total_node_count = 0
      for node_group in node_groups:
          if not node_group.sources:
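Note on the NodeGroup change above: close() shuts the group's Listing, and __enter__/__exit__ make a NodeGroup usable in a with block. A minimal usage sketch, not taken from the package itself (the node_groups list and the download arguments are assumptions for illustration):

    # hypothetical sketch: make sure each group's listing is closed after downloading
    for node_group in node_groups:
        with node_group:  # __exit__ calls close(), which closes node_group.listing
            node_group.download(recursive=True, pbar=None)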
@@ -372,7 +305,7 @@ def collect_nodes_for_cp(

      # Collect all sources to process
      for node_group in node_groups:
-         listing: Optional[Listing] = node_group.listing
+         listing: Listing | None = node_group.listing
          valid_sources: list[DataSource] = []
          for dsrc in node_group.sources:
              if dsrc.is_single_object():
@@ -416,7 +349,7 @@ def instantiate_node_groups(
      recursive: bool = False,
      virtual_only: bool = False,
      always_copy_dir_contents: bool = False,
-     copy_to_filename: Optional[str] = None,
+     copy_to_filename: str | None = None,
  ) -> None:
      instantiate_progress_bar = (
          None
@@ -444,7 +377,7 @@ def instantiate_node_groups(
      for node_group in node_groups:
          if not node_group.sources:
              continue
-         listing: Optional[Listing] = node_group.listing
+         listing: Listing | None = node_group.listing
          source_path: str = node_group.source_path

          copy_dir_contents = always_copy_dir_contents or source_path.endswith("/")
@@ -527,10 +460,8 @@ class Catalog:
          warehouse: "AbstractWarehouse",
          cache_dir=None,
          tmp_dir=None,
-         client_config: Optional[dict[str, Any]] = None,
-         warehouse_ready_callback: Optional[
-             Callable[["AbstractWarehouse"], None]
-         ] = None,
+         client_config: dict[str, Any] | None = None,
+         warehouse_ready_callback: Callable[["AbstractWarehouse"], None] | None = None,
          in_memory: bool = False,
      ):
          datachain_dir = DataChainDir(cache=cache_dir, tmp=tmp_dir)
@@ -545,6 +476,7 @@ class Catalog:
          }
          self._warehouse_ready_callback = warehouse_ready_callback
          self.in_memory = in_memory
+         self._owns_connections = True  # False for copies, prevents double-close

      @cached_property
      def warehouse(self) -> "AbstractWarehouse":
@@ -566,13 +498,36 @@ class Catalog:
          }

      def copy(self, cache=True, db=True):
+         """
+         Create a shallow copy of this catalog.
+
+         The copy shares metastore and warehouse with the original but will not
+         close them - only the original catalog owns the connections.
+         """
          result = copy(self)
+         result._owns_connections = False
          if not db:
              result.metastore = None
              result._warehouse = None
              result.warehouse = None
          return result

+     def close(self) -> None:
+         if not self._owns_connections:
+             return
+         if self.metastore is not None:
+             with suppress(Exception):
+                 self.metastore.close_on_exit()
+         if self._warehouse is not None:
+             with suppress(Exception):
+                 self._warehouse.close_on_exit()
+
+     def __enter__(self) -> "Catalog":
+         return self
+
+     def __exit__(self, exc_type, exc_value, traceback) -> None:
+         self.close()
+
      @classmethod
      def generate_query_dataset_name(cls) -> str:
          return f"{QUERY_DATASET_PREFIX}_{uuid4().hex}"
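Note on the Catalog changes above: close() releases the metastore and warehouse via their close_on_exit() methods, __enter__/__exit__ turn the catalog into a context manager, and copies made with copy() clear _owns_connections so they never close connections shared with the original. A minimal usage sketch; the get_catalog() helper from datachain.catalog.loader is assumed here and is not part of this diff:

    from datachain.catalog.loader import get_catalog

    # hypothetical sketch: scope the catalog's DB connections to a block
    with get_catalog() as catalog:
        dataset = catalog.get_dataset("my-dataset")
    # __exit__ -> close() -> metastore.close_on_exit() / warehouse.close_on_exit()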
@@ -592,7 +547,7 @@ class Catalog:
          client_config=None,
          column="file",
          skip_indexing=False,
-     ) -> tuple[Optional["Listing"], "Client", str]:
+     ) -> tuple["Listing | None", Client, str]:
          from datachain import read_storage
          from datachain.listing import Listing

@@ -626,6 +581,7 @@ class Catalog:
              **kwargs,
          )

+     @contextmanager
      def enlist_sources(
          self,
          sources: list[str],
@@ -633,34 +589,41 @@
          skip_indexing=False,
          client_config=None,
          only_index=False,
-     ) -> Optional[list["DataSource"]]:
-         enlisted_sources = []
-         for src in sources:  # Opt: parallel
-             listing, client, file_path = self.enlist_source(
-                 src,
-                 update,
-                 client_config=client_config or self.client_config,
-                 skip_indexing=skip_indexing,
-             )
-             enlisted_sources.append((listing, client, file_path))
-
-         if only_index:
-             # sometimes we don't really need listing result (e.g on indexing process)
-             # so this is to improve performance
-             return None
-
-         dsrc_all: list[DataSource] = []
-         for listing, client, file_path in enlisted_sources:
-             if not listing:
-                 nodes = [Node.from_file(client.get_file_info(file_path))]
-                 dir_only = False
-             else:
-                 nodes = listing.expand_path(file_path)
-                 dir_only = file_path.endswith("/")
-             dsrc_all.extend(
-                 DataSource(listing, client, node, dir_only) for node in nodes
-             )
-         return dsrc_all
+     ) -> Iterator[list["DataSource"] | None]:
+         enlisted_sources: list[tuple[Listing | None, Client, str]] = []
+         try:
+             for src in sources:  # Opt: parallel
+                 listing, client, file_path = self.enlist_source(
+                     src,
+                     update,
+                     client_config=client_config or self.client_config,
+                     skip_indexing=skip_indexing,
+                 )
+                 enlisted_sources.append((listing, client, file_path))
+
+             if only_index:
+                 # sometimes we don't really need listing result (e.g. on indexing
+                 # process) so this is to improve performance
+                 yield None
+                 return
+
+             dsrc_all: list[DataSource] = []
+             for listing, client, file_path in enlisted_sources:
+                 if not listing:
+                     nodes = [Node.from_file(client.get_file_info(file_path))]
+                     dir_only = False
+                 else:
+                     nodes = listing.expand_path(file_path)
+                     dir_only = file_path.endswith("/")
+                 dsrc_all.extend(
+                     DataSource(listing, client, node, dir_only) for node in nodes
+                 )
+             yield dsrc_all
+         finally:
+             for listing, _, _ in enlisted_sources:
+                 if listing:
+                     with suppress(Exception):
+                         listing.close()

      def enlist_sources_grouped(
          self,
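Note on the enlist_sources change above: it is now a @contextmanager generator that yields either None (when only_index=True) or the list of DataSource objects, and closes every opened Listing in its finally block. Call sites later in this diff (ls, du, find, index) switch to with blocks accordingly. A minimal sketch of the new calling convention; the URI and the positional update flag are illustrative assumptions:

    # hypothetical sketch mirroring the call sites shown further down
    with catalog.enlist_sources(["s3://bucket/prefix/"], False) as data_sources:
        if data_sources is not None:
            for src in data_sources:
                print(src.node.path)
    # every Listing opened for these sources is closed on exit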
@@ -679,7 +642,7 @@ class Catalog:
          enlisted_sources: list[tuple[bool, bool, Any]] = []
          client_config = client_config or self.client_config
          for src in sources:  # Opt: parallel
-             listing: Optional[Listing]
+             listing: Listing | None
              if src.startswith("ds://"):
                  ds_name, ds_version = parse_dataset_uri(src)
                  ds_namespace, ds_project, ds_name = parse_dataset_name(ds_name)
@@ -785,19 +748,20 @@ class Catalog:
      def create_dataset(
          self,
          name: str,
-         project: Optional[Project] = None,
-         version: Optional[str] = None,
+         project: Project | None = None,
+         version: str | None = None,
          *,
          columns: Sequence[Column],
-         feature_schema: Optional[dict] = None,
+         feature_schema: dict | None = None,
          query_script: str = "",
-         create_rows: Optional[bool] = True,
-         validate_version: Optional[bool] = True,
-         listing: Optional[bool] = False,
-         uuid: Optional[str] = None,
-         description: Optional[str] = None,
-         attrs: Optional[list[str]] = None,
-         update_version: Optional[str] = "patch",
+         create_rows: bool | None = True,
+         validate_version: bool | None = True,
+         listing: bool | None = False,
+         uuid: str | None = None,
+         description: str | None = None,
+         attrs: list[str] | None = None,
+         update_version: str | None = "patch",
+         job_id: str | None = None,
      ) -> "DatasetRecord":
          """
          Creates new dataset of a specific version.
@@ -863,7 +827,7 @@ class Catalog:
                  f"Version {version} must be higher than the current latest one"
              )

-         return self.create_new_dataset_version(
+         return self.create_dataset_version(
              dataset,
              version,
              feature_schema=feature_schema,
@@ -871,9 +835,10 @@ class Catalog:
              create_rows_table=create_rows,
              columns=columns,
              uuid=uuid,
+             job_id=job_id,
          )

-     def create_new_dataset_version(
+     def create_dataset_version(
          self,
          dataset: DatasetRecord,
          version: str,
@@ -886,8 +851,8 @@ class Catalog:
          error_stack="",
          script_output="",
          create_rows_table=True,
-         job_id: Optional[str] = None,
-         uuid: Optional[str] = None,
+         job_id: str | None = None,
+         uuid: str | None = None,
      ) -> DatasetRecord:
          """
          Creates dataset version if it doesn't exist.
@@ -901,7 +866,7 @@ class Catalog:
          dataset = self.metastore.create_dataset_version(
              dataset,
              version,
-             status=DatasetStatus.PENDING,
+             status=DatasetStatus.CREATED,
              sources=sources,
              feature_schema=feature_schema,
              query_script=query_script,
@@ -971,7 +936,7 @@ class Catalog:
          return dataset_updated

      def remove_dataset_version(
-         self, dataset: DatasetRecord, version: str, drop_rows: Optional[bool] = True
+         self, dataset: DatasetRecord, version: str, drop_rows: bool | None = True
      ) -> None:
          """
          Deletes one single dataset version.
@@ -999,7 +964,7 @@ class Catalog:
          self,
          name: str,
          sources: list[str],
-         project: Optional[Project] = None,
+         project: Project | None = None,
          client_config=None,
          recursive=False,
      ) -> DatasetRecord:
@@ -1068,8 +1033,8 @@ class Catalog:
      def get_full_dataset_name(
          self,
          name: str,
-         project_name: Optional[str] = None,
-         namespace_name: Optional[str] = None,
+         project_name: str | None = None,
+         namespace_name: str | None = None,
      ) -> tuple[str, str, str]:
          """
          Returns dataset name together with separated namespace and project name.
@@ -1101,8 +1066,8 @@ class Catalog:
      def get_dataset(
          self,
          name: str,
-         namespace_name: Optional[str] = None,
-         project_name: Optional[str] = None,
+         namespace_name: str | None = None,
+         project_name: str | None = None,
      ) -> DatasetRecord:
          from datachain.lib.listing import is_listing_dataset

@@ -1122,7 +1087,7 @@ class Catalog:
          name: str,
          namespace_name: str,
          project_name: str,
-         version: Optional[str] = None,
+         version: str | None = None,
          pull_dataset: bool = False,
          update: bool = False,
      ) -> DatasetRecord:
@@ -1209,49 +1174,73 @@ class Catalog:
          assert isinstance(dataset_info, dict)
          return DatasetRecord.from_dict(dataset_info)

+     def get_dataset_dependencies_by_ids(
+         self,
+         dataset_id: int,
+         version_id: int,
+         indirect: bool = True,
+     ) -> list[DatasetDependency | None]:
+         dependency_nodes = self.metastore.get_dataset_dependency_nodes(
+             dataset_id=dataset_id,
+             version_id=version_id,
+         )
+
+         if not dependency_nodes:
+             return []
+
+         dependency_map, children_map = build_dependency_hierarchy(dependency_nodes)
+
+         root_key = (dataset_id, version_id)
+         if root_key not in children_map:
+             return []
+
+         root_dependency_ids = children_map[root_key]
+         root_dependencies = [dependency_map[dep_id] for dep_id in root_dependency_ids]
+
+         if indirect:
+             for dependency in root_dependencies:
+                 if dependency is not None:
+                     populate_nested_dependencies(
+                         dependency, dependency_nodes, dependency_map, children_map
+                     )
+
+         return root_dependencies
+
      def get_dataset_dependencies(
          self,
          name: str,
          version: str,
-         namespace_name: Optional[str] = None,
-         project_name: Optional[str] = None,
+         namespace_name: str | None = None,
+         project_name: str | None = None,
          indirect=False,
-     ) -> list[Optional[DatasetDependency]]:
+     ) -> list[DatasetDependency | None]:
          dataset = self.get_dataset(
              name,
              namespace_name=namespace_name,
              project_name=project_name,
          )
-
-         direct_dependencies = self.metastore.get_direct_dataset_dependencies(
-             dataset, version
-         )
+         dataset_version = dataset.get_version(version)
+         dataset_id = dataset.id
+         dataset_version_id = dataset_version.id

          if not indirect:
-             return direct_dependencies
-
-         for d in direct_dependencies:
-             if not d:
-                 # dependency has been removed
-                 continue
-             if d.is_dataset:
-                 # only datasets can have dependencies
-                 d.dependencies = self.get_dataset_dependencies(
-                     d.name,
-                     d.version,
-                     namespace_name=d.namespace,
-                     project_name=d.project,
-                     indirect=indirect,
-                 )
+             return self.metastore.get_direct_dataset_dependencies(
+                 dataset,
+                 version,
+             )

-         return direct_dependencies
+         return self.get_dataset_dependencies_by_ids(
+             dataset_id,
+             dataset_version_id,
+             indirect,
+         )

      def ls_datasets(
          self,
-         prefix: Optional[str] = None,
+         prefix: str | None = None,
          include_listing: bool = False,
          studio: bool = False,
-         project: Optional[Project] = None,
+         project: Project | None = None,
      ) -> Iterator[DatasetListRecord]:
          from datachain.remote.studio import StudioClient

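Note on the dependency changes above: get_dataset_dependencies_by_ids fetches all dependency nodes with a single metastore.get_dataset_dependency_nodes() call and reassembles the tree with build_dependency_hierarchy / populate_nested_dependencies from the new datachain/catalog/dependency.py, replacing the per-dependency recursion that get_dataset_dependencies used before. A minimal sketch of the public wrapper; the dataset name and version are placeholders:

    # hypothetical call using the signature shown in this diff
    deps = catalog.get_dataset_dependencies("my-dataset", "1.0.0", indirect=True)
    for dep in deps:
        if dep is not None:  # None marks a dependency that was removed
            print(dep.name, dep.version, dep.dependencies)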
@@ -1283,12 +1272,12 @@ class Catalog:

      def list_datasets_versions(
          self,
-         prefix: Optional[str] = None,
+         prefix: str | None = None,
          include_listing: bool = False,
          with_job: bool = True,
          studio: bool = False,
-         project: Optional[Project] = None,
-     ) -> Iterator[tuple[DatasetListRecord, "DatasetListVersion", Optional["Job"]]]:
+         project: Project | None = None,
+     ) -> Iterator[tuple[DatasetListRecord, "DatasetListVersion", "Job | None"]]:
          """Iterate over all dataset versions with related jobs."""
          datasets = list(
              self.ls_datasets(
@@ -1316,7 +1305,7 @@ class Catalog:
              for v in d.versions
          )

-     def listings(self, prefix: Optional[str] = None) -> list["ListingInfo"]:
+     def listings(self, prefix: str | None = None) -> list["ListingInfo"]:
          """
          Returns list of ListingInfo objects which are representing specific
          storage listing datasets
@@ -1367,9 +1356,9 @@ class Catalog:
          self,
          source: str,
          path: str,
-         version_id: Optional[str] = None,
+         version_id: str | None = None,
          client_config=None,
-         content_disposition: Optional[str] = None,
+         content_disposition: str | None = None,
          **kwargs,
      ) -> str:
          client_config = client_config or self.client_config
@@ -1388,7 +1377,7 @@ class Catalog:
          bucket_uri: str,
          name: str,
          version: str,
-         project: Optional[Project] = None,
+         project: Project | None = None,
          client_config=None,
      ) -> list[str]:
          dataset = self.get_dataset(
@@ -1402,7 +1391,7 @@ class Catalog:
          )

      def dataset_table_export_file_names(
-         self, name: str, version: str, project: Optional[Project] = None
+         self, name: str, version: str, project: Project | None = None
      ) -> list[str]:
          dataset = self.get_dataset(
              name,
@@ -1414,9 +1403,9 @@ class Catalog:
      def remove_dataset(
          self,
          name: str,
-         project: Optional[Project] = None,
-         version: Optional[str] = None,
-         force: Optional[bool] = False,
+         project: Project | None = None,
+         version: str | None = None,
+         force: bool | None = False,
      ):
          dataset = self.get_dataset(
              name,
@@ -1444,10 +1433,10 @@ class Catalog:
      def edit_dataset(
          self,
          name: str,
-         project: Optional[Project] = None,
-         new_name: Optional[str] = None,
-         description: Optional[str] = None,
-         attrs: Optional[list[str]] = None,
+         project: Project | None = None,
+         new_name: str | None = None,
+         description: str | None = None,
+         attrs: list[str] | None = None,
      ) -> DatasetRecord:
          update_data = {}
          if new_name:
@@ -1474,22 +1463,24 @@ class Catalog:
          *,
          client_config=None,
      ) -> Iterator[tuple[DataSource, Iterable[tuple]]]:
-         data_sources = self.enlist_sources(
+         with self.enlist_sources(
              sources,
              update,
              skip_indexing=skip_indexing,
              client_config=client_config or self.client_config,
-         )
+         ) as data_sources:
+             if data_sources is None:
+                 return

-         for source in data_sources:  # type: ignore [union-attr]
-             yield source, source.ls(fields)
+             for source in data_sources:
+                 yield source, source.ls(fields)

      def pull_dataset(  # noqa: C901, PLR0915
          self,
          remote_ds_uri: str,
-         output: Optional[str] = None,
-         local_ds_name: Optional[str] = None,
-         local_ds_version: Optional[str] = None,
+         output: str | None = None,
+         local_ds_name: str | None = None,
+         local_ds_version: str | None = None,
          cp: bool = False,
          force: bool = False,
          *,
@@ -1636,7 +1627,7 @@ class Catalog:
              leave=False,
          )

-         schema = DatasetRecord.parse_schema(remote_ds_version.schema)
+         schema = parse_schema(remote_ds_version.schema)

          local_ds = self.create_dataset(
              local_ds_name,
@@ -1746,11 +1737,12 @@ class Catalog:
          else:
              # since we don't call cp command, which does listing implicitly,
              # it needs to be done here
-             self.enlist_sources(
+             with self.enlist_sources(
                  sources,
                  update,
                  client_config=client_config or self.client_config,
-             )
+             ):
+                 pass

          self.create_dataset_from_sources(
              output,
@@ -1760,86 +1752,6 @@ class Catalog:
              recursive=recursive,
          )

-     def query(
-         self,
-         query_script: str,
-         env: Optional[Mapping[str, str]] = None,
-         python_executable: str = sys.executable,
-         capture_output: bool = False,
-         output_hook: Callable[[str], None] = noop,
-         params: Optional[dict[str, str]] = None,
-         job_id: Optional[str] = None,
-         interrupt_timeout: Optional[int] = None,
-         terminate_timeout: Optional[int] = None,
-     ) -> None:
-         cmd = [python_executable, "-c", query_script]
-         env = dict(env or os.environ)
-         env.update(
-             {
-                 "DATACHAIN_QUERY_PARAMS": json.dumps(params or {}),
-                 "DATACHAIN_JOB_ID": job_id or "",
-             },
-         )
-         popen_kwargs: dict[str, Any] = {}
-         if capture_output:
-             popen_kwargs = {"stdout": subprocess.PIPE, "stderr": subprocess.STDOUT}
-
-         def raise_termination_signal(sig: int, _: Any) -> NoReturn:
-             raise TerminationSignal(sig)
-
-         thread: Optional[Thread] = None
-         with subprocess.Popen(cmd, env=env, **popen_kwargs) as proc:  # noqa: S603
-             logger.info("Starting process %s", proc.pid)
-
-             orig_sigint_handler = signal.getsignal(signal.SIGINT)
-             # ignore SIGINT in the main process.
-             # In the terminal, SIGINTs are received by all the processes in
-             # the foreground process group, so the script will receive the signal too.
-             # (If we forward the signal to the child, it will receive it twice.)
-             signal.signal(signal.SIGINT, signal.SIG_IGN)
-
-             orig_sigterm_handler = signal.getsignal(signal.SIGTERM)
-             signal.signal(signal.SIGTERM, raise_termination_signal)
-             try:
-                 if capture_output:
-                     args = (proc.stdout, output_hook)
-                     thread = Thread(target=_process_stream, args=args, daemon=True)
-                     thread.start()
-
-                 proc.wait()
-             except TerminationSignal as exc:
-                 signal.signal(signal.SIGTERM, orig_sigterm_handler)
-                 signal.signal(signal.SIGINT, orig_sigint_handler)
-                 logger.info("Shutting down process %s, received %r", proc.pid, exc)
-                 # Rather than forwarding the signal to the child, we try to shut it down
-                 # gracefully. This is because we consider the script to be interactive
-                 # and special, so we give it time to cleanup before exiting.
-                 shutdown_process(proc, interrupt_timeout, terminate_timeout)
-                 if proc.returncode:
-                     raise QueryScriptCancelError(
-                         "Query script was canceled by user", return_code=proc.returncode
-                     ) from exc
-             finally:
-                 signal.signal(signal.SIGTERM, orig_sigterm_handler)
-                 signal.signal(signal.SIGINT, orig_sigint_handler)
-                 if thread:
-                     thread.join()  # wait for the reader thread
-
-         logger.info("Process %s exited with return code %s", proc.pid, proc.returncode)
-         if proc.returncode in (
-             QUERY_SCRIPT_CANCELED_EXIT_CODE,
-             QUERY_SCRIPT_SIGTERM_EXIT_CODE,
-         ):
-             raise QueryScriptCancelError(
-                 "Query script was canceled by user",
-                 return_code=proc.returncode,
-             )
-         if proc.returncode:
-             raise QueryScriptRunError(
-                 f"Query script exited with error code {proc.returncode}",
-                 return_code=proc.returncode,
-             )
-
      def cp(
          self,
          sources: list[str],
@@ -1850,7 +1762,7 @@ class Catalog:
          no_cp: bool = False,
          no_glob: bool = False,
          *,
-         client_config: Optional["dict"] = None,
+         client_config: dict | None = None,
      ) -> None:
          """
          This function copies files from cloud sources to local destination directory
@@ -1863,38 +1775,42 @@ class Catalog:
              no_glob,
              client_config=client_config,
          )
+         try:
+             always_copy_dir_contents, copy_to_filename = prepare_output_for_cp(
+                 node_groups, output, force, no_cp
+             )
+             total_size, total_files = collect_nodes_for_cp(node_groups, recursive)
+             if not total_files:
+                 return

-         always_copy_dir_contents, copy_to_filename = prepare_output_for_cp(
-             node_groups, output, force, no_cp
-         )
-         total_size, total_files = collect_nodes_for_cp(node_groups, recursive)
-         if not total_files:
-             return
-
-         desc_max_len = max(len(output) + 16, 19)
-         bar_format = (
-             "{desc:<"
-             f"{desc_max_len}"
-             "}{percentage:3.0f}%|{bar}| {n_fmt:>5}/{total_fmt:<5} "
-             "[{elapsed}<{remaining}, {rate_fmt:>8}]"
-         )
+             desc_max_len = max(len(output) + 16, 19)
+             bar_format = (
+                 "{desc:<"
+                 f"{desc_max_len}"
+                 "}{percentage:3.0f}%|{bar}| {n_fmt:>5}/{total_fmt:<5} "
+                 "[{elapsed}<{remaining}, {rate_fmt:>8}]"
+             )

-         if not no_cp:
-             with get_download_bar(bar_format, total_size) as pbar:
-                 for node_group in node_groups:
-                     node_group.download(recursive=recursive, pbar=pbar)
+             if not no_cp:
+                 with get_download_bar(bar_format, total_size) as pbar:
+                     for node_group in node_groups:
+                         node_group.download(recursive=recursive, pbar=pbar)

-         instantiate_node_groups(
-             node_groups,
-             output,
-             bar_format,
-             total_files,
-             force,
-             recursive,
-             no_cp,
-             always_copy_dir_contents,
-             copy_to_filename,
-         )
+             instantiate_node_groups(
+                 node_groups,
+                 output,
+                 bar_format,
+                 total_files,
+                 force,
+                 recursive,
+                 no_cp,
+                 always_copy_dir_contents,
+                 copy_to_filename,
+             )
+         finally:
+             for node_group in node_groups:
+                 with suppress(Exception):
+                     node_group.close()

      def du(
          self,
@@ -1904,24 +1820,26 @@ class Catalog:
          *,
          client_config=None,
      ) -> Iterable[tuple[str, float]]:
-         sources = self.enlist_sources(
+         with self.enlist_sources(
              sources,
              update,
              client_config=client_config or self.client_config,
-         )
+         ) as matched_sources:
+             if matched_sources is None:
+                 return

-         def du_dirs(src, node, subdepth):
-             if subdepth > 0:
-                 subdirs = src.listing.get_dirs_by_parent_path(node.path)
-                 for sd in subdirs:
-                     yield from du_dirs(src, sd, subdepth - 1)
-             yield (
-                 src.get_node_full_path(node),
-                 src.listing.du(node)[0],
-             )
+             def du_dirs(src, node, subdepth):
+                 if subdepth > 0:
+                     subdirs = src.listing.get_dirs_by_parent_path(node.path)
+                     for sd in subdirs:
+                         yield from du_dirs(src, sd, subdepth - 1)
+                 yield (
+                     src.get_node_full_path(node),
+                     src.listing.du(node)[0],
+                 )

-         for src in sources:
-             yield from du_dirs(src, src.node, depth)
+             for src in matched_sources:
+                 yield from du_dirs(src, src.node, depth)

      def find(
          self,
@@ -1937,39 +1855,42 @@ class Catalog:
          *,
          client_config=None,
      ) -> Iterator[str]:
-         sources = self.enlist_sources(
+         with self.enlist_sources(
              sources,
              update,
              client_config=client_config or self.client_config,
-         )
-         if not columns:
-             columns = ["path"]
-         field_set = set()
-         for column in columns:
-             if column == "du":
-                 field_set.add("dir_type")
-                 field_set.add("size")
-                 field_set.add("path")
-             elif column == "name":
-                 field_set.add("path")
-             elif column == "path":
-                 field_set.add("dir_type")
-                 field_set.add("path")
-             elif column == "size":
-                 field_set.add("size")
-             elif column == "type":
-                 field_set.add("dir_type")
-         fields = list(field_set)
-         field_lookup = {f: i for i, f in enumerate(fields)}
-         for src in sources:
-             results = src.listing.find(
-                 src.node, fields, names, inames, paths, ipaths, size, typ
-             )
-             for row in results:
-                 yield "\t".join(
-                     find_column_to_str(row, field_lookup, src, column)
-                     for column in columns
+         ) as matched_sources:
+             if matched_sources is None:
+                 return
+
+             if not columns:
+                 columns = ["path"]
+             field_set = set()
+             for column in columns:
+                 if column == "du":
+                     field_set.add("dir_type")
+                     field_set.add("size")
+                     field_set.add("path")
+                 elif column == "name":
+                     field_set.add("path")
+                 elif column == "path":
+                     field_set.add("dir_type")
+                     field_set.add("path")
+                 elif column == "size":
+                     field_set.add("size")
+                 elif column == "type":
+                     field_set.add("dir_type")
+             fields = list(field_set)
+             field_lookup = {f: i for i, f in enumerate(fields)}
+             for src in matched_sources:
+                 results = src.listing.find(
+                     src.node, fields, names, inames, paths, ipaths, size, typ
                  )
+                 for row in results:
+                     yield "\t".join(
+                         find_column_to_str(row, field_lookup, src, column)
+                         for column in columns
+                     )

      def index(
          self,
@@ -1978,9 +1899,10 @@ class Catalog:
          *,
          client_config=None,
      ) -> None:
-         self.enlist_sources(
+         with self.enlist_sources(
              sources,
              update,
              client_config=client_config or self.client_config,
              only_index=True,
-         )
+         ):
+             pass