datachain 0.14.2__py3-none-any.whl → 0.39.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/__init__.py +20 -0
- datachain/asyn.py +11 -12
- datachain/cache.py +7 -7
- datachain/catalog/__init__.py +2 -2
- datachain/catalog/catalog.py +621 -507
- datachain/catalog/dependency.py +164 -0
- datachain/catalog/loader.py +28 -18
- datachain/checkpoint.py +43 -0
- datachain/cli/__init__.py +24 -33
- datachain/cli/commands/__init__.py +1 -8
- datachain/cli/commands/datasets.py +83 -52
- datachain/cli/commands/ls.py +17 -17
- datachain/cli/commands/show.py +4 -4
- datachain/cli/parser/__init__.py +8 -74
- datachain/cli/parser/job.py +95 -3
- datachain/cli/parser/studio.py +11 -4
- datachain/cli/parser/utils.py +1 -2
- datachain/cli/utils.py +2 -15
- datachain/client/azure.py +4 -4
- datachain/client/fsspec.py +45 -28
- datachain/client/gcs.py +6 -6
- datachain/client/hf.py +29 -2
- datachain/client/http.py +157 -0
- datachain/client/local.py +15 -11
- datachain/client/s3.py +17 -9
- datachain/config.py +4 -8
- datachain/data_storage/db_engine.py +12 -6
- datachain/data_storage/job.py +5 -1
- datachain/data_storage/metastore.py +1252 -186
- datachain/data_storage/schema.py +58 -45
- datachain/data_storage/serializer.py +105 -15
- datachain/data_storage/sqlite.py +286 -127
- datachain/data_storage/warehouse.py +250 -113
- datachain/dataset.py +353 -148
- datachain/delta.py +391 -0
- datachain/diff/__init__.py +27 -29
- datachain/error.py +60 -0
- datachain/func/__init__.py +2 -1
- datachain/func/aggregate.py +66 -42
- datachain/func/array.py +242 -38
- datachain/func/base.py +7 -4
- datachain/func/conditional.py +110 -60
- datachain/func/func.py +96 -45
- datachain/func/numeric.py +55 -38
- datachain/func/path.py +32 -20
- datachain/func/random.py +2 -2
- datachain/func/string.py +67 -37
- datachain/func/window.py +7 -8
- datachain/hash_utils.py +123 -0
- datachain/job.py +11 -7
- datachain/json.py +138 -0
- datachain/lib/arrow.py +58 -22
- datachain/lib/audio.py +245 -0
- datachain/lib/clip.py +14 -13
- datachain/lib/convert/flatten.py +5 -3
- datachain/lib/convert/python_to_sql.py +6 -10
- datachain/lib/convert/sql_to_python.py +8 -0
- datachain/lib/convert/values_to_tuples.py +156 -51
- datachain/lib/data_model.py +42 -20
- datachain/lib/dataset_info.py +36 -8
- datachain/lib/dc/__init__.py +8 -2
- datachain/lib/dc/csv.py +25 -28
- datachain/lib/dc/database.py +398 -0
- datachain/lib/dc/datachain.py +1289 -425
- datachain/lib/dc/datasets.py +320 -38
- datachain/lib/dc/hf.py +38 -24
- datachain/lib/dc/json.py +29 -32
- datachain/lib/dc/listings.py +112 -8
- datachain/lib/dc/pandas.py +16 -12
- datachain/lib/dc/parquet.py +35 -23
- datachain/lib/dc/records.py +31 -23
- datachain/lib/dc/storage.py +154 -64
- datachain/lib/dc/storage_pattern.py +251 -0
- datachain/lib/dc/utils.py +24 -16
- datachain/lib/dc/values.py +8 -9
- datachain/lib/file.py +622 -89
- datachain/lib/hf.py +69 -39
- datachain/lib/image.py +14 -14
- datachain/lib/listing.py +14 -11
- datachain/lib/listing_info.py +1 -2
- datachain/lib/meta_formats.py +3 -4
- datachain/lib/model_store.py +39 -7
- datachain/lib/namespaces.py +125 -0
- datachain/lib/projects.py +130 -0
- datachain/lib/pytorch.py +32 -21
- datachain/lib/settings.py +192 -56
- datachain/lib/signal_schema.py +427 -104
- datachain/lib/tar.py +1 -2
- datachain/lib/text.py +8 -7
- datachain/lib/udf.py +164 -76
- datachain/lib/udf_signature.py +60 -35
- datachain/lib/utils.py +118 -4
- datachain/lib/video.py +17 -9
- datachain/lib/webdataset.py +61 -56
- datachain/lib/webdataset_laion.py +15 -16
- datachain/listing.py +22 -10
- datachain/model/bbox.py +3 -1
- datachain/model/ultralytics/bbox.py +16 -12
- datachain/model/ultralytics/pose.py +16 -12
- datachain/model/ultralytics/segment.py +16 -12
- datachain/namespace.py +84 -0
- datachain/node.py +6 -6
- datachain/nodes_thread_pool.py +0 -1
- datachain/plugins.py +24 -0
- datachain/project.py +78 -0
- datachain/query/batch.py +40 -41
- datachain/query/dataset.py +604 -322
- datachain/query/dispatch.py +261 -154
- datachain/query/metrics.py +4 -6
- datachain/query/params.py +2 -3
- datachain/query/queue.py +3 -12
- datachain/query/schema.py +11 -6
- datachain/query/session.py +200 -33
- datachain/query/udf.py +34 -2
- datachain/remote/studio.py +171 -69
- datachain/script_meta.py +12 -12
- datachain/semver.py +68 -0
- datachain/sql/__init__.py +2 -0
- datachain/sql/functions/array.py +33 -1
- datachain/sql/postgresql_dialect.py +9 -0
- datachain/sql/postgresql_types.py +21 -0
- datachain/sql/sqlite/__init__.py +5 -1
- datachain/sql/sqlite/base.py +102 -29
- datachain/sql/sqlite/types.py +8 -13
- datachain/sql/types.py +70 -15
- datachain/studio.py +223 -46
- datachain/toolkit/split.py +31 -10
- datachain/utils.py +101 -59
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/METADATA +77 -22
- datachain-0.39.0.dist-info/RECORD +173 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/WHEEL +1 -1
- datachain/cli/commands/query.py +0 -53
- datachain/query/utils.py +0 -42
- datachain-0.14.2.dist-info/RECORD +0 -158
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/catalog/catalog.py
CHANGED

```diff
@@ -1,28 +1,16 @@
 import io
-import json
 import logging
 import os
 import os.path
 import posixpath
-import signal
-import subprocess
-import sys
 import time
 import traceback
-from collections.abc import Iterable, Iterator,
+from collections.abc import Callable, Iterable, Iterator, Sequence
+from contextlib import contextmanager, suppress
 from copy import copy
 from dataclasses import dataclass
 from functools import cached_property, reduce
-from
-from typing import (
-    IO,
-    TYPE_CHECKING,
-    Any,
-    Callable,
-    NoReturn,
-    Optional,
-    Union,
-)
+from typing import TYPE_CHECKING, Any
 from uuid import uuid4

 import sqlalchemy as sa
@@ -33,6 +21,7 @@ from datachain.cache import Cache
 from datachain.client import Client
 from datachain.dataset import (
     DATASET_PREFIX,
+    DEFAULT_DATASET_VERSION,
     QUERY_DATASET_PREFIX,
     DatasetDependency,
     DatasetListRecord,
@@ -40,31 +29,33 @@ from datachain.dataset import (
     DatasetStatus,
     StorageURI,
     create_dataset_uri,
+    parse_dataset_name,
     parse_dataset_uri,
+    parse_schema,
 )
 from datachain.error import (
     DataChainError,
     DatasetInvalidVersionError,
     DatasetNotFoundError,
     DatasetVersionNotFoundError,
-
-
+    NamespaceNotFoundError,
+    ProjectNotFoundError,
 )
 from datachain.lib.listing import get_listing
 from datachain.node import DirType, Node, NodeWithPath
 from datachain.nodes_thread_pool import NodesThreadPool
+from datachain.project import Project
 from datachain.sql.types import DateTime, SQLType
 from datachain.utils import DataChainDir

 from .datasource import DataSource
+from .dependency import build_dependency_hierarchy, populate_nested_dependencies

 if TYPE_CHECKING:
-    from datachain.data_storage import
-        AbstractMetastore,
-        AbstractWarehouse,
-    )
+    from datachain.data_storage import AbstractMetastore, AbstractWarehouse
     from datachain.dataset import DatasetListVersion
     from datachain.job import Job
+    from datachain.lib.listing_info import ListingInfo
     from datachain.listing import Listing

 logger = logging.getLogger("datachain")
@@ -75,10 +66,9 @@ TTL_INT = 4 * 60 * 60

 INDEX_INTERNAL_ERROR_MESSAGE = "Internal error on indexing"
 DATASET_INTERNAL_ERROR_MESSAGE = "Internal error on creating dataset"
-# exit code we use if last statement in query script is not instance of DatasetQuery
-QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE = 10
 # exit code we use if query script was canceled
 QUERY_SCRIPT_CANCELED_EXIT_CODE = 11
+QUERY_SCRIPT_SIGTERM_EXIT_CODE = -15  # if query script was terminated by SIGTERM

 # dataset pull
 PULL_DATASET_MAX_THREADS = 5
@@ -87,64 +77,9 @@ PULL_DATASET_SLEEP_INTERVAL = 0.1  # sleep time while waiting for chunk to be av
 PULL_DATASET_CHECK_STATUS_INTERVAL = 20  # interval to check export status in Studio


-def
-
-
-
-class TerminationSignal(RuntimeError):  # noqa: N818
-    def __init__(self, signal):
-        self.signal = signal
-        super().__init__("Received termination signal", signal)
-
-    def __repr__(self):
-        return f"{self.__class__.__name__}({self.signal})"
-
-
-if sys.platform == "win32":
-    SIGINT = signal.CTRL_C_EVENT
-else:
-    SIGINT = signal.SIGINT
-
-
-def shutdown_process(
-    proc: subprocess.Popen,
-    interrupt_timeout: Optional[int] = None,
-    terminate_timeout: Optional[int] = None,
-) -> int:
-    """Shut down the process gracefully with SIGINT -> SIGTERM -> SIGKILL."""
-
-    logger.info("sending interrupt signal to the process %s", proc.pid)
-    proc.send_signal(SIGINT)
-
-    logger.info("waiting for the process %s to finish", proc.pid)
-    try:
-        return proc.wait(interrupt_timeout)
-    except subprocess.TimeoutExpired:
-        logger.info(
-            "timed out waiting, sending terminate signal to the process %s", proc.pid
-        )
-        proc.terminate()
-        try:
-            return proc.wait(terminate_timeout)
-        except subprocess.TimeoutExpired:
-            logger.info("timed out waiting, killing the process %s", proc.pid)
-            proc.kill()
-            return proc.wait()
-
-
-def _process_stream(stream: "IO[bytes]", callback: Callable[[str], None]) -> None:
-    buffer = b""
-    while byt := stream.read(1):  # Read one byte at a time
-        buffer += byt
-
-        if byt in (b"\n", b"\r"):  # Check for newline or carriage return
-            line = buffer.decode("utf-8")
-            callback(line)
-            buffer = b""  # Clear buffer for next line
-
-    if buffer:  # Handle any remaining data in the buffer
-        line = buffer.decode("utf-8")
-        callback(line)
+def is_namespace_local(namespace_name) -> bool:
+    """Checks if namespace is from local environment, i.e. is `local`"""
+    return namespace_name == "local"


 class DatasetRowsFetcher(NodesThreadPool):
@@ -152,11 +87,11 @@ class DatasetRowsFetcher(NodesThreadPool):
         self,
         metastore: "AbstractMetastore",
         warehouse: "AbstractWarehouse",
-
-        remote_ds_version:
-
-        local_ds_version:
-        schema: dict[str,
+        remote_ds: DatasetRecord,
+        remote_ds_version: str,
+        local_ds: DatasetRecord,
+        local_ds_version: str,
+        schema: dict[str, SQLType | type[SQLType]],
         max_threads: int = PULL_DATASET_MAX_THREADS,
         progress_bar=None,
     ):
@@ -166,12 +101,12 @@ class DatasetRowsFetcher(NodesThreadPool):
         self._check_dependencies()
         self.metastore = metastore
         self.warehouse = warehouse
-        self.
+        self.remote_ds = remote_ds
         self.remote_ds_version = remote_ds_version
-        self.
+        self.local_ds = local_ds
         self.local_ds_version = local_ds_version
         self.schema = schema
-        self.last_status_check:
+        self.last_status_check: float | None = None
         self.studio_client = StudioClient()
         self.progress_bar = progress_bar

@@ -204,7 +139,7 @@ class DatasetRowsFetcher(NodesThreadPool):
         Checks are done every PULL_DATASET_CHECK_STATUS_INTERVAL seconds
         """
         export_status_response = self.studio_client.dataset_export_status(
-            self.
+            self.remote_ds, self.remote_ds_version
         )
         if not export_status_response.ok:
             raise DataChainError(export_status_response.message)
@@ -251,9 +186,7 @@ class DatasetRowsFetcher(NodesThreadPool):
         import pandas as pd

         # metastore and warehouse are not thread safe
-        with self.
-            local_ds = metastore.get_dataset(self.local_ds_name)
-
+        with self.warehouse.clone() as warehouse:
             urls = list(urls)

             for url in urls:
@@ -266,7 +199,7 @@ class DatasetRowsFetcher(NodesThreadPool):
                 df = self.fix_columns(df)

                 inserted = warehouse.insert_dataset_rows(
-                    df, local_ds, self.local_ds_version
+                    df, self.local_ds, self.local_ds_version
                 )
                 self.increase_counter(inserted)  # type: ignore [arg-type]
                 # sometimes progress bar doesn't get updated so manually updating it
@@ -277,16 +210,16 @@ class DatasetRowsFetcher(NodesThreadPool):
 class NodeGroup:
     """Class for a group of nodes from the same source"""

-    listing:
-    client:
+    listing: "Listing | None"
+    client: Client
     sources: list[DataSource]

     # The source path within the bucket
     # (not including the bucket name or s3:// prefix)
     source_path: str = ""
-    dataset_name:
-    dataset_version:
-    instantiated_nodes:
+    dataset_name: str | None = None
+    dataset_version: str | None = None
+    instantiated_nodes: list[NodeWithPath] | None = None

     @property
     def is_dataset(self) -> bool:
@@ -307,13 +240,23 @@ class NodeGroup:
         if self.sources:
             self.client.fetch_nodes(self.iternodes(recursive), shared_progress_bar=pbar)

+    def close(self) -> None:
+        if self.listing:
+            self.listing.close()
+
+    def __enter__(self) -> "NodeGroup":
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        self.close()
+

 def prepare_output_for_cp(
     node_groups: list[NodeGroup],
     output: str,
     force: bool = False,
     no_cp: bool = False,
-) -> tuple[bool,
+) -> tuple[bool, str | None]:
     total_node_count = 0
     for node_group in node_groups:
         if not node_group.sources:
@@ -362,7 +305,7 @@ def collect_nodes_for_cp(

     # Collect all sources to process
     for node_group in node_groups:
-        listing:
+        listing: Listing | None = node_group.listing
         valid_sources: list[DataSource] = []
         for dsrc in node_group.sources:
             if dsrc.is_single_object():
@@ -406,7 +349,7 @@ def instantiate_node_groups(
     recursive: bool = False,
     virtual_only: bool = False,
     always_copy_dir_contents: bool = False,
-    copy_to_filename:
+    copy_to_filename: str | None = None,
 ) -> None:
     instantiate_progress_bar = (
         None
@@ -434,7 +377,7 @@ def instantiate_node_groups(
     for node_group in node_groups:
         if not node_group.sources:
             continue
-        listing:
+        listing: Listing | None = node_group.listing
         source_path: str = node_group.source_path

         copy_dir_contents = always_copy_dir_contents or source_path.endswith("/")
@@ -517,10 +460,8 @@ class Catalog:
         warehouse: "AbstractWarehouse",
         cache_dir=None,
         tmp_dir=None,
-        client_config:
-        warehouse_ready_callback:
-            Callable[["AbstractWarehouse"], None]
-        ] = None,
+        client_config: dict[str, Any] | None = None,
+        warehouse_ready_callback: Callable[["AbstractWarehouse"], None] | None = None,
         in_memory: bool = False,
     ):
         datachain_dir = DataChainDir(cache=cache_dir, tmp=tmp_dir)
@@ -535,6 +476,7 @@ class Catalog:
         }
         self._warehouse_ready_callback = warehouse_ready_callback
         self.in_memory = in_memory
+        self._owns_connections = True  # False for copies, prevents double-close

     @cached_property
     def warehouse(self) -> "AbstractWarehouse":
@@ -556,13 +498,36 @@ class Catalog:
         }

     def copy(self, cache=True, db=True):
+        """
+        Create a shallow copy of this catalog.
+
+        The copy shares metastore and warehouse with the original but will not
+        close them - only the original catalog owns the connections.
+        """
         result = copy(self)
+        result._owns_connections = False
         if not db:
             result.metastore = None
             result._warehouse = None
             result.warehouse = None
         return result

+    def close(self) -> None:
+        if not self._owns_connections:
+            return
+        if self.metastore is not None:
+            with suppress(Exception):
+                self.metastore.close_on_exit()
+        if self._warehouse is not None:
+            with suppress(Exception):
+                self._warehouse.close_on_exit()
+
+    def __enter__(self) -> "Catalog":
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        self.close()
+
     @classmethod
     def generate_query_dataset_name(cls) -> str:
         return f"{QUERY_DATASET_PREFIX}_{uuid4().hex}"
```
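The hunk above gives `Catalog` an ownership flag so that shallow copies share the metastore and warehouse but never close them; only the original catalog releases the connections, and it can now be used as a context manager. Below is a minimal, self-contained sketch of that same pattern; the `Pool` and `Owner` classes are hypothetical stand-ins, not DataChain code.

```python
from contextlib import suppress
from copy import copy


class Pool:
    """Hypothetical stand-in for a shared connection that must be closed once."""

    def close(self) -> None:
        print("pool closed")


class Owner:
    """Minimal sketch of the ownership-flag pattern from the hunk above."""

    def __init__(self) -> None:
        self.pool = Pool()
        self._owns_connections = True  # False for copies, prevents double-close

    def copy(self) -> "Owner":
        clone = copy(self)  # shallow copy: shares self.pool
        clone._owns_connections = False
        return clone

    def close(self) -> None:
        if not self._owns_connections:
            return  # copies are no-ops
        with suppress(Exception):
            self.pool.close()

    def __enter__(self) -> "Owner":
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()


with Owner() as original:
    original.copy().close()  # no-op: the copy does not own the pool
# leaving the with-block closes the pool exactly once
```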
```diff
@@ -580,15 +545,13 @@ class Catalog:
         source: str,
         update=False,
         client_config=None,
-
+        column="file",
         skip_indexing=False,
-    ) -> tuple[
+    ) -> tuple["Listing | None", Client, str]:
         from datachain import read_storage
         from datachain.listing import Listing

-        read_storage(
-            source, session=self.session, update=update, object_name=object_name
-        ).exec()
+        read_storage(source, session=self.session, update=update, column=column).exec()

         list_ds_name, list_uri, list_path, _ = get_listing(
             source, self.session, update=update
@@ -602,13 +565,13 @@ class Catalog:
             self.warehouse.clone(),
             client,
             dataset_name=list_ds_name,
-
+            column=column,
         )

         return lst, client, list_path

     def _remove_dataset_rows_and_warehouse_info(
-        self, dataset: DatasetRecord, version:
+        self, dataset: DatasetRecord, version: str, **kwargs
     ):
         self.warehouse.drop_dataset_rows_table(dataset, version)
         self.update_dataset_version_with_warehouse_info(
@@ -618,6 +581,7 @@ class Catalog:
             **kwargs,
         )

+    @contextmanager
     def enlist_sources(
         self,
         sources: list[str],
@@ -625,34 +589,41 @@ class Catalog:
         skip_indexing=False,
         client_config=None,
         only_index=False,
-    ) ->
-        enlisted_sources = []
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    ) -> Iterator[list["DataSource"] | None]:
+        enlisted_sources: list[tuple[Listing | None, Client, str]] = []
+        try:
+            for src in sources:  # Opt: parallel
+                listing, client, file_path = self.enlist_source(
+                    src,
+                    update,
+                    client_config=client_config or self.client_config,
+                    skip_indexing=skip_indexing,
+                )
+                enlisted_sources.append((listing, client, file_path))
+
+            if only_index:
+                # sometimes we don't really need listing result (e.g. on indexing
+                # process) so this is to improve performance
+                yield None
+                return
+
+            dsrc_all: list[DataSource] = []
+            for listing, client, file_path in enlisted_sources:
+                if not listing:
+                    nodes = [Node.from_file(client.get_file_info(file_path))]
+                    dir_only = False
+                else:
+                    nodes = listing.expand_path(file_path)
+                    dir_only = file_path.endswith("/")
+                dsrc_all.extend(
+                    DataSource(listing, client, node, dir_only) for node in nodes
+                )
+            yield dsrc_all
+        finally:
+            for listing, _, _ in enlisted_sources:
+                if listing:
+                    with suppress(Exception):
+                        listing.close()

     def enlist_sources_grouped(
         self,
```
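In the hunk above, `enlist_sources` becomes a generator-based context manager: listings are opened up front, handed to the caller via `yield`, and always closed in the `finally` block (callers later in this diff consume it with `with ... as data_sources:`). A minimal, self-contained sketch of that open/yield/close shape follows; the `Listing` class and `enlist` function here are hypothetical illustrations, not the package's API.

```python
from collections.abc import Iterator
from contextlib import contextmanager


class Listing:
    """Hypothetical stand-in for a listing handle that must be closed."""

    def __init__(self, name: str) -> None:
        self.name = name

    def close(self) -> None:
        print(f"closed {self.name}")


@contextmanager
def enlist(names: list[str]) -> Iterator[list[Listing]]:
    # open resources, hand them to the caller, always clean up
    opened: list[Listing] = []
    try:
        for name in names:
            opened.append(Listing(name))
        yield opened
    finally:
        for listing in opened:
            listing.close()


with enlist(["s3://bucket-a", "s3://bucket-b"]) as listings:
    print([item.name for item in listings])
# both listings are closed here, even if the body raised
```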
```diff
@@ -671,10 +642,15 @@ class Catalog:
         enlisted_sources: list[tuple[bool, bool, Any]] = []
         client_config = client_config or self.client_config
         for src in sources:  # Opt: parallel
-            listing:
+            listing: Listing | None
             if src.startswith("ds://"):
                 ds_name, ds_version = parse_dataset_uri(src)
-
+                ds_namespace, ds_project, ds_name = parse_dataset_name(ds_name)
+                assert ds_namespace
+                assert ds_project
+                dataset = self.get_dataset(
+                    ds_name, namespace_name=ds_namespace, project_name=ds_project
+                )
                 if not ds_version:
                     ds_version = dataset.latest_version
                 dataset_sources = self.warehouse.get_dataset_sources(
@@ -694,7 +670,11 @@ class Catalog:
                     dataset_name=dataset_name,
                 )
                 rows = DatasetQuery(
-                    name=dataset.name,
+                    name=dataset.name,
+                    namespace_name=dataset.project.namespace.name,
+                    project_name=dataset.project.name,
+                    version=ds_version,
+                    catalog=self,
                 ).to_db_records()
                 indexed_sources.append(
                     (
@@ -768,44 +748,56 @@ class Catalog:
     def create_dataset(
         self,
         name: str,
-
+        project: Project | None = None,
+        version: str | None = None,
         *,
         columns: Sequence[Column],
-        feature_schema:
+        feature_schema: dict | None = None,
         query_script: str = "",
-        create_rows:
-        validate_version:
-        listing:
-        uuid:
-        description:
-
+        create_rows: bool | None = True,
+        validate_version: bool | None = True,
+        listing: bool | None = False,
+        uuid: str | None = None,
+        description: str | None = None,
+        attrs: list[str] | None = None,
+        update_version: str | None = "patch",
+        job_id: str | None = None,
     ) -> "DatasetRecord":
         """
         Creates new dataset of a specific version.
         If dataset is not yet created, it will create it with version 1
         If version is None, then next unused version is created.
-        If version is given, then it must be an unused version
+        If version is given, then it must be an unused version.
         """
+        DatasetRecord.validate_name(name)
         assert [c.name for c in columns if c.name != "sys__id"], f"got {columns=}"
         if not listing and Client.is_data_source_uri(name):
             raise RuntimeError(
                 "Cannot create dataset that starts with source prefix, e.g s3://"
             )
-        default_version =
+        default_version = DEFAULT_DATASET_VERSION
         try:
-            dataset = self.get_dataset(
-
-
-
-
+            dataset = self.get_dataset(
+                name,
+                namespace_name=project.namespace.name if project else None,
+                project_name=project.name if project else None,
+            )
+            default_version = dataset.next_version_patch
+            if update_version == "major":
+                default_version = dataset.next_version_major
+            if update_version == "minor":
+                default_version = dataset.next_version_minor
+
+            if (description or attrs) and (
+                dataset.description != description or dataset.attrs != attrs
             ):
                 description = description or dataset.description
-
+                attrs = attrs or dataset.attrs

                 self.update_dataset(
                     dataset,
                     description=description,
-
+                    attrs=attrs,
                 )

         except DatasetNotFoundError:
```
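The `create_dataset` hunk above picks the default next version from the requested bump level: patch by default, or minor/major when `update_version` says so, on the new string (semver-style) versions. A small illustrative sketch of that selection is below; the `bump` helper is written for this page and is not the package's own semver module.

```python
def bump(version: str, update_version: str = "patch") -> str:
    """Illustrative semver bump mirroring the default-version selection above."""
    major, minor, patch = (int(part) for part in version.split("."))
    if update_version == "major":
        return f"{major + 1}.0.0"
    if update_version == "minor":
        return f"{major}.{minor + 1}.0"
    return f"{major}.{minor}.{patch + 1}"  # "patch" is the default


assert bump("1.2.3") == "1.2.4"
assert bump("1.2.3", "minor") == "1.3.0"
assert bump("1.2.3", "major") == "2.0.0"
```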
```diff
@@ -814,12 +806,13 @@ class Catalog:
             }
             dataset = self.metastore.create_dataset(
                 name,
+                project.id if project else None,
                 feature_schema=feature_schema,
                 query_script=query_script,
                 schema=schema,
                 ignore_if_exists=True,
                 description=description,
-
+                attrs=attrs,
             )

         version = version or default_version
@@ -834,7 +827,7 @@ class Catalog:
                 f"Version {version} must be higher than the current latest one"
             )

-        return self.
+        return self.create_dataset_version(
             dataset,
             version,
             feature_schema=feature_schema,
@@ -842,12 +835,13 @@ class Catalog:
             create_rows_table=create_rows,
             columns=columns,
             uuid=uuid,
+            job_id=job_id,
         )

-    def
+    def create_dataset_version(
         self,
         dataset: DatasetRecord,
-        version:
+        version: str,
         *,
         columns: Sequence[Column],
         sources="",
@@ -857,8 +851,8 @@ class Catalog:
         error_stack="",
         script_output="",
         create_rows_table=True,
-        job_id:
-        uuid:
+        job_id: str | None = None,
+        uuid: str | None = None,
     ) -> DatasetRecord:
         """
         Creates dataset version if it doesn't exist.
@@ -872,7 +866,7 @@ class Catalog:
         dataset = self.metastore.create_dataset_version(
             dataset,
             version,
-            status=DatasetStatus.
+            status=DatasetStatus.CREATED,
             sources=sources,
             feature_schema=feature_schema,
             query_script=query_script,
@@ -886,14 +880,14 @@ class Catalog:
         )

         if create_rows_table:
-            table_name = self.warehouse.dataset_table_name(dataset
+            table_name = self.warehouse.dataset_table_name(dataset, version)
             self.warehouse.create_dataset_rows_table(table_name, columns=columns)
             self.update_dataset_version_with_warehouse_info(dataset, version)

         return dataset

     def update_dataset_version_with_warehouse_info(
-        self, dataset: DatasetRecord, version:
+        self, dataset: DatasetRecord, version: str, rows_dropped=False, **kwargs
     ) -> None:
         from datachain.query.dataset import DatasetQuery

@@ -905,11 +899,7 @@ class Catalog:
             values["num_objects"] = None
             values["size"] = None
             values["preview"] = None
-            self.metastore.update_dataset_version(
-                dataset,
-                version,
-                **values,
-            )
+            self.metastore.update_dataset_version(dataset, version, **values)
             return

         if not dataset_version.num_objects:
@@ -921,7 +911,13 @@ class Catalog:

         if not dataset_version.preview:
             values["preview"] = (
-                DatasetQuery(
+                DatasetQuery(
+                    name=dataset.name,
+                    namespace_name=dataset.project.namespace.name,
+                    project_name=dataset.project.name,
+                    version=version,
+                    catalog=self,
+                )
                 .limit(20)
                 .to_db_records()
             )
@@ -929,38 +925,18 @@ class Catalog:
         if not values:
             return

-        self.metastore.update_dataset_version(
-            dataset,
-            version,
-            **values,
-        )
+        self.metastore.update_dataset_version(dataset, version, **values)

     def update_dataset(
         self, dataset: DatasetRecord, conn=None, **kwargs
     ) -> DatasetRecord:
         """Updates dataset fields."""
-
-
-
-        old_name = dataset.name
-        new_name = kwargs["name"]
-
-        dataset = self.metastore.update_dataset(dataset, conn=conn, **kwargs)
-
-        if old_name and new_name:
-            # updating name must result in updating dataset table names as well
-            for version in [v.version for v in dataset.versions]:
-                self.warehouse.rename_dataset_table(
-                    old_name,
-                    new_name,
-                    old_version=version,
-                    new_version=version,
-                )
-
-        return dataset
+        dataset_updated = self.metastore.update_dataset(dataset, conn=conn, **kwargs)
+        self.warehouse.rename_dataset_tables(dataset, dataset_updated)
+        return dataset_updated

     def remove_dataset_version(
-        self, dataset: DatasetRecord, version:
+        self, dataset: DatasetRecord, version: str, drop_rows: bool | None = True
     ) -> None:
         """
         Deletes one single dataset version.
@@ -988,6 +964,7 @@ class Catalog:
         self,
         name: str,
         sources: list[str],
+        project: Project | None = None,
         client_config=None,
         recursive=False,
     ) -> DatasetRecord:
@@ -996,6 +973,8 @@ class Catalog:

         from datachain import read_dataset, read_storage

+        project = project or self.metastore.default_project
+
         chains = []
         for source in sources:
             if source.startswith(DATASET_PREFIX):
@@ -1008,10 +987,15 @@ class Catalog:
         # create union of all dataset queries created from sources
         dc = reduce(lambda dc1, dc2: dc1.union(dc2), chains)
         try:
+            dc = dc.settings(project=project.name, namespace=project.namespace.name)
             dc.save(name)
         except Exception as e:  # noqa: BLE001
             try:
-                ds = self.get_dataset(
+                ds = self.get_dataset(
+                    name,
+                    namespace_name=project.namespace.name,
+                    project_name=project.name,
+                )
                 self.metastore.update_dataset_status(
                     ds,
                     DatasetStatus.FAILED,
@@ -1028,7 +1012,11 @@ class Catalog:
             except DatasetNotFoundError:
                 raise e from None

-        ds = self.get_dataset(
+        ds = self.get_dataset(
+            name,
+            namespace_name=project.namespace.name,
+            project_name=project.name,
+        )

         self.update_dataset_version_with_warehouse_info(
             ds,
@@ -1036,159 +1024,231 @@ class Catalog:
             sources="\n".join(sources),
         )

-        return self.get_dataset(
+        return self.get_dataset(
+            name,
+            namespace_name=project.namespace.name,
+            project_name=project.name,
+        )

-    def
+    def get_full_dataset_name(
         self,
-
-
-
-
-    ) -> DatasetRecord:
+        name: str,
+        project_name: str | None = None,
+        namespace_name: str | None = None,
+    ) -> tuple[str, str, str]:
         """
-
-
-        It also removes original dataset version
+        Returns dataset name together with separated namespace and project name.
+        It takes into account all the ways namespace and project can be added.
         """
-
-
-
-
-
-
-
-
-
-
-
-
-
-        if not dataset_version.is_final_status():
-            raise ValueError("Cannot register dataset version in non final status")
-
-        # copy dataset version
-        target_dataset = self.metastore.create_dataset_version(
-            target_dataset,
-            target_version,
-            sources=dataset_version.sources,
-            status=dataset_version.status,
-            query_script=dataset_version.query_script,
-            error_message=dataset_version.error_message,
-            error_stack=dataset_version.error_stack,
-            script_output=dataset_version.script_output,
-            created_at=dataset_version.created_at,
-            finished_at=dataset_version.finished_at,
-            schema=dataset_version.serialized_schema,
-            num_objects=dataset_version.num_objects,
-            size=dataset_version.size,
-            preview=dataset_version.preview,
-            job_id=dataset_version.job_id,
-        )
-
-        # to avoid re-creating rows table, we are just renaming it for a new version
-        # of target dataset
-        self.warehouse.rename_dataset_table(
-            dataset.name,
-            target_dataset.name,
-            old_version=version,
-            new_version=target_version,
+        parsed_namespace_name, parsed_project_name, name = parse_dataset_name(name)
+
+        namespace_env = os.environ.get("DATACHAIN_NAMESPACE")
+        project_env = os.environ.get("DATACHAIN_PROJECT")
+        if project_env and len(project_env.split(".")) == 2:
+            # we allow setting both namespace and project in DATACHAIN_PROJECT
+            namespace_env, project_env = project_env.split(".")
+
+        namespace_name = (
+            parsed_namespace_name
+            or namespace_name
+            or namespace_env
+            or self.metastore.default_namespace_name
         )
-
-
-
-
+        project_name = (
+            parsed_project_name
+            or project_name
+            or project_env
+            or self.metastore.default_project_name
         )

-
-
-
-
+        return namespace_name, project_name, name
+
+    def get_dataset(
+        self,
+        name: str,
+        namespace_name: str | None = None,
+        project_name: str | None = None,
+    ) -> DatasetRecord:
+        from datachain.lib.listing import is_listing_dataset

-        self.
+        namespace_name = namespace_name or self.metastore.default_namespace_name
+        project_name = project_name or self.metastore.default_project_name

-
+        if is_listing_dataset(name):
+            namespace_name = self.metastore.system_namespace_name
+            project_name = self.metastore.listing_project_name

-
-
+        return self.metastore.get_dataset(
+            name, namespace_name=namespace_name, project_name=project_name
+        )

     def get_dataset_with_remote_fallback(
-        self,
+        self,
+        name: str,
+        namespace_name: str,
+        project_name: str,
+        version: str | None = None,
+        pull_dataset: bool = False,
+        update: bool = False,
     ) -> DatasetRecord:
-
-
-
-
-
+        from datachain.lib.dc.utils import is_studio
+
+        # Intentionally ignore update flag is version is provided. Here only exact
+        # version can be provided and update then doesn't make sense.
+        # It corresponds to a query like this for example:
+        #
+        # dc.read_dataset("some.remote.dataset", version="1.0.0", update=True)
+        if version:
+            update = False
+
+        # we don't do Studio fallback is script is already ran in Studio, or if we try
+        # to fetch dataset with local namespace as that one cannot
+        # exist in Studio in the first place
+        no_fallback = is_studio() or is_namespace_local(namespace_name)
+
+        if no_fallback or not update:
+            try:
+                ds = self.get_dataset(
+                    name,
+                    namespace_name=namespace_name,
+                    project_name=project_name,
                 )
-
+                if not version or ds.has_version(version):
+                    return ds
+            except (NamespaceNotFoundError, ProjectNotFoundError, DatasetNotFoundError):
+                pass
+
+        if no_fallback:
+            raise DatasetNotFoundError(
+                f"Dataset {name}"
+                + (f" version {version} " if version else " ")
+                + f"not found in namespace {namespace_name} and project {project_name}"
+            )

-
+        if pull_dataset:
             print("Dataset not found in local catalog, trying to get from studio")
-
-
-
-            remote_ds_uri += f"@v{version}"
+            remote_ds_uri = create_dataset_uri(
+                name, namespace_name, project_name, version
+            )

             self.pull_dataset(
                 remote_ds_uri=remote_ds_uri,
                 local_ds_name=name,
                 local_ds_version=version,
             )
-            return self.get_dataset(
+            return self.get_dataset(
+                name,
+                namespace_name=namespace_name,
+                project_name=project_name,
+            )
+
+        return self.get_remote_dataset(namespace_name, project_name, name)

     def get_dataset_with_version_uuid(self, uuid: str) -> DatasetRecord:
         """Returns dataset that contains version with specific uuid"""
         for dataset in self.ls_datasets():
             if dataset.has_version_with_uuid(uuid):
-                return self.get_dataset(
+                return self.get_dataset(
+                    dataset.name,
+                    namespace_name=dataset.project.namespace.name,
+                    project_name=dataset.project.name,
+                )
         raise DatasetNotFoundError(f"Dataset with version uuid {uuid} not found.")

-    def get_remote_dataset(
+    def get_remote_dataset(
+        self, namespace: str, project: str, name: str
+    ) -> DatasetRecord:
         from datachain.remote.studio import StudioClient

         studio_client = StudioClient()

-        info_response = studio_client.dataset_info(name)
+        info_response = studio_client.dataset_info(namespace, project, name)
         if not info_response.ok:
+            if info_response.status == 404:
+                raise DatasetNotFoundError(
+                    f"Dataset {namespace}.{project}.{name} not found"
+                )
             raise DataChainError(info_response.message)

         dataset_info = info_response.data
         assert isinstance(dataset_info, dict)
         return DatasetRecord.from_dict(dataset_info)

-    def
-        self,
-
-
+    def get_dataset_dependencies_by_ids(
+        self,
+        dataset_id: int,
+        version_id: int,
+        indirect: bool = True,
+    ) -> list[DatasetDependency | None]:
+        dependency_nodes = self.metastore.get_dataset_dependency_nodes(
+            dataset_id=dataset_id,
+            version_id=version_id,
+        )
+
+        if not dependency_nodes:
+            return []
+
+        dependency_map, children_map = build_dependency_hierarchy(dependency_nodes)

-
-
+        root_key = (dataset_id, version_id)
+        if root_key not in children_map:
+            return []
+
+        root_dependency_ids = children_map[root_key]
+        root_dependencies = [dependency_map[dep_id] for dep_id in root_dependency_ids]
+
+        if indirect:
+            for dependency in root_dependencies:
+                if dependency is not None:
+                    populate_nested_dependencies(
+                        dependency, dependency_nodes, dependency_map, children_map
+                    )
+
+        return root_dependencies
+
+    def get_dataset_dependencies(
+        self,
+        name: str,
+        version: str,
+        namespace_name: str | None = None,
+        project_name: str | None = None,
+        indirect=False,
+    ) -> list[DatasetDependency | None]:
+        dataset = self.get_dataset(
+            name,
+            namespace_name=namespace_name,
+            project_name=project_name,
         )
+        dataset_version = dataset.get_version(version)
+        dataset_id = dataset.id
+        dataset_version_id = dataset_version.id

         if not indirect:
-            return
-
-
-
-                # dependency has been removed
-                continue
-            if d.is_dataset:
-                # only datasets can have dependencies
-                d.dependencies = self.get_dataset_dependencies(
-                    d.name, int(d.version), indirect=indirect
-                )
+            return self.metastore.get_direct_dataset_dependencies(
+                dataset,
+                version,
+            )

-        return
+        return self.get_dataset_dependencies_by_ids(
+            dataset_id,
+            dataset_version_id,
+            indirect,
+        )

     def ls_datasets(
-        self,
+        self,
+        prefix: str | None = None,
+        include_listing: bool = False,
+        studio: bool = False,
+        project: Project | None = None,
     ) -> Iterator[DatasetListRecord]:
         from datachain.remote.studio import StudioClient

+        project_id = project.id if project else None
+
         if studio:
             client = StudioClient()
-            response = client.ls_datasets()
+            response = client.ls_datasets(prefix=prefix)
             if not response.ok:
                 raise DataChainError(response.message)
             if not response.data:
```
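In the hunk above, `get_full_dataset_name` resolves the namespace and project by precedence: a namespace/project embedded in the dataset name wins, then the explicit argument, then the `DATACHAIN_NAMESPACE` / `DATACHAIN_PROJECT` environment variables (where `DATACHAIN_PROJECT` may carry `namespace.project`), then the metastore defaults. A tiny deterministic sketch of that precedence chain is below; `resolve_namespace` and the `"default-namespace"` placeholder are illustrative only, not the package's helpers.

```python
def resolve_namespace(
    parsed: str | None,
    explicit: str | None,
    env: str | None,
    default: str = "default-namespace",
) -> str:
    # Same first-non-empty-wins chain as in get_full_dataset_name above:
    # parsed-from-name > explicit argument > environment variable > default.
    return parsed or explicit or env or default


assert resolve_namespace("prod", "team", "env-ns") == "prod"
assert resolve_namespace(None, "team", "env-ns") == "team"
assert resolve_namespace(None, None, "env-ns") == "env-ns"
assert resolve_namespace(None, None, None) == "default-namespace"
```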
@@ -1199,8 +1259,12 @@ class Catalog:
|
|
|
1199
1259
|
for d in response.data
|
|
1200
1260
|
if not d.get("name", "").startswith(QUERY_DATASET_PREFIX)
|
|
1201
1261
|
)
|
|
1262
|
+
elif prefix:
|
|
1263
|
+
datasets = self.metastore.list_datasets_by_prefix(
|
|
1264
|
+
prefix, project_id=project_id
|
|
1265
|
+
)
|
|
1202
1266
|
else:
|
|
1203
|
-
datasets = self.metastore.list_datasets()
|
|
1267
|
+
datasets = self.metastore.list_datasets(project_id=project_id)
|
|
1204
1268
|
|
|
1205
1269
|
for d in datasets:
|
|
1206
1270
|
if not d.is_bucket_listing or include_listing:
|
|
@@ -1208,50 +1272,79 @@ class Catalog:
|
|
|
1208
1272
|
|
|
1209
1273
|
def list_datasets_versions(
|
|
1210
1274
|
self,
|
|
1275
|
+
prefix: str | None = None,
|
|
1211
1276
|
include_listing: bool = False,
|
|
1277
|
+
with_job: bool = True,
|
|
1212
1278
|
studio: bool = False,
|
|
1213
|
-
|
|
1279
|
+
project: Project | None = None,
|
|
1280
|
+
) -> Iterator[tuple[DatasetListRecord, "DatasetListVersion", "Job | None"]]:
|
|
1214
1281
|
"""Iterate over all dataset versions with related jobs."""
|
|
1215
1282
|
datasets = list(
|
|
1216
|
-
self.ls_datasets(
|
|
1283
|
+
self.ls_datasets(
|
|
1284
|
+
prefix=prefix,
|
|
1285
|
+
include_listing=include_listing,
|
|
1286
|
+
studio=studio,
|
|
1287
|
+
project=project,
|
|
1288
|
+
)
|
|
1217
1289
|
)
|
|
1218
1290
|
|
|
1219
1291
|
# preselect dataset versions jobs from db to avoid multiple queries
|
|
1220
|
-
jobs_ids: set[str] = {
|
|
1221
|
-
v.job_id for ds in datasets for v in ds.versions if v.job_id
|
|
1222
|
-
}
|
|
1223
1292
|
jobs: dict[str, Job] = {}
|
|
1224
|
-
if
|
|
1225
|
-
|
|
1293
|
+
if with_job:
|
|
1294
|
+
jobs_ids: set[str] = {
|
|
1295
|
+
v.job_id for ds in datasets for v in ds.versions if v.job_id
|
|
1296
|
+
}
|
|
1297
|
+
if jobs_ids:
|
|
1298
|
+
jobs = {
|
|
1299
|
+
j.id: j for j in self.metastore.list_jobs_by_ids(list(jobs_ids))
|
|
1300
|
+
}
|
|
1226
1301
|
|
|
1227
1302
|
for d in datasets:
|
|
1228
1303
|
yield from (
|
|
1229
|
-
(d, v, jobs.get(str(v.job_id)) if v.job_id else None)
|
|
1304
|
+
(d, v, jobs.get(str(v.job_id)) if with_job and v.job_id else None)
|
|
1230
1305
|
for v in d.versions
|
|
1231
1306
|
)
|
|
1232
1307
|
|
|
1233
|
-
def listings(self):
|
|
1308
|
+
def listings(self, prefix: str | None = None) -> list["ListingInfo"]:
|
|
1234
1309
|
"""
|
|
1235
1310
|
Returns list of ListingInfo objects which are representing specific
|
|
1236
1311
|
storage listing datasets
|
|
1237
1312
|
"""
|
|
1238
|
-
from datachain.lib.listing import is_listing_dataset
|
|
1313
|
+
from datachain.lib.listing import LISTING_PREFIX, is_listing_dataset
|
|
1239
1314
|
from datachain.lib.listing_info import ListingInfo
|
|
1240
1315
|
|
|
1316
|
+
if prefix and not prefix.startswith(LISTING_PREFIX):
|
|
1317
|
+
prefix = LISTING_PREFIX + prefix
|
|
1318
|
+
|
|
1319
|
+
listing_datasets_versions = self.list_datasets_versions(
|
|
1320
|
+
prefix=prefix,
|
|
1321
|
+
include_listing=True,
|
|
1322
|
+
with_job=False,
|
|
1323
|
+
project=self.metastore.listing_project,
|
|
1324
|
+
)
|
|
1325
|
+
|
|
1241
1326
|
return [
|
|
1242
1327
|
ListingInfo.from_models(d, v, j)
|
|
1243
|
-
for d, v, j in
|
|
1328
|
+
for d, v, j in listing_datasets_versions
|
|
1244
1329
|
if is_listing_dataset(d.name)
|
|
1245
1330
|
]
|
|
1246
1331
|
|
|
1247
1332
|
def ls_dataset_rows(
|
|
1248
|
-
self,
|
|
1333
|
+
self,
|
|
1334
|
+
dataset: DatasetRecord,
|
|
1335
|
+
version: str,
|
|
1336
|
+
offset=None,
|
|
1337
|
+
limit=None,
|
|
1249
1338
|
) -> list[dict]:
|
|
1250
1339
|
from datachain.query.dataset import DatasetQuery
|
|
1251
1340
|
|
|
1252
|
-
|
|
1253
|
-
|
|
1254
|
-
|
|
1341
|
+
q = DatasetQuery(
|
|
1342
|
+
name=dataset.name,
|
|
1343
|
+
namespace_name=dataset.project.namespace.name,
|
|
1344
|
+
project_name=dataset.project.name,
|
|
1345
|
+
version=version,
|
|
1346
|
+
catalog=self,
|
|
1347
|
+
)
|
|
1255
1348
|
if limit:
|
|
1256
1349
|
q = q.limit(limit)
|
|
1257
1350
|
if offset:
|
|
@@ -1263,9 +1356,9 @@ class Catalog:
|
|
|
1263
1356
|
self,
|
|
1264
1357
|
source: str,
|
|
1265
1358
|
path: str,
|
|
1266
|
-
version_id:
|
|
1359
|
+
version_id: str | None = None,
|
|
1267
1360
|
client_config=None,
|
|
1268
|
-
content_disposition:
|
|
1361
|
+
content_disposition: str | None = None,
|
|
1269
1362
|
**kwargs,
|
|
1270
1363
|
) -> str:
|
|
1271
1364
|
client_config = client_config or self.client_config
|
|
@@ -1283,26 +1376,42 @@ class Catalog:
|
|
|
1283
1376
|
self,
|
|
1284
1377
|
bucket_uri: str,
|
|
1285
1378
|
name: str,
|
|
1286
|
-
version:
|
|
1379
|
+
version: str,
|
|
1380
|
+
project: Project | None = None,
|
|
1287
1381
|
client_config=None,
|
|
1288
1382
|
) -> list[str]:
|
|
1289
|
-
dataset = self.get_dataset(
|
|
1383
|
+
dataset = self.get_dataset(
|
|
1384
|
+
name,
|
|
1385
|
+
namespace_name=project.namespace.name if project else None,
|
|
1386
|
+
project_name=project.name if project else None,
|
|
1387
|
+
)
|
|
1290
1388
|
|
|
1291
1389
|
return self.warehouse.export_dataset_table(
|
|
1292
1390
|
bucket_uri, dataset, version, client_config
|
|
1293
1391
|
)
|
|
1294
1392
|
|
|
1295
|
-
def dataset_table_export_file_names(
|
|
1296
|
-
|
|
1393
|
+
def dataset_table_export_file_names(
|
|
1394
|
+
self, name: str, version: str, project: Project | None = None
|
|
1395
|
+
) -> list[str]:
|
|
1396
|
+
dataset = self.get_dataset(
|
|
1397
|
+
name,
|
|
1398
|
+
namespace_name=project.namespace.name if project else None,
|
|
1399
|
+
project_name=project.name if project else None,
|
|
1400
|
+
)
|
|
1297
1401
|
return self.warehouse.dataset_table_export_file_names(dataset, version)
|
|
1298
1402
|
|
|
1299
1403
|
def remove_dataset(
|
|
1300
1404
|
self,
|
|
1301
1405
|
name: str,
|
|
1302
|
-
|
|
1303
|
-
|
|
1406
|
+
project: Project | None = None,
|
|
1407
|
+
version: str | None = None,
|
|
1408
|
+
force: bool | None = False,
|
|
1304
1409
|
):
|
|
1305
|
-
dataset = self.get_dataset(
|
|
1410
|
+
dataset = self.get_dataset(
|
|
1411
|
+
name,
|
|
1412
|
+
namespace_name=project.namespace.name if project else None,
|
|
1413
|
+
project_name=project.name if project else None,
|
|
1414
|
+
)
|
|
1306
1415
|
if not version and not force:
|
|
1307
1416
|
raise ValueError(f"Missing dataset version from input for dataset {name}")
|
|
1308
1417
|
if version and not dataset.has_version(version):
|
|
@@ -1324,19 +1433,25 @@ class Catalog:
|
|
|
1324
1433
|
def edit_dataset(
|
|
1325
1434
|
self,
|
|
1326
1435
|
name: str,
|
|
1327
|
-
|
|
1328
|
-
|
|
1329
|
-
|
|
1436
|
+
project: Project | None = None,
|
|
1437
|
+
new_name: str | None = None,
|
|
1438
|
+
description: str | None = None,
|
|
1439
|
+
attrs: list[str] | None = None,
|
|
1330
1440
|
) -> DatasetRecord:
|
|
1331
1441
|
update_data = {}
|
|
1332
1442
|
if new_name:
|
|
1443
|
+
DatasetRecord.validate_name(new_name)
|
|
1333
1444
|
update_data["name"] = new_name
|
|
1334
1445
|
if description is not None:
|
|
1335
1446
|
update_data["description"] = description
|
|
1336
|
-
if
|
|
1337
|
-
update_data["
|
|
1447
|
+
if attrs is not None:
|
|
1448
|
+
update_data["attrs"] = attrs # type: ignore[assignment]
|
|
1338
1449
|
|
|
1339
|
-
dataset = self.get_dataset(
|
|
1450
|
+
dataset = self.get_dataset(
|
|
1451
|
+
name,
|
|
1452
|
+
namespace_name=project.namespace.name if project else None,
|
|
1453
|
+
project_name=project.name if project else None,
|
|
1454
|
+
)
|
|
1340
1455
|
return self.update_dataset(dataset, **update_data)
|
|
1341
1456
|
|
|
1342
1457
|
def ls(
|
|
@@ -1348,22 +1463,24 @@ class Catalog:
|
|
|
1348
1463
|
*,
|
|
1349
1464
|
client_config=None,
|
|
1350
1465
|
) -> Iterator[tuple[DataSource, Iterable[tuple]]]:
|
|
1351
|
-
|
|
1466
|
+
with self.enlist_sources(
|
|
1352
1467
|
sources,
|
|
1353
1468
|
update,
|
|
1354
1469
|
skip_indexing=skip_indexing,
|
|
1355
1470
|
client_config=client_config or self.client_config,
|
|
1356
|
-
)
|
|
1471
|
+
) as data_sources:
|
|
1472
|
+
if data_sources is None:
|
|
1473
|
+
return
|
|
1357
1474
|
|
|
1358
|
-
|
|
1359
|
-
|
|
1475
|
+
for source in data_sources:
|
|
1476
|
+
yield source, source.ls(fields)
|
|
1360
1477
|
|
|
1361
1478
|
def pull_dataset( # noqa: C901, PLR0915
|
|
1362
1479
|
self,
|
|
1363
1480
|
remote_ds_uri: str,
|
|
1364
|
-
output:
|
|
1365
|
-
local_ds_name:
|
|
1366
|
-
local_ds_version:
|
|
1481
|
+
output: str | None = None,
|
|
1482
|
+
local_ds_name: str | None = None,
|
|
1483
|
+
local_ds_version: str | None = None,
|
|
1367
1484
|
cp: bool = False,
|
|
1368
1485
|
force: bool = False,
|
|
1369
1486
|
*,
|
|
@@ -1393,7 +1510,29 @@ class Catalog:
|
|
|
1393
1510
|
except Exception as e:
|
|
1394
1511
|
raise DataChainError("Error when parsing dataset uri") from e
|
|
1395
1512
|
|
|
1396
|
-
|
|
1513
|
+
remote_namespace, remote_project, remote_ds_name = parse_dataset_name(
|
|
1514
|
+
remote_ds_name
|
|
1515
|
+
)
|
|
1516
|
+
if not remote_namespace or not remote_project:
|
|
1517
|
+
raise DataChainError(
|
|
1518
|
+
f"Invalid fully qualified dataset name {remote_ds_name}, namespace"
|
|
1519
|
+
f" or project missing"
|
|
1520
|
+
)
|
|
1521
|
+
|
|
1522
|
+
if local_ds_name:
|
|
1523
|
+
local_namespace, local_project, local_ds_name = parse_dataset_name(
|
|
1524
|
+
local_ds_name
|
|
1525
|
+
)
|
|
1526
|
+
if local_namespace and local_namespace != remote_namespace:
|
|
1527
|
+
raise DataChainError(
|
|
1528
|
+
"Local namespace must be the same to remote namespace"
|
|
1529
|
+
)
|
|
1530
|
+
if local_project and local_project != remote_project:
|
|
1531
|
+
raise DataChainError("Local project must be the same to remote project")
|
|
1532
|
+
|
|
1533
|
+
remote_ds = self.get_remote_dataset(
|
|
1534
|
+
remote_namespace, remote_project, remote_ds_name
|
|
1535
|
+
)
|
|
1397
1536
|
|
|
1398
1537
|
try:
|
|
1399
1538
|
# if version is not specified in uri, take the latest one
|
|
@@ -1401,7 +1540,12 @@ class Catalog:
|
|
|
1401
1540
|
version = remote_ds.latest_version
|
|
1402
1541
|
print(f"Version not specified, pulling the latest one (v{version})")
|
|
1403
1542
|
# updating dataset uri with latest version
|
|
1404
|
-
remote_ds_uri = create_dataset_uri(
|
|
1543
|
+
remote_ds_uri = create_dataset_uri(
|
|
1544
|
+
remote_ds.name,
|
|
1545
|
+
remote_ds.project.namespace.name,
|
|
1546
|
+
remote_ds.project.name,
|
|
1547
|
+
version,
|
|
1548
|
+
)
|
|
1405
1549
|
remote_ds_version = remote_ds.get_version(version)
|
|
1406
1550
|
except (DatasetVersionNotFoundError, StopIteration) as exc:
|
|
1407
1551
|
raise DataChainError(
|
|
@@ -1410,7 +1554,13 @@ class Catalog:

         local_ds_name = local_ds_name or remote_ds.name
         local_ds_version = local_ds_version or remote_ds_version.version
-
+
+        local_ds_uri = create_dataset_uri(
+            local_ds_name,
+            remote_ds.project.namespace.name,
+            remote_ds.project.name,
+            local_ds_version,
+        )

         try:
             # try to find existing dataset with the same uuid to avoid pulling again
@@ -1419,7 +1569,10 @@ class Catalog:
                 remote_ds_version.uuid
             )
             existing_ds_uri = create_dataset_uri(
-                existing_ds.name,
+                existing_ds.name,
+                existing_ds.project.namespace.name,
+                existing_ds.project.name,
+                existing_ds_version.version,
             )
             if existing_ds_uri == remote_ds_uri:
                 print(f"Local copy of dataset {remote_ds_uri} already present")
@@ -1433,8 +1586,30 @@ class Catalog:
         except DatasetNotFoundError:
             pass

+        # Create namespace and project if doesn't exist
+        print(
+            f"Creating namespace {remote_ds.project.namespace.name} and project"
+            f" {remote_ds.project.name}"
+        )
+
+        namespace = self.metastore.create_namespace(
+            remote_ds.project.namespace.name,
+            description=remote_ds.project.namespace.descr,
+            uuid=remote_ds.project.namespace.uuid,
+            validate=False,
+        )
+        project = self.metastore.create_project(
+            namespace.name,
+            remote_ds.project.name,
+            description=remote_ds.project.descr,
+            uuid=remote_ds.project.uuid,
+            validate=False,
+        )
+
         try:
-            local_dataset = self.get_dataset(
+            local_dataset = self.get_dataset(
+                local_ds_name, namespace_name=namespace.name, project_name=project.name
+            )
             if local_dataset and local_dataset.has_version(local_ds_version):
                 raise DataChainError(
                     f"Local dataset {local_ds_uri} already exists with different uuid,"
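Before pulling, the catalog now creates the remote dataset's namespace and project locally when they are missing. A minimal in-memory sketch of an idempotent get-or-create, using stand-in names rather than the real metastore:

```python
from __future__ import annotations

from dataclasses import dataclass, field


@dataclass
class InMemoryMetastore:  # stand-in for datachain's metastore
    namespaces: dict[str, dict] = field(default_factory=dict)
    projects: dict[tuple[str, str], dict] = field(default_factory=dict)

    def create_namespace(self, name: str, description: str = "", uuid: str | None = None) -> dict:
        # Idempotent: return the existing record if the namespace already exists.
        return self.namespaces.setdefault(
            name, {"name": name, "description": description, "uuid": uuid}
        )

    def create_project(
        self, namespace: str, name: str, description: str = "", uuid: str | None = None
    ) -> dict:
        return self.projects.setdefault(
            (namespace, name),
            {"namespace": namespace, "name": name, "description": description, "uuid": uuid},
        )


if __name__ == "__main__":
    metastore = InMemoryMetastore()
    ns = metastore.create_namespace("dev", uuid="ns-123")
    project = metastore.create_project(ns["name"], "analytics", uuid="proj-456")
    print(ns, project)
```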
@@ -1452,10 +1627,11 @@ class Catalog:
             leave=False,
         )

-        schema =
+        schema = parse_schema(remote_ds_version.schema)

         local_ds = self.create_dataset(
             local_ds_name,
+            project,
             local_ds_version,
             query_script=remote_ds_version.query_script,
             create_rows=True,
@@ -1468,7 +1644,7 @@ class Catalog:
         # asking remote to export dataset rows table to s3 and to return signed
         # urls of exported parts, which are in parquet format
         export_response = studio_client.export_dataset_table(
-
+            remote_ds, remote_ds_version.version
         )
         if not export_response.ok:
             raise DataChainError(export_response.message)
@@ -1499,9 +1675,9 @@ class Catalog:
         rows_fetcher = DatasetRowsFetcher(
             metastore,
             warehouse,
-
+            remote_ds,
             remote_ds_version.version,
-
+            local_ds,
             local_ds_version,
             schema,
             progress_bar=dataset_save_progress_bar,
@@ -1511,7 +1687,7 @@ class Catalog:
                 iter(batch(signed_urls)), dataset_save_progress_bar
             )
         except:
-            self.remove_dataset(local_ds_name, local_ds_version)
+            self.remove_dataset(local_ds_name, project, local_ds_version)
             raise

         local_ds = self.metastore.update_dataset_status(
@@ -1561,92 +1737,20 @@ class Catalog:
         else:
             # since we don't call cp command, which does listing implicitly,
             # it needs to be done here
-            self.enlist_sources(
+            with self.enlist_sources(
                 sources,
                 update,
                 client_config=client_config or self.client_config,
-            )
+            ):
+                pass

         self.create_dataset_from_sources(
-            output,
-
-
-
-
-        query_script: str,
-        env: Optional[Mapping[str, str]] = None,
-        python_executable: str = sys.executable,
-        capture_output: bool = False,
-        output_hook: Callable[[str], None] = noop,
-        params: Optional[dict[str, str]] = None,
-        job_id: Optional[str] = None,
-        interrupt_timeout: Optional[int] = None,
-        terminate_timeout: Optional[int] = None,
-    ) -> None:
-        cmd = [python_executable, "-c", query_script]
-        env = dict(env or os.environ)
-        env.update(
-            {
-                "DATACHAIN_QUERY_PARAMS": json.dumps(params or {}),
-                "DATACHAIN_JOB_ID": job_id or "",
-            },
+            output,
+            sources,
+            self.metastore.default_project,
+            client_config=client_config,
+            recursive=recursive,
         )
-        popen_kwargs: dict[str, Any] = {}
-        if capture_output:
-            popen_kwargs = {"stdout": subprocess.PIPE, "stderr": subprocess.STDOUT}
-
-        def raise_termination_signal(sig: int, _: Any) -> NoReturn:
-            raise TerminationSignal(sig)
-
-        thread: Optional[Thread] = None
-        with subprocess.Popen(cmd, env=env, **popen_kwargs) as proc: # noqa: S603
-            logger.info("Starting process %s", proc.pid)
-
-            orig_sigint_handler = signal.getsignal(signal.SIGINT)
-            # ignore SIGINT in the main process.
-            # In the terminal, SIGINTs are received by all the processes in
-            # the foreground process group, so the script will receive the signal too.
-            # (If we forward the signal to the child, it will receive it twice.)
-            signal.signal(signal.SIGINT, signal.SIG_IGN)
-
-            orig_sigterm_handler = signal.getsignal(signal.SIGTERM)
-            signal.signal(signal.SIGTERM, raise_termination_signal)
-            try:
-                if capture_output:
-                    args = (proc.stdout, output_hook)
-                    thread = Thread(target=_process_stream, args=args, daemon=True)
-                    thread.start()
-
-                proc.wait()
-            except TerminationSignal as exc:
-                signal.signal(signal.SIGTERM, orig_sigterm_handler)
-                signal.signal(signal.SIGINT, orig_sigint_handler)
-                logger.info("Shutting down process %s, received %r", proc.pid, exc)
-                # Rather than forwarding the signal to the child, we try to shut it down
-                # gracefully. This is because we consider the script to be interactive
-                # and special, so we give it time to cleanup before exiting.
-                shutdown_process(proc, interrupt_timeout, terminate_timeout)
-                if proc.returncode:
-                    raise QueryScriptCancelError(
-                        "Query script was canceled by user", return_code=proc.returncode
-                    ) from exc
-            finally:
-                signal.signal(signal.SIGTERM, orig_sigterm_handler)
-                signal.signal(signal.SIGINT, orig_sigint_handler)
-                if thread:
-                    thread.join() # wait for the reader thread
-
-        logger.info("Process %s exited with return code %s", proc.pid, proc.returncode)
-        if proc.returncode == QUERY_SCRIPT_CANCELED_EXIT_CODE:
-            raise QueryScriptCancelError(
-                "Query script was canceled by user",
-                return_code=proc.returncode,
-            )
-        if proc.returncode:
-            raise QueryScriptRunError(
-                f"Query script exited with error code {proc.returncode}",
-                return_code=proc.returncode,
-            )

     def cp(
         self,
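The removed `query()` runner above executed a script in a subprocess, ignored SIGINT in the parent (the child receives Ctrl-C itself), converted SIGTERM into an exception, and gave the child time to shut down gracefully. A distilled, standalone sketch of that pattern, not the datachain API:

```python
import signal
import subprocess
import sys
from typing import Any, NoReturn


class TerminationSignal(RuntimeError):
    def __init__(self, sig: int) -> None:
        super().__init__(f"received signal {sig}")
        self.signal = sig


def run_script(script: str, python: str = sys.executable, timeout: float = 5.0) -> int:
    def raise_termination_signal(sig: int, _: Any) -> NoReturn:
        raise TerminationSignal(sig)

    with subprocess.Popen([python, "-c", script]) as proc:
        orig_sigint = signal.getsignal(signal.SIGINT)
        orig_sigterm = signal.getsignal(signal.SIGTERM)
        # The child is in the foreground process group and receives Ctrl-C
        # itself, so the parent ignores SIGINT instead of forwarding it.
        signal.signal(signal.SIGINT, signal.SIG_IGN)
        signal.signal(signal.SIGTERM, raise_termination_signal)
        try:
            proc.wait()
        except TerminationSignal:
            # Ask the child to stop, then kill it if it ignores the request.
            proc.terminate()
            try:
                proc.wait(timeout=timeout)
            except subprocess.TimeoutExpired:
                proc.kill()
        finally:
            signal.signal(signal.SIGINT, orig_sigint)
            signal.signal(signal.SIGTERM, orig_sigterm)
    return proc.returncode


if __name__ == "__main__":
    print(run_script("print('hello from the child process')"))
```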
@@ -1658,7 +1762,7 @@ class Catalog:
         no_cp: bool = False,
         no_glob: bool = False,
         *,
-        client_config:
+        client_config: dict | None = None,
     ) -> None:
         """
         This function copies files from cloud sources to local destination directory
@@ -1671,38 +1775,42 @@ class Catalog:
             no_glob,
             client_config=client_config,
         )
+        try:
+            always_copy_dir_contents, copy_to_filename = prepare_output_for_cp(
+                node_groups, output, force, no_cp
+            )
+            total_size, total_files = collect_nodes_for_cp(node_groups, recursive)
+            if not total_files:
+                return

-
-
-
-
-
-
-
-        desc_max_len = max(len(output) + 16, 19)
-        bar_format = (
-            "{desc:<"
-            f"{desc_max_len}"
-            "}{percentage:3.0f}%|{bar}| {n_fmt:>5}/{total_fmt:<5} "
-            "[{elapsed}<{remaining}, {rate_fmt:>8}]"
-        )
+            desc_max_len = max(len(output) + 16, 19)
+            bar_format = (
+                "{desc:<"
+                f"{desc_max_len}"
+                "}{percentage:3.0f}%|{bar}| {n_fmt:>5}/{total_fmt:<5} "
+                "[{elapsed}<{remaining}, {rate_fmt:>8}]"
+            )

-
-
-
-
+            if not no_cp:
+                with get_download_bar(bar_format, total_size) as pbar:
+                    for node_group in node_groups:
+                        node_group.download(recursive=recursive, pbar=pbar)

-
-
-
-
-
-
-
-
-
-
-
+            instantiate_node_groups(
+                node_groups,
+                output,
+                bar_format,
+                total_files,
+                force,
+                recursive,
+                no_cp,
+                always_copy_dir_contents,
+                copy_to_filename,
+            )
+        finally:
+            for node_group in node_groups:
+                with suppress(Exception):
+                    node_group.close()

     def du(
         self,
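`cp()` now wraps the copy work in `try`/`finally` and always closes every node group, suppressing errors from `close()` so they cannot mask the original exception. A small sketch of that cleanup pattern with a stand-in `NodeGroup`:

```python
from contextlib import suppress


class NodeGroup:  # stand-in; not datachain's NodeGroup
    def __init__(self, name: str) -> None:
        self.name = name

    def download(self) -> None:
        print(f"downloading {self.name}")

    def close(self) -> None:
        print(f"closing {self.name}")


def cp(node_groups: list[NodeGroup], no_cp: bool = False) -> None:
    try:
        if not no_cp:
            for group in node_groups:
                group.download()
    finally:
        # Always release resources; a failing close() must not hide the
        # exception (if any) raised by the copy itself.
        for group in node_groups:
            with suppress(Exception):
                group.close()


if __name__ == "__main__":
    cp([NodeGroup("images"), NodeGroup("labels")])
```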
@@ -1712,24 +1820,26 @@ class Catalog:
         *,
         client_config=None,
     ) -> Iterable[tuple[str, float]]:
-
+        with self.enlist_sources(
             sources,
             update,
             client_config=client_config or self.client_config,
-        )
+        ) as matched_sources:
+            if matched_sources is None:
+                return

-
-
-
-
-
-
-
-
-
+            def du_dirs(src, node, subdepth):
+                if subdepth > 0:
+                    subdirs = src.listing.get_dirs_by_parent_path(node.path)
+                    for sd in subdirs:
+                        yield from du_dirs(src, sd, subdepth - 1)
+                yield (
+                    src.get_node_full_path(node),
+                    src.listing.du(node)[0],
+                )

-
-
+            for src in matched_sources:
+                yield from du_dirs(src, src.node, depth)

     def find(
         self,
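The new `du()` walks matched sources with a recursive generator that reports children before the directory itself, down to the requested depth. A standalone analogue against the local filesystem (not the datachain listing API):

```python
from __future__ import annotations

import os
from collections.abc import Iterator


def dir_size(path: str) -> int:
    # Total size of regular files under `path` (symlinks skipped for simplicity).
    total = 0
    for root, _dirs, files in os.walk(path):
        for name in files:
            file_path = os.path.join(root, name)
            if not os.path.islink(file_path):
                total += os.path.getsize(file_path)
    return total


def du_dirs(path: str, depth: int) -> Iterator[tuple[str, int]]:
    # Same shape as the diff's du_dirs: recurse into children while depth
    # remains, then report the current directory itself.
    if depth > 0:
        for entry in os.scandir(path):
            if entry.is_dir(follow_symlinks=False):
                yield from du_dirs(entry.path, depth - 1)
    yield path, dir_size(path)


if __name__ == "__main__":
    for reported_path, size in du_dirs(".", depth=1):
        print(f"{size:>12}  {reported_path}")
```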
@@ -1745,39 +1855,42 @@ class Catalog:
         *,
         client_config=None,
     ) -> Iterator[str]:
-
+        with self.enlist_sources(
             sources,
             update,
             client_config=client_config or self.client_config,
-        )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    find_column_to_str(row, field_lookup, src, column)
-                    for column in columns
+        ) as matched_sources:
+            if matched_sources is None:
+                return
+
+            if not columns:
+                columns = ["path"]
+            field_set = set()
+            for column in columns:
+                if column == "du":
+                    field_set.add("dir_type")
+                    field_set.add("size")
+                    field_set.add("path")
+                elif column == "name":
+                    field_set.add("path")
+                elif column == "path":
+                    field_set.add("dir_type")
+                    field_set.add("path")
+                elif column == "size":
+                    field_set.add("size")
+                elif column == "type":
+                    field_set.add("dir_type")
+            fields = list(field_set)
+            field_lookup = {f: i for i, f in enumerate(fields)}
+            for src in matched_sources:
+                results = src.listing.find(
+                    src.node, fields, names, inames, paths, ipaths, size, typ
                 )
+                for row in results:
+                    yield "\t".join(
+                        find_column_to_str(row, field_lookup, src, column)
+                        for column in columns
+                    )

     def index(
         self,
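`find()` now maps each requested output column to the raw listing fields it needs and builds a field-to-index lookup before querying. A sketch of that mapping step; the column names mirror the diff, while the helper itself is illustrative:

```python
from __future__ import annotations


def fields_for_columns(columns: list[str]) -> tuple[list[str], dict[str, int]]:
    # Which raw listing fields each output column depends on (from the diff).
    needed = {
        "du": {"dir_type", "size", "path"},
        "name": {"path"},
        "path": {"dir_type", "path"},
        "size": {"size"},
        "type": {"dir_type"},
    }
    field_set: set[str] = set()
    for column in columns or ["path"]:
        field_set |= needed.get(column, set())
    # Sorted here for a stable order; the diff keeps whatever order the set yields.
    fields = sorted(field_set)
    return fields, {f: i for i, f in enumerate(fields)}


if __name__ == "__main__":
    fields, lookup = fields_for_columns(["name", "size", "type"])
    print(fields)   # ['dir_type', 'path', 'size']
    print(lookup)   # {'dir_type': 0, 'path': 1, 'size': 2}
```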
@@ -1786,9 +1899,10 @@ class Catalog:
         *,
         client_config=None,
     ) -> None:
-        self.enlist_sources(
+        with self.enlist_sources(
             sources,
             update,
             client_config=client_config or self.client_config,
             only_index=True,
-        )
+        ):
+            pass