datachain 0.14.2__py3-none-any.whl → 0.39.0__py3-none-any.whl

This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Files changed (137)
  1. datachain/__init__.py +20 -0
  2. datachain/asyn.py +11 -12
  3. datachain/cache.py +7 -7
  4. datachain/catalog/__init__.py +2 -2
  5. datachain/catalog/catalog.py +621 -507
  6. datachain/catalog/dependency.py +164 -0
  7. datachain/catalog/loader.py +28 -18
  8. datachain/checkpoint.py +43 -0
  9. datachain/cli/__init__.py +24 -33
  10. datachain/cli/commands/__init__.py +1 -8
  11. datachain/cli/commands/datasets.py +83 -52
  12. datachain/cli/commands/ls.py +17 -17
  13. datachain/cli/commands/show.py +4 -4
  14. datachain/cli/parser/__init__.py +8 -74
  15. datachain/cli/parser/job.py +95 -3
  16. datachain/cli/parser/studio.py +11 -4
  17. datachain/cli/parser/utils.py +1 -2
  18. datachain/cli/utils.py +2 -15
  19. datachain/client/azure.py +4 -4
  20. datachain/client/fsspec.py +45 -28
  21. datachain/client/gcs.py +6 -6
  22. datachain/client/hf.py +29 -2
  23. datachain/client/http.py +157 -0
  24. datachain/client/local.py +15 -11
  25. datachain/client/s3.py +17 -9
  26. datachain/config.py +4 -8
  27. datachain/data_storage/db_engine.py +12 -6
  28. datachain/data_storage/job.py +5 -1
  29. datachain/data_storage/metastore.py +1252 -186
  30. datachain/data_storage/schema.py +58 -45
  31. datachain/data_storage/serializer.py +105 -15
  32. datachain/data_storage/sqlite.py +286 -127
  33. datachain/data_storage/warehouse.py +250 -113
  34. datachain/dataset.py +353 -148
  35. datachain/delta.py +391 -0
  36. datachain/diff/__init__.py +27 -29
  37. datachain/error.py +60 -0
  38. datachain/func/__init__.py +2 -1
  39. datachain/func/aggregate.py +66 -42
  40. datachain/func/array.py +242 -38
  41. datachain/func/base.py +7 -4
  42. datachain/func/conditional.py +110 -60
  43. datachain/func/func.py +96 -45
  44. datachain/func/numeric.py +55 -38
  45. datachain/func/path.py +32 -20
  46. datachain/func/random.py +2 -2
  47. datachain/func/string.py +67 -37
  48. datachain/func/window.py +7 -8
  49. datachain/hash_utils.py +123 -0
  50. datachain/job.py +11 -7
  51. datachain/json.py +138 -0
  52. datachain/lib/arrow.py +58 -22
  53. datachain/lib/audio.py +245 -0
  54. datachain/lib/clip.py +14 -13
  55. datachain/lib/convert/flatten.py +5 -3
  56. datachain/lib/convert/python_to_sql.py +6 -10
  57. datachain/lib/convert/sql_to_python.py +8 -0
  58. datachain/lib/convert/values_to_tuples.py +156 -51
  59. datachain/lib/data_model.py +42 -20
  60. datachain/lib/dataset_info.py +36 -8
  61. datachain/lib/dc/__init__.py +8 -2
  62. datachain/lib/dc/csv.py +25 -28
  63. datachain/lib/dc/database.py +398 -0
  64. datachain/lib/dc/datachain.py +1289 -425
  65. datachain/lib/dc/datasets.py +320 -38
  66. datachain/lib/dc/hf.py +38 -24
  67. datachain/lib/dc/json.py +29 -32
  68. datachain/lib/dc/listings.py +112 -8
  69. datachain/lib/dc/pandas.py +16 -12
  70. datachain/lib/dc/parquet.py +35 -23
  71. datachain/lib/dc/records.py +31 -23
  72. datachain/lib/dc/storage.py +154 -64
  73. datachain/lib/dc/storage_pattern.py +251 -0
  74. datachain/lib/dc/utils.py +24 -16
  75. datachain/lib/dc/values.py +8 -9
  76. datachain/lib/file.py +622 -89
  77. datachain/lib/hf.py +69 -39
  78. datachain/lib/image.py +14 -14
  79. datachain/lib/listing.py +14 -11
  80. datachain/lib/listing_info.py +1 -2
  81. datachain/lib/meta_formats.py +3 -4
  82. datachain/lib/model_store.py +39 -7
  83. datachain/lib/namespaces.py +125 -0
  84. datachain/lib/projects.py +130 -0
  85. datachain/lib/pytorch.py +32 -21
  86. datachain/lib/settings.py +192 -56
  87. datachain/lib/signal_schema.py +427 -104
  88. datachain/lib/tar.py +1 -2
  89. datachain/lib/text.py +8 -7
  90. datachain/lib/udf.py +164 -76
  91. datachain/lib/udf_signature.py +60 -35
  92. datachain/lib/utils.py +118 -4
  93. datachain/lib/video.py +17 -9
  94. datachain/lib/webdataset.py +61 -56
  95. datachain/lib/webdataset_laion.py +15 -16
  96. datachain/listing.py +22 -10
  97. datachain/model/bbox.py +3 -1
  98. datachain/model/ultralytics/bbox.py +16 -12
  99. datachain/model/ultralytics/pose.py +16 -12
  100. datachain/model/ultralytics/segment.py +16 -12
  101. datachain/namespace.py +84 -0
  102. datachain/node.py +6 -6
  103. datachain/nodes_thread_pool.py +0 -1
  104. datachain/plugins.py +24 -0
  105. datachain/project.py +78 -0
  106. datachain/query/batch.py +40 -41
  107. datachain/query/dataset.py +604 -322
  108. datachain/query/dispatch.py +261 -154
  109. datachain/query/metrics.py +4 -6
  110. datachain/query/params.py +2 -3
  111. datachain/query/queue.py +3 -12
  112. datachain/query/schema.py +11 -6
  113. datachain/query/session.py +200 -33
  114. datachain/query/udf.py +34 -2
  115. datachain/remote/studio.py +171 -69
  116. datachain/script_meta.py +12 -12
  117. datachain/semver.py +68 -0
  118. datachain/sql/__init__.py +2 -0
  119. datachain/sql/functions/array.py +33 -1
  120. datachain/sql/postgresql_dialect.py +9 -0
  121. datachain/sql/postgresql_types.py +21 -0
  122. datachain/sql/sqlite/__init__.py +5 -1
  123. datachain/sql/sqlite/base.py +102 -29
  124. datachain/sql/sqlite/types.py +8 -13
  125. datachain/sql/types.py +70 -15
  126. datachain/studio.py +223 -46
  127. datachain/toolkit/split.py +31 -10
  128. datachain/utils.py +101 -59
  129. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/METADATA +77 -22
  130. datachain-0.39.0.dist-info/RECORD +173 -0
  131. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/WHEEL +1 -1
  132. datachain/cli/commands/query.py +0 -53
  133. datachain/query/utils.py +0 -42
  134. datachain-0.14.2.dist-info/RECORD +0 -158
  135. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
  136. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
  137. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
@@ -1,25 +1,18 @@
  import contextlib
+ import hashlib
  import inspect
  import logging
  import os
- import random
+ import secrets
  import string
  import subprocess
  import sys
  from abc import ABC, abstractmethod
- from collections.abc import Generator, Iterable, Iterator, Sequence
+ from collections.abc import Callable, Generator, Iterable, Iterator, Sequence
  from copy import copy
  from functools import wraps
- from secrets import token_hex
- from typing import (
- TYPE_CHECKING,
- Any,
- Callable,
- Optional,
- Protocol,
- TypeVar,
- Union,
- )
+ from types import GeneratorType
+ from typing import TYPE_CHECKING, Any, Protocol, TypeVar

  import attrs
  import sqlalchemy
@@ -28,7 +21,7 @@ from attrs import frozen
  from fsspec.callbacks import DEFAULT_CALLBACK, Callback, TqdmCallback
  from sqlalchemy import Column
  from sqlalchemy.sql import func as f
- from sqlalchemy.sql.elements import ColumnClause, ColumnElement
+ from sqlalchemy.sql.elements import ColumnClause, ColumnElement, Label
  from sqlalchemy.sql.expression import label
  from sqlalchemy.sql.schema import TableClause
  from sqlalchemy.sql.selectable import Select
@@ -41,51 +34,53 @@ from datachain.data_storage.schema import (
  partition_col_names,
  partition_columns,
  )
- from datachain.dataset import DATASET_PREFIX, DatasetStatus, RowDict
- from datachain.error import (
- DatasetNotFoundError,
- QueryScriptCancelError,
- )
+ from datachain.dataset import DatasetDependency, DatasetStatus, RowDict
+ from datachain.error import DatasetNotFoundError, QueryScriptCancelError
  from datachain.func.base import Function
- from datachain.lib.listing import (
- is_listing_dataset,
- listing_dataset_expired,
- )
+ from datachain.hash_utils import hash_column_elements
+ from datachain.lib.listing import is_listing_dataset, listing_dataset_expired
+ from datachain.lib.signal_schema import SignalSchema, generate_merge_root_mapping
  from datachain.lib.udf import UDFAdapter, _get_cache
  from datachain.progress import CombinedDownloadCallback, TqdmCombinedDownloadCallback
- from datachain.query.schema import C, UDFParamSpec, normalize_param
+ from datachain.project import Project
+ from datachain.query.schema import DEFAULT_DELIMITER, C, UDFParamSpec, normalize_param
  from datachain.query.session import Session
+ from datachain.query.udf import UdfInfo
  from datachain.sql.functions.random import rand
+ from datachain.sql.types import SQLType
  from datachain.utils import (
- batched,
  determine_processes,
+ determine_workers,
+ ensure_sequence,
  filtered_cloudpickle_dumps,
  get_datachain_executable,
  safe_closing,
  )

  if TYPE_CHECKING:
- from sqlalchemy.sql.elements import ClauseElement
+ from collections.abc import Mapping
+ from typing import Concatenate
+
+ from sqlalchemy.sql.elements import ClauseElement, KeyedColumnElement
  from sqlalchemy.sql.schema import Table
  from sqlalchemy.sql.selectable import GenerativeSelect
- from typing_extensions import Concatenate, ParamSpec, Self
+ from typing_extensions import ParamSpec, Self

  from datachain.catalog import Catalog
  from datachain.data_storage import AbstractWarehouse
  from datachain.dataset import DatasetRecord
  from datachain.lib.udf import UDFAdapter, UDFResult
- from datachain.query.udf import UdfInfo

  P = ParamSpec("P")


  INSERT_BATCH_SIZE = 10000

- PartitionByType = Union[
- Function, ColumnElement, Sequence[Union[Function, ColumnElement]]
- ]
- JoinPredicateType = Union[str, ColumnClause, ColumnElement]
- DatasetDependencyType = tuple[str, int]
+ PartitionByType = (
+ str | Function | ColumnElement | Sequence[str | Function | ColumnElement]
+ )
+ JoinPredicateType = str | ColumnClause | ColumnElement
+ DatasetDependencyType = tuple["DatasetRecord", str]

  logger = logging.getLogger("datachain")

@@ -165,24 +160,42 @@ class Step(ABC):
  ) -> "StepResult":
  """Apply the processing step."""

+ @abstractmethod
+ def hash_inputs(self) -> str:
+ """Calculates hash of step inputs"""
+
+ def hash(self) -> str:
+ """
+ Calculates hash for step which includes step name and hash of it's inputs
+ """
+ return hashlib.sha256(
+ f"{self.__class__.__name__}|{self.hash_inputs()}".encode()
+ ).hexdigest()
+

  @frozen
  class QueryStep:
+ """A query that returns all rows from specific dataset version"""
+
  catalog: "Catalog"
- dataset_name: str
- dataset_version: int
+ dataset: "DatasetRecord"
+ dataset_version: str

- def apply(self):
+ def apply(self) -> "StepResult":
  def q(*columns):
  return sqlalchemy.select(*columns)

- dataset = self.catalog.get_dataset(self.dataset_name)
- dr = self.catalog.warehouse.dataset_rows(dataset, self.dataset_version)
+ dr = self.catalog.warehouse.dataset_rows(self.dataset, self.dataset_version)

  return step_result(
- q, dr.columns, dependencies=[(self.dataset_name, self.dataset_version)]
+ q, dr.columns, dependencies=[(self.dataset, self.dataset_version)]
  )

+ def hash(self) -> str:
+ return hashlib.sha256(
+ self.dataset.uri(self.dataset_version).encode()
+ ).hexdigest()
+

  def generator_then_call(generator, func: Callable):
  """
@@ -218,8 +231,9 @@ class DatasetDiffOperation(Step):

  def apply(self, query_generator, temp_tables: list[str]) -> "StepResult":
  source_query = query_generator.exclude(("sys__id",))
+ right_before = len(self.dq.temp_table_names)
  target_query = self.dq.apply_steps().select()
- temp_tables.extend(self.dq.temp_table_names)
+ temp_tables.extend(self.dq.temp_table_names[right_before:])

  # creating temp table that will hold subtract results
  temp_table_name = self.catalog.warehouse.temp_table_name()
@@ -253,6 +267,13 @@ class DatasetDiffOperation(Step):
  class Subtract(DatasetDiffOperation):
  on: Sequence[tuple[str, str]]

+ def hash_inputs(self) -> str:
+ on_bytes = b"".join(
+ f"{a}:{b}".encode() for a, b in sorted(self.on, key=lambda t: (t[0], t[1]))
+ )
+
+ return hashlib.sha256(bytes.fromhex(self.dq.hash()) + on_bytes).hexdigest()
+
  def query(self, source_query: Select, target_query: Select) -> sa.Selectable:
  sq = source_query.alias("source_query")
  tq = target_query.alias("target_query")
@@ -272,7 +293,9 @@ class Subtract(DatasetDiffOperation):


  def adjust_outputs(
- warehouse: "AbstractWarehouse", row: dict[str, Any], udf_col_types: list[tuple]
+ warehouse: "AbstractWarehouse",
+ row: dict[str, Any],
+ col_types: list[tuple[str, SQLType, type, str, Any]],
  ) -> dict[str, Any]:
  """
  This function does a couple of things to prepare a row for inserting into the db:
@@ -288,7 +311,7 @@ def adjust_outputs(
  col_python_type,
  col_type_name,
  default_value,
- ) in udf_col_types:
+ ) in col_types:
  row_val = row.get(col_name)

  # Fill None or missing values with defaults (get returns None if not in the row)
@@ -303,8 +326,10 @@ def adjust_outputs(
  return row


- def get_udf_col_types(warehouse: "AbstractWarehouse", udf: "UDFAdapter") -> list[tuple]:
- """Optimization: Precompute UDF column types so these don't have to be computed
+ def get_col_types(
+ warehouse: "AbstractWarehouse", output: "Mapping[str, Any]"
+ ) -> list[tuple]:
+ """Optimization: Precompute column types so these don't have to be computed
  in the convert_type function for each row in a loop."""
  dialect = warehouse.db.dialect
  return [
@@ -316,7 +341,7 @@ def get_udf_col_types(warehouse: "AbstractWarehouse", udf: "UDFAdapter") -> list
  type(col_type_inst).__name__,
  col_type.default_value(dialect),
  )
- for col_name, col_type in udf.output.items()
+ for col_name, col_type in output.items()
  ]

@@ -325,33 +350,23 @@ def process_udf_outputs(
  udf_table: "Table",
  udf_results: Iterator[Iterable["UDFResult"]],
  udf: "UDFAdapter",
- batch_size: int = INSERT_BATCH_SIZE,
  cb: Callback = DEFAULT_CALLBACK,
+ batch_size: int = INSERT_BATCH_SIZE,
  ) -> None:
- import psutil
-
- rows: list[UDFResult] = []
  # Optimization: Compute row types once, rather than for every row.
- udf_col_types = get_udf_col_types(warehouse, udf)
+ udf_col_types = get_col_types(warehouse, udf.output)

- for udf_output in udf_results:
- if not udf_output:
- continue
- with safe_closing(udf_output):
- for row in udf_output:
- cb.relative_update()
- rows.append(adjust_outputs(warehouse, row, udf_col_types))
- if len(rows) >= batch_size or (
- len(rows) % 10 == 0 and psutil.virtual_memory().percent > 80
- ):
- for row_chunk in batched(rows, batch_size):
- warehouse.insert_rows(udf_table, row_chunk)
- rows.clear()
+ def _insert_rows():
+ for udf_output in udf_results:
+ if not udf_output:
+ continue

- if rows:
- for row_chunk in batched(rows, batch_size):
- warehouse.insert_rows(udf_table, row_chunk)
+ with safe_closing(udf_output):
+ for row in udf_output:
+ cb.relative_update()
+ yield adjust_outputs(warehouse, row, udf_col_types)

+ warehouse.insert_rows(udf_table, _insert_rows(), batch_size=batch_size)
  warehouse.insert_rows_done(udf_table)


@@ -387,20 +402,34 @@ def get_generated_callback(is_generator: bool = False) -> Callback:
  class UDFStep(Step, ABC):
  udf: "UDFAdapter"
  catalog: "Catalog"
- partition_by: Optional[PartitionByType] = None
- parallel: Optional[int] = None
- workers: Union[bool, int] = False
- min_task_size: Optional[int] = None
+ partition_by: PartitionByType | None = None
  is_generator = False
+ # Parameters from Settings
  cache: bool = False
+ parallel: int | None = None
+ workers: bool | int = False
+ min_task_size: int | None = None
+ batch_size: int | None = None
+
+ def hash_inputs(self) -> str:
+ partition_by = ensure_sequence(self.partition_by or [])
+ parts = [
+ bytes.fromhex(self.udf.hash()),
+ bytes.fromhex(hash_column_elements(partition_by)),
+ str(self.is_generator).encode(),
+ ]
+
+ return hashlib.sha256(b"".join(parts)).hexdigest()

  @abstractmethod
  def create_udf_table(self, query: Select) -> "Table":
  """Method that creates a table where temp udf results will be saved"""

  def process_input_query(self, query: Select) -> tuple[Select, list["Table"]]:
- """Apply any necessary processing to the input query"""
- return query, []
+ """Materialize inputs, ensure sys columns are available, needed for checkpoints,
+ needed for map to work (merge results)"""
+ table = self.catalog.warehouse.create_pre_udf_table(query)
+ return sqlalchemy.select(*table.c), [table]

  @abstractmethod
  def create_result_query(
@@ -412,28 +441,48 @@ class UDFStep(Step, ABC):
  """

  def populate_udf_table(self, udf_table: "Table", query: Select) -> None:
+ if (rows_total := self.catalog.warehouse.query_count(query)) == 0:
+ return
+
  from datachain.catalog import QUERY_SCRIPT_CANCELED_EXIT_CODE
+ from datachain.catalog.loader import (
+ DISTRIBUTED_IMPORT_PATH,
+ get_udf_distributor_class,
+ )
+
+ workers = determine_workers(self.workers, rows_total=rows_total)
+ processes = determine_processes(self.parallel, rows_total=rows_total)

  use_partitioning = self.partition_by is not None
  batching = self.udf.get_batching(use_partitioning)
- workers = self.workers
- if (
- not workers
- and os.environ.get("DATACHAIN_DISTRIBUTED")
- and os.environ.get("DATACHAIN_SETTINGS_WORKERS")
- ):
- # Enable distributed processing by default if the module is available,
- # and a default number of workers is provided.
- workers = True
-
- processes = determine_processes(self.parallel)
-
  udf_fields = [str(c.name) for c in query.selected_columns]
+ udf_distributor_class = get_udf_distributor_class()

  prefetch = self.udf.prefetch
  with _get_cache(self.catalog.cache, prefetch, use_cache=self.cache) as _cache:
  catalog = clone_catalog_with_cache(self.catalog, _cache)
+
  try:
+ if udf_distributor_class and not catalog.in_memory:
+ # Use the UDF distributor if available (running in SaaS)
+ udf_distributor = udf_distributor_class(
+ catalog=catalog,
+ table=udf_table,
+ query=query,
+ udf_data=filtered_cloudpickle_dumps(self.udf),
+ batching=batching,
+ workers=workers,
+ processes=processes,
+ udf_fields=udf_fields,
+ rows_total=rows_total,
+ use_cache=self.cache,
+ is_generator=self.is_generator,
+ min_task_size=self.min_task_size,
+ batch_size=self.batch_size,
+ )
+ udf_distributor()
+ return
+
  if workers:
  if catalog.in_memory:
  raise RuntimeError(
@@ -441,43 +490,33 @@ class UDFStep(Step, ABC):
  "distributed processing."
  )

- from datachain.catalog.loader import get_distributed_class
-
- distributor = get_distributed_class(
- min_task_size=self.min_task_size
+ raise RuntimeError(
+ f"{DISTRIBUTED_IMPORT_PATH} import path is required "
+ "for distributed UDF processing."
  )
- distributor(
- self.udf,
- catalog,
- udf_table,
- query,
- workers,
- processes,
- udf_fields=udf_fields,
- is_generator=self.is_generator,
- use_partitioning=use_partitioning,
- cache=self.cache,
- )
- elif processes:
+ if processes:
  # Parallel processing (faster for more CPU-heavy UDFs)
  if catalog.in_memory:
  raise RuntimeError(
  "In-memory databases cannot be used "
  "with parallel processing."
  )
- udf_info: UdfInfo = {
- "udf_data": filtered_cloudpickle_dumps(self.udf),
- "catalog_init": catalog.get_init_params(),
- "metastore_clone_params": catalog.metastore.clone_params(),
- "warehouse_clone_params": catalog.warehouse.clone_params(),
- "table": udf_table,
- "query": query,
- "udf_fields": udf_fields,
- "batching": batching,
- "processes": processes,
- "is_generator": self.is_generator,
- "cache": self.cache,
- }
+
+ udf_info = UdfInfo(
+ udf_data=filtered_cloudpickle_dumps(self.udf),
+ catalog_init=catalog.get_init_params(),
+ metastore_clone_params=catalog.metastore.clone_params(),
+ warehouse_clone_params=catalog.warehouse.clone_params(),
+ table=udf_table,
+ query=query,
+ udf_fields=udf_fields,
+ batching=batching,
+ processes=processes,
+ is_generator=self.is_generator,
+ cache=self.cache,
+ rows_total=rows_total,
+ batch_size=self.batch_size or INSERT_BATCH_SIZE,
+ )

  # Run the UDFDispatcher in another process to avoid needing
  # if __name__ == '__main__': in user scripts
@@ -490,7 +529,12 @@ class UDFStep(Step, ABC):
  with subprocess.Popen( # noqa: S603
  cmd, env=envs, stdin=subprocess.PIPE
  ) as process:
- process.communicate(process_data)
+ try:
+ process.communicate(process_data)
+ except KeyboardInterrupt:
+ raise QueryScriptCancelError(
+ "UDF execution was canceled by the user."
+ ) from None
  if retval := process.poll():
  raise RuntimeError(
  f"UDF Execution Failed! Exit code: {retval}"
@@ -520,6 +564,7 @@ class UDFStep(Step, ABC):
  udf_results,
  self.udf,
  cb=generated_cb,
+ batch_size=self.batch_size or INSERT_BATCH_SIZE,
  )
  finally:
  download_cb.close()
@@ -538,10 +583,13 @@ class UDFStep(Step, ABC):
  """
  Create temporary table with group by partitions.
  """
- assert self.partition_by is not None
+ if self.partition_by is None:
+ raise RuntimeError("Query must have partition_by set to use partitioning")
+ if (id_col := query.selected_columns.get("sys__id")) is None:
+ raise RuntimeError("Query must have sys__id column to use partitioning")

- if isinstance(self.partition_by, Sequence):
- list_partition_by = self.partition_by
+ if isinstance(self.partition_by, (list, tuple, GeneratorType)):
+ list_partition_by = list(self.partition_by)
  else:
  list_partition_by = [self.partition_by]

@@ -554,16 +602,19 @@ class UDFStep(Step, ABC):

  # fill table with partitions
  cols = [
- query.selected_columns.sys__id,
+ id_col,
  f.dense_rank().over(order_by=partition_by).label(PARTITION_COLUMN_ID),
  ]
  self.catalog.warehouse.db.execute(
- tbl.insert().from_select(cols, query.with_only_columns(*cols))
+ tbl.insert().from_select(
+ cols,
+ query.offset(None).limit(None).with_only_columns(*cols),
+ )
  )

  return tbl

- def clone(self, partition_by: Optional[PartitionByType] = None) -> "Self":
+ def clone(self, partition_by: PartitionByType | None = None) -> "Self":
  if partition_by is not None:
  return self.__class__(
  self.udf,
@@ -572,27 +623,25 @@ class UDFStep(Step, ABC):
  parallel=self.parallel,
  workers=self.workers,
  min_task_size=self.min_task_size,
+ batch_size=self.batch_size,
  )
  return self.__class__(self.udf, self.catalog)

  def apply(
  self, query_generator: QueryGenerator, temp_tables: list[str]
  ) -> "StepResult":
- _query = query = query_generator.select()
+ query, tables = self.process_input_query(query_generator.select())
+ _query = query

  # Apply partitioning if needed.
  if self.partition_by is not None:
  partition_tbl = self.create_partitions_table(query)
- temp_tables.append(partition_tbl.name)
+ query = query.outerjoin(
+ partition_tbl,
+ partition_tbl.c.sys__id == query.selected_columns.sys__id,
+ ).add_columns(*partition_columns())
+ tables = [*tables, partition_tbl]

- subq = query.subquery()
- query = (
- sqlalchemy.select(*subq.c)
- .outerjoin(partition_tbl, partition_tbl.c.sys__id == subq.c.sys__id)
- .add_columns(*partition_columns())
- )
-
- query, tables = self.process_input_query(query)
  temp_tables.extend(t.name for t in tables)
  udf_table = self.create_udf_table(_query)
  temp_tables.append(udf_table.name)
@@ -604,7 +653,16 @@ class UDFStep(Step, ABC):

  @frozen
  class UDFSignal(UDFStep):
+ udf: "UDFAdapter"
+ catalog: "Catalog"
+ partition_by: PartitionByType | None = None
  is_generator = False
+ # Parameters from Settings
+ cache: bool = False
+ parallel: int | None = None
+ workers: bool | int = False
+ min_task_size: int | None = None
+ batch_size: int | None = None

  def create_udf_table(self, query: Select) -> "Table":
  udf_output_columns: list[sqlalchemy.Column[Any]] = [
@@ -614,13 +672,6 @@ class UDFSignal(UDFStep):

  return self.catalog.warehouse.create_udf_table(udf_output_columns)

- def process_input_query(self, query: Select) -> tuple[Select, list["Table"]]:
- if os.getenv("DATACHAIN_DISABLE_QUERY_CACHE", "") not in ("", "0"):
- return query, []
- table = self.catalog.warehouse.create_pre_udf_table(query)
- q: Select = sqlalchemy.select(*table.c)
- return q, [table]
-
  def create_result_query(
  self, udf_table, query
  ) -> tuple[QueryGeneratorFunc, list["sqlalchemy.Column"]]:
@@ -628,15 +679,30 @@ class UDFSignal(UDFStep):
  original_cols = [c for c in subq.c if c.name not in partition_col_names]

  # new signal columns that are added to udf_table
- signal_cols = [c for c in udf_table.c if c.name != "sys__id"]
+ signal_cols = [c for c in udf_table.c if not c.name.startswith("sys__")]
  signal_name_cols = {c.name: c for c in signal_cols}
  cols = signal_cols

- overlap = {c.name for c in original_cols} & {c.name for c in cols}
+ original_names = {c.name for c in original_cols}
+ new_names = {c.name for c in cols}
+
+ overlap = original_names & new_names
  if overlap:
  raise ValueError(
  "Column already exists or added in the previous steps: "
- + ", ".join(overlap)
+ + ", ".join(sorted(overlap))
+ )
+
+ def _root(name: str) -> str:
+ return name.split(DEFAULT_DELIMITER, 1)[0]
+
+ existing_roots = {_root(name) for name in original_names}
+ new_roots = {_root(name) for name in new_names}
+ root_conflicts = existing_roots & new_roots
+ if root_conflicts:
+ raise ValueError(
+ "Signals already exist in the previous steps: "
+ + ", ".join(sorted(root_conflicts))
  )

  def q(*columns):
@@ -674,7 +740,16 @@ class UDFSignal(UDFStep):
  class RowGenerator(UDFStep):
  """Extend dataset with new rows."""

+ udf: "UDFAdapter"
+ catalog: "Catalog"
+ partition_by: PartitionByType | None = None
  is_generator = True
+ # Parameters from Settings
+ cache: bool = False
+ parallel: int | None = None
+ workers: bool | int = False
+ min_task_size: int | None = None
+ batch_size: int | None = None

  def create_udf_table(self, query: Select) -> "Table":
  warehouse = self.catalog.warehouse
@@ -721,18 +796,42 @@ class SQLClause(Step, ABC):

  def parse_cols(
  self,
- cols: Sequence[Union[Function, ColumnElement]],
+ cols: Sequence[Function | ColumnElement],
  ) -> tuple[ColumnElement, ...]:
  return tuple(c.get_column() if isinstance(c, Function) else c for c in cols)

  @abstractmethod
- def apply_sql_clause(self, query):
+ def apply_sql_clause(self, query: Any) -> Any:
  pass


+ @frozen
+ class RegenerateSystemColumns(Step):
+ catalog: "Catalog"
+
+ def hash_inputs(self) -> str:
+ return hashlib.sha256(b"regenerate_system_columns").hexdigest()
+
+ def apply(
+ self, query_generator: QueryGenerator, temp_tables: list[str]
+ ) -> StepResult:
+ query = query_generator.select()
+ new_query = self.catalog.warehouse._regenerate_system_columns(
+ query, keep_existing_columns=True
+ )
+
+ def q(*columns):
+ return new_query.with_only_columns(*columns)
+
+ return step_result(q, new_query.selected_columns)
+
+
  @frozen
  class SQLSelect(SQLClause):
- args: tuple[Union[Function, ColumnElement], ...]
+ args: tuple[Function | ColumnElement, ...]
+
+ def hash_inputs(self) -> str:
+ return hash_column_elements(self.args)

  def apply_sql_clause(self, query) -> Select:
  subquery = query.subquery()
@@ -748,7 +847,10 @@ class SQLSelect(SQLClause):

  @frozen
  class SQLSelectExcept(SQLClause):
- args: tuple[Union[Function, ColumnElement], ...]
+ args: tuple[Function | ColumnElement, ...]
+
+ def hash_inputs(self) -> str:
+ return hash_column_elements(self.args)

  def apply_sql_clause(self, query: Select) -> Select:
  subquery = query.subquery()
@@ -758,33 +860,43 @@ class SQLSelectExcept(SQLClause):

  @frozen
  class SQLMutate(SQLClause):
- args: tuple[Union[Function, ColumnElement], ...]
+ args: tuple[Label, ...]
+ new_schema: SignalSchema
+
+ def hash_inputs(self) -> str:
+ return hash_column_elements(self.args)

  def apply_sql_clause(self, query: Select) -> Select:
  original_subquery = query.subquery()
- args = [
- original_subquery.c[str(c)] if isinstance(c, (str, C)) else c
- for c in self.parse_cols(self.args)
- ]
- to_mutate = {c.name for c in args}
+ to_mutate = {c.name for c in self.args}

- prefix = f"mutate{token_hex(8)}_"
- cols = [
- c.label(prefix + c.name) if c.name in to_mutate else c
+ # Drop the original versions to avoid name collisions, exclude renamed
+ # columns. Always keep system columns (sys__*) if they exist in original query
+ new_schema_columns = set(self.new_schema.db_signals())
+ base_cols = [
+ c
  for c in original_subquery.c
+ if c.name not in to_mutate
+ and (c.name in new_schema_columns or c.name.startswith("sys__"))
  ]
- # this is needed for new column to be used in clauses
- # like ORDER BY, otherwise new column is not recognized
- subquery = (
- sqlalchemy.select(*cols, *args).select_from(original_subquery).subquery()
+
+ # Create intermediate subquery to properly handle window functions
+ intermediate_query = sqlalchemy.select(*base_cols, *self.args).select_from(
+ original_subquery
  )
+ intermediate_subquery = intermediate_query.subquery()

- return sqlalchemy.select(*subquery.c).select_from(subquery)
+ return sqlalchemy.select(*intermediate_subquery.c).select_from(
+ intermediate_subquery
+ )


  @frozen
  class SQLFilter(SQLClause):
- expressions: tuple[Union[Function, ColumnElement], ...]
+ expressions: tuple[Function | ColumnElement, ...]
+
+ def hash_inputs(self) -> str:
+ return hash_column_elements(self.expressions)

  def __and__(self, other):
  expressions = self.parse_cols(self.expressions)
@@ -797,7 +909,10 @@ class SQLFilter(SQLClause):

  @frozen
  class SQLOrderBy(SQLClause):
- args: tuple[Union[Function, ColumnElement], ...]
+ args: tuple[Function | ColumnElement, ...]
+
+ def hash_inputs(self) -> str:
+ return hash_column_elements(self.args)

  def apply_sql_clause(self, query: Select) -> Select:
  args = self.parse_cols(self.args)
@@ -808,6 +923,9 @@ class SQLOrderBy(SQLClause):
  class SQLLimit(SQLClause):
  n: int

+ def hash_inputs(self) -> str:
+ return hashlib.sha256(str(self.n).encode()).hexdigest()
+
  def apply_sql_clause(self, query: Select) -> Select:
  return query.limit(self.n)

@@ -816,12 +934,18 @@ class SQLLimit(SQLClause):
  class SQLOffset(SQLClause):
  offset: int

+ def hash_inputs(self) -> str:
+ return hashlib.sha256(str(self.offset).encode()).hexdigest()
+
  def apply_sql_clause(self, query: "GenerativeSelect"):
  return query.offset(self.offset)


  @frozen
  class SQLCount(SQLClause):
+ def hash_inputs(self) -> str:
+ return ""
+
  def apply_sql_clause(self, query):
  return sqlalchemy.select(f.count(1)).select_from(query.subquery())

@@ -831,6 +955,9 @@ class SQLDistinct(SQLClause):
  args: tuple[ColumnElement, ...]
  dialect: str

+ def hash_inputs(self) -> str:
+ return hash_column_elements(self.args)
+
  def apply_sql_clause(self, query):
  if self.dialect == "sqlite":
  return query.group_by(*self.args)
@@ -843,24 +970,34 @@ class SQLUnion(Step):
  query1: "DatasetQuery"
  query2: "DatasetQuery"

+ def hash_inputs(self) -> str:
+ return hashlib.sha256(
+ bytes.fromhex(self.query1.hash()) + bytes.fromhex(self.query2.hash())
+ ).hexdigest()
+
  def apply(
  self, query_generator: QueryGenerator, temp_tables: list[str]
  ) -> StepResult:
+ left_before = len(self.query1.temp_table_names)
  q1 = self.query1.apply_steps().select().subquery()
- temp_tables.extend(self.query1.temp_table_names)
+ temp_tables.extend(self.query1.temp_table_names[left_before:])
+ right_before = len(self.query2.temp_table_names)
  q2 = self.query2.apply_steps().select().subquery()
- temp_tables.extend(self.query2.temp_table_names)
+ temp_tables.extend(self.query2.temp_table_names[right_before:])

- columns1, columns2 = _order_columns(q1.columns, q2.columns)
+ columns1 = _drop_system_columns(q1.columns)
+ columns2 = _drop_system_columns(q2.columns)
+ columns1, columns2 = _order_columns(columns1, columns2)

  def q(*columns):
- names = {c.name for c in columns}
- col1 = [c for c in columns1 if c.name in names]
- col2 = [c for c in columns2 if c.name in names]
- res = sqlalchemy.select(*col1).union_all(sqlalchemy.select(*col2))
+ selected_names = [c.name for c in columns]
+ col1 = [c for c in columns1 if c.name in selected_names]
+ col2 = [c for c in columns2 if c.name in selected_names]
+ union_query = sqlalchemy.select(*col1).union_all(sqlalchemy.select(*col2))

- subquery = res.subquery()
- return sqlalchemy.select(*subquery.c).select_from(subquery)
+ union_cte = union_query.cte()
+ select_cols = [union_cte.c[name] for name in selected_names]
+ return sqlalchemy.select(*select_cols)

  return step_result(
  q,
@@ -874,14 +1011,42 @@ class SQLJoin(Step):
  catalog: "Catalog"
  query1: "DatasetQuery"
  query2: "DatasetQuery"
- predicates: Union[JoinPredicateType, tuple[JoinPredicateType, ...]]
+ predicates: JoinPredicateType | tuple[JoinPredicateType, ...]
  inner: bool
  full: bool
  rname: str

+ @staticmethod
+ def _split_db_name(name: str) -> tuple[str, str]:
+ if DEFAULT_DELIMITER in name:
+ head, tail = name.split(DEFAULT_DELIMITER, 1)
+ return head, tail
+ return name, ""
+
+ @classmethod
+ def _root_name(cls, name: str) -> str:
+ return cls._split_db_name(name)[0]
+
+ def hash_inputs(self) -> str:
+ predicates = (
+ ensure_sequence(self.predicates) if self.predicates is not None else []
+ )
+
+ parts = [
+ bytes.fromhex(self.query1.hash()),
+ bytes.fromhex(self.query2.hash()),
+ bytes.fromhex(hash_column_elements(predicates)),
+ str(self.inner).encode(),
+ str(self.full).encode(),
+ self.rname.encode("utf-8"),
+ ]
+
+ return hashlib.sha256(b"".join(parts)).hexdigest()
+
  def get_query(self, dq: "DatasetQuery", temp_tables: list[str]) -> sa.Subquery:
+ temp_tables_before = len(dq.temp_table_names)
  query = dq.apply_steps().select()
- temp_tables.extend(dq.temp_table_names)
+ temp_tables.extend(dq.temp_table_names[temp_tables_before:])

  if not any(isinstance(step, (SQLJoin, SQLUnion)) for step in dq.steps):
  return query.subquery(dq.table.name)
@@ -937,22 +1102,39 @@ class SQLJoin(Step):
  q1 = self.get_query(self.query1, temp_tables)
  q2 = self.get_query(self.query2, temp_tables)

- q1_columns = list(q1.c)
- q1_column_names = {c.name for c in q1_columns}
-
- q2_columns = []
- for c in q2.c:
- if c.name.startswith("sys__"):
+ q1_columns = _drop_system_columns(q1.c)
+ existing_column_names = {c.name for c in q1_columns}
+ right_columns: list[KeyedColumnElement[Any]] = []
+ right_column_names: list[str] = []
+ for column in q2.c:
+ if column.name.startswith("sys__"):
  continue
+ right_columns.append(column)
+ right_column_names.append(column.name)
+
+ root_mapping = generate_merge_root_mapping(
+ existing_column_names,
+ right_column_names,
+ extract_root=self._root_name,
+ prefix=self.rname,
+ )
+
+ q2_columns: list[KeyedColumnElement[Any]] = []
+ for column in right_columns:
+ original_name = column.name
+ column_root, column_tail = self._split_db_name(original_name)
+ mapped_root = root_mapping[column_root]
+
+ new_name = (
+ mapped_root
+ if not column_tail
+ else DEFAULT_DELIMITER.join([mapped_root, column_tail])
+ )
+
+ if new_name != original_name:
+ column = column.label(new_name)

- if c.name in q1_column_names:
- new_name = self.rname.format(name=c.name)
- new_name_idx = 0
- while new_name in q1_column_names:
- new_name_idx += 1
- new_name = self.rname.format(name=f"{c.name}_{new_name_idx}")
- c = c.label(new_name)
- q2_columns.append(c)
+ q2_columns.append(column)

  res_columns = q1_columns + q2_columns
  predicates = (
@@ -997,8 +1179,15 @@ class SQLJoin(Step):

  @frozen
  class SQLGroupBy(SQLClause):
- cols: Sequence[Union[str, Function, ColumnElement]]
- group_by: Sequence[Union[str, Function, ColumnElement]]
+ cols: Sequence[str | Function | ColumnElement]
+ group_by: Sequence[str | Function | ColumnElement]
+
+ def hash_inputs(self) -> str:
+ return hashlib.sha256(
+ bytes.fromhex(
+ hash_column_elements(self.cols) + hash_column_elements(self.group_by)
+ )
+ ).hexdigest()

  def apply_sql_clause(self, query) -> Select:
  if not self.cols:
@@ -1010,58 +1199,70 @@ class SQLGroupBy(SQLClause):
  c.get_column() if isinstance(c, Function) else c for c in self.group_by
  ]

- cols = [
- c.get_column()
- if isinstance(c, Function)
- else subquery.c[str(c)]
- if isinstance(c, (str, C))
- else c
- for c in (*group_by, *self.cols)
- ]
+ cols_dict: dict[str, Any] = {}
+ for c in (*group_by, *self.cols):
+ if isinstance(c, Function):
+ key = c.name
+ value = c.get_column()
+ elif isinstance(c, (str, C)):
+ key = str(c)
+ value = subquery.c[str(c)]
+ else:
+ key = c.name
+ value = c # type: ignore[assignment]
+ cols_dict[key] = value

- return sqlalchemy.select(*cols).select_from(subquery).group_by(*group_by)
+ unique_cols = cols_dict.values()

+ return sqlalchemy.select(*unique_cols).select_from(subquery).group_by(*group_by)

- def _validate_columns(
- left_columns: Iterable[ColumnElement], right_columns: Iterable[ColumnElement]
- ) -> set[str]:
- left_names = {c.name for c in left_columns}
- right_names = {c.name for c in right_columns}
-
- if left_names == right_names:
- return left_names
-
- missing_right = left_names - right_names
- missing_left = right_names - left_names
-
- def _prepare_msg_part(missing_columns: set[str], side: str) -> str:
- return f"{', '.join(sorted(missing_columns))} only present in {side}"
-
- msg_parts = [
- _prepare_msg_part(missing_columns, found_side)
- for missing_columns, found_side in zip(
- [
- missing_right,
- missing_left,
- ],
- ["left", "right"],
- )
- if missing_columns
- ]
- msg = f"Cannot perform union. {'. '.join(msg_parts)}"

- raise ValueError(msg)
+ class UnionSchemaMismatchError(ValueError):
+ """Union input columns mismatch."""
+
+ @classmethod
+ def from_column_sets(
+ cls,
+ missing_left: set[str],
+ missing_right: set[str],
+ ) -> "UnionSchemaMismatchError":
+ def _describe(cols: set[str], side: str) -> str:
+ return f"{', '.join(sorted(cols))} only present in {side}"
+
+ parts = []
+ if missing_left:
+ parts.append(_describe(missing_left, "left"))
+ if missing_right:
+ parts.append(_describe(missing_right, "right"))
+
+ return cls(f"Cannot perform union. {'. '.join(parts)}")


  def _order_columns(
  left_columns: Iterable[ColumnElement], right_columns: Iterable[ColumnElement]
  ) -> list[list[ColumnElement]]:
- column_order = _validate_columns(left_columns, right_columns)
+ left_names = [c.name for c in left_columns]
+ right_names = [c.name for c in right_columns]
+
+ # validate
+ if sorted(left_names) != sorted(right_names):
+ left_names_set = set(left_names)
+ right_names_set = set(right_names)
+ raise UnionSchemaMismatchError.from_column_sets(
+ left_names_set - right_names_set,
+ right_names_set - left_names_set,
+ )
+
+ # Order columns to match left_names order
  column_dicts = [
  {c.name: c for c in columns} for columns in [left_columns, right_columns]
  ]

- return [[d[n] for n in column_order] for d in column_dicts]
+ return [[d[n] for n in left_names] for d in column_dicts]
+
+
+ def _drop_system_columns(columns: Iterable[ColumnElement]) -> list[ColumnElement]:
+ return [c for c in columns if not c.name.startswith("sys__")]


  @attrs.define
@@ -1077,62 +1278,71 @@ class DatasetQuery:
  def __init__(
  self,
  name: str,
- version: Optional[int] = None,
- catalog: Optional["Catalog"] = None,
- session: Optional[Session] = None,
- indexing_column_types: Optional[dict[str, Any]] = None,
+ version: str | None = None,
+ project_name: str | None = None,
+ namespace_name: str | None = None,
+ catalog: "Catalog | None" = None,
+ session: Session | None = None,
  in_memory: bool = False,
- fallback_to_studio: bool = True,
  update: bool = False,
  ) -> None:
- from datachain.remote.studio import is_token_set
-
  self.session = Session.get(session, catalog=catalog, in_memory=in_memory)
  self.catalog = catalog or self.session.catalog
  self.steps: list[Step] = []
- self._chunk_index: Optional[int] = None
- self._chunk_total: Optional[int] = None
+ self._chunk_index: int | None = None
+ self._chunk_total: int | None = None
  self.temp_table_names: list[str] = []
  self.dependencies: set[DatasetDependencyType] = set()
  self.table = self.get_table()
- self.starting_step: Optional[QueryStep] = None
- self.name: Optional[str] = None
- self.version: Optional[int] = None
- self.feature_schema: Optional[dict] = None
- self.column_types: Optional[dict[str, Any]] = None
+ self.starting_step: QueryStep | None = None
+ self.name: str | None = None
+ self.version: str | None = None
+ self.feature_schema: dict | None = None
+ self.column_types: dict[str, Any] | None = None
  self.before_steps: list[Callable] = []
- self.listing_fn: Optional[Callable] = None
+ self.listing_fn: Callable | None = None
  self.update = update

- self.list_ds_name: Optional[str] = None
+ self.list_ds_name: str | None = None

  self.name = name
  self.dialect = self.catalog.warehouse.db.dialect
  if version:
  self.version = version

- if is_listing_dataset(name):
+ if namespace_name is None:
+ namespace_name = self.catalog.metastore.default_namespace_name
+ if project_name is None:
+ project_name = self.catalog.metastore.default_project_name
+
+ if is_listing_dataset(name) and not version:
  # not setting query step yet as listing dataset might not exist at
  # this point
  self.list_ds_name = name
- elif fallback_to_studio and is_token_set():
+ else:
  self._set_starting_step(
- self.catalog.get_dataset_with_remote_fallback(name, version)
+ self.catalog.get_dataset_with_remote_fallback(
+ name,
+ namespace_name=namespace_name,
+ project_name=project_name,
+ version=version,
+ pull_dataset=True,
+ update=update,
+ )
  )
- else:
- self._set_starting_step(self.catalog.get_dataset(name))

  def _set_starting_step(self, ds: "DatasetRecord") -> None:
  if not self.version:
  self.version = ds.latest_version

- self.starting_step = QueryStep(self.catalog, ds.name, self.version)
+ self.starting_step = QueryStep(self.catalog, ds, self.version)

  # at this point we know our starting dataset so setting up schemas
  self.feature_schema = ds.get_version(self.version).feature_schema
  self.column_types = copy(ds.schema)
  if "sys__id" in self.column_types:
  self.column_types.pop("sys__id")
+ self.project = ds.project

  def __iter__(self):
  return iter(self.db_results())
@@ -1140,39 +1350,28 @@ class DatasetQuery:
  def __or__(self, other):
  return self.union(other)

- def pull_dataset(self, name: str, version: Optional[int] = None) -> "DatasetRecord":
- print("Dataset not found in local catalog, trying to get from studio")
-
- remote_ds_uri = f"{DATASET_PREFIX}{name}"
- if version:
- remote_ds_uri += f"@v{version}"
+ def hash(self) -> str:
+ """
+ Calculates hash of this class taking into account hash of starting step
+ and hashes of each following steps. Ordering is important.
+ """
+ hasher = hashlib.sha256()
+ if self.starting_step:
+ hasher.update(self.starting_step.hash().encode("utf-8"))
+ else:
+ assert self.list_ds_name
+ hasher.update(self.list_ds_name.encode("utf-8"))

- self.catalog.pull_dataset(
- remote_ds_uri=remote_ds_uri,
- local_ds_name=name,
- local_ds_version=version,
- )
+ for step in self.steps:
+ hasher.update(step.hash().encode("utf-8"))

- return self.catalog.get_dataset(name)
+ return hasher.hexdigest()

  @staticmethod
  def get_table() -> "TableClause":
- table_name = "".join(
- random.choice(string.ascii_letters) # noqa: S311
- for _ in range(16)
- )
+ table_name = "".join(secrets.choice(string.ascii_letters) for _ in range(16))
  return sqlalchemy.table(table_name)

- @staticmethod
- def delete(
- name: str, version: Optional[int] = None, catalog: Optional["Catalog"] = None
- ) -> None:
- from datachain.catalog import get_catalog
-
- catalog = catalog or get_catalog()
- version = version or catalog.get_dataset(name).latest_version
- catalog.remove_dataset(name, version)
-
  @property
  def attached(self) -> bool:
  """
@@ -1180,14 +1379,14 @@ class DatasetQuery:
  it completely. If this is the case, name and version of underlying dataset
  will be defined.
  DatasetQuery instance can become attached in two scenarios:
- 1. ds = DatasetQuery(name="dogs", version=1) -> ds is attached to dogs
- 2. ds = ds.save("dogs", version=1) -> ds is attached to dogs dataset
+ 1. ds = DatasetQuery(name="dogs", version="1.0.0") -> ds is attached to dogs
+ 2. ds = ds.save("dogs", version="1.0.0") -> ds is attached to dogs dataset
  It can move to detached state if filter or similar methods are called on it,
  as then it no longer 100% represents underlying datasets.
  """
  return self.name is not None and self.version is not None

- def c(self, column: Union[C, str]) -> "ColumnClause[Any]":
+ def c(self, column: C | str) -> "ColumnClause[Any]":
  col: sqlalchemy.ColumnClause = (
  sqlalchemy.column(column)
  if isinstance(column, str)
@@ -1200,11 +1399,8 @@ class DatasetQuery:
  """Setting listing function to be run if needed"""
  self.listing_fn = fn

- def apply_steps(self) -> QueryGenerator:
- """
- Apply the steps in the query and return the resulting
- sqlalchemy.SelectBase.
- """
+ def apply_listing_pre_step(self) -> None:
+ """Runs listing pre-step if needed"""
  if self.list_ds_name and not self.starting_step:
  listing_ds = None
  try:
@@ -1220,6 +1416,13 @@ class DatasetQuery:
  # at this point we know what is our starting listing dataset name
  self._set_starting_step(listing_ds) # type: ignore [arg-type]

+ def apply_steps(self) -> QueryGenerator:
+ """
+ Apply the steps in the query and return the resulting
+ sqlalchemy.SelectBase.
+ """
+ self.apply_listing_pre_step()
+
  query = self.clone()

  index = os.getenv("DATACHAIN_QUERY_CHUNK_INDEX", self._chunk_index)
@@ -1278,6 +1481,7 @@ class DatasetQuery:
  # This is needed to always use a new connection with all metastore and warehouse
  # implementations, as errors may close or render unusable the existing
  # connections.
+ assert len(self.temp_table_names) == len(set(self.temp_table_names))
  with self.catalog.metastore.clone(use_new_connection=True) as metastore:
  metastore.cleanup_tables(self.temp_table_names)
  with self.catalog.warehouse.clone(use_new_connection=True) as warehouse:
@@ -1292,7 +1496,7 @@ class DatasetQuery:
  return list(result)

  def to_db_records(self) -> list[dict[str, Any]]:
- return self.db_results(lambda cols, row: dict(zip(cols, row)))
+ return self.db_results(lambda cols, row: dict(zip(cols, row, strict=False)))

  @contextlib.contextmanager
  def as_iterable(self, **kwargs) -> Iterator[ResultIter]:
@@ -1331,8 +1535,8 @@ class DatasetQuery:
  yield from rows

  async def get_params(row: Sequence) -> tuple:
- row_dict = RowDict(zip(query_fields, row))
- return tuple(
+ row_dict = RowDict(zip(query_fields, row, strict=False))
+ return tuple( # noqa: C409
  [
  await p.get_value_async(
  self.catalog, row_dict, mapper, **kwargs
@@ -1348,10 +1552,6 @@ class DatasetQuery:
  finally:
  self.cleanup()

- def shuffle(self) -> "Self":
- # ToDo: implement shaffle based on seed and/or generating random column
- return self.order_by(C.sys__rand)
-
  def sample(self, n) -> "Self":
  """
  Return a random sample from the dataset.
@@ -1371,6 +1571,7 @@ class DatasetQuery:
  obj.steps = obj.steps.copy()
  if new_table:
  obj.table = self.get_table()
+ obj.temp_table_names = []
  return obj

  @detach
@@ -1441,7 +1642,7 @@ class DatasetQuery:
  return query

  @detach
- def mutate(self, *args, **kwargs) -> "Self":
+ def mutate(self, *args, new_schema, **kwargs) -> "Self":
  """
  Add new columns to this query.

@@ -1453,7 +1654,7 @@ class DatasetQuery:
  """
  query_args = [v.label(k) for k, v in dict(args, **kwargs).items()]
  query = self.clone()
- query.steps.append(SQLMutate((*query_args,)))
+ query.steps.append(SQLMutate((*query_args,), new_schema))
  return query

  @detach
@@ -1551,10 +1752,10 @@ class DatasetQuery:
  def join(
  self,
  dataset_query: "DatasetQuery",
- predicates: Union[JoinPredicateType, Sequence[JoinPredicateType]],
+ predicates: JoinPredicateType | Sequence[JoinPredicateType],
  inner=False,
  full=False,
- rname="{name}_right",
+ rname="right_",
  ) -> "Self":
  left = self.clone(new_table=False)
  if self.table.name == dataset_query.table.name:
@@ -1593,11 +1794,17 @@ class DatasetQuery:
  def add_signals(
  self,
  udf: "UDFAdapter",
- parallel: Optional[int] = None,
- workers: Union[bool, int] = False,
- min_task_size: Optional[int] = None,
- partition_by: Optional[PartitionByType] = None,
+ partition_by: PartitionByType | None = None,
+ # Parameters from Settings
  cache: bool = False,
+ parallel: int | None = None,
+ workers: bool | int = False,
+ min_task_size: int | None = None,
+ batch_size: int | None = None,
+ # Parameters are unused, kept only to match the signature of Settings.to_dict
+ prefetch: int | None = None,
+ namespace: str | None = None,
+ project: str | None = None,
  ) -> "Self":
  """
  Adds one or more signals based on the results from the provided UDF.
@@ -1623,6 +1830,7 @@ class DatasetQuery:
  workers=workers,
  min_task_size=min_task_size,
  cache=cache,
+ batch_size=batch_size,
  )
  )
  return query
@@ -1637,11 +1845,17 @@ class DatasetQuery:
  def generate(
  self,
  udf: "UDFAdapter",
- parallel: Optional[int] = None,
- workers: Union[bool, int] = False,
- min_task_size: Optional[int] = None,
- partition_by: Optional[PartitionByType] = None,
+ partition_by: PartitionByType | None = None,
+ # Parameters from Settings
  cache: bool = False,
+ parallel: int | None = None,
+ workers: bool | int = False,
+ min_task_size: int | None = None,
+ batch_size: int | None = None,
+ # Parameters are unused, kept only to match the signature of Settings.to_dict:
+ prefetch: int | None = None,
+ namespace: str | None = None,
+ project: str | None = None,
  ) -> "Self":
  query = self.clone()
  steps = query.steps
@@ -1654,41 +1868,84 @@ class DatasetQuery:
  workers=workers,
  min_task_size=min_task_size,
  cache=cache,
+ batch_size=batch_size,
  )
  )
  return query

- def _add_dependencies(self, dataset: "DatasetRecord", version: int):
- for dependency in self.dependencies:
- ds_dependency_name, ds_dependency_version = dependency
+ def _add_dependencies(self, dataset: "DatasetRecord", version: str):
+ dependencies: set[DatasetDependencyType] = set()
+ for dep_dataset, dep_dataset_version in self.dependencies:
+ if Session.is_temp_dataset(dep_dataset.name):
+ # temp dataset are created for optimization and they will be removed
+ # afterwards. Therefore, we should not put them as dependencies, but
+ # their own direct dependencies
+ for dep in self.catalog.get_dataset_dependencies(
+ dep_dataset.name,
+ dep_dataset_version,
+ namespace_name=dep_dataset.project.namespace.name,
+ project_name=dep_dataset.project.name,
+ indirect=False,
+ ):
+ if dep:
+ dependencies.add(
+ (
+ self.catalog.get_dataset(
+ dep.name,
+ namespace_name=dep.namespace,
+ project_name=dep.project,
+ ),
+ dep.version,
+ )
+ )
+ else:
+ dependencies.add((dep_dataset, dep_dataset_version))
+
+ for dep_dataset, dep_dataset_version in dependencies:
  self.catalog.metastore.add_dataset_dependency(
- dataset.name,
+ dataset,
  version,
- ds_dependency_name,
- ds_dependency_version,
+ dep_dataset,
+ dep_dataset_version,
  )

  def exec(self) -> "Self":
  """Execute the query."""
+ query = self.clone()
  try:
- query = self.clone()
  query.apply_steps()
  finally:
- self.cleanup()
+ query.cleanup()
  return query

  def save(
  self,
- name: Optional[str] = None,
- version: Optional[int] = None,
- feature_schema: Optional[dict] = None,
- description: Optional[str] = None,
- labels: Optional[list[str]] = None,
+ name: str | None = None,
+ version: str | None = None,
+ project: Project | None = None,
+ feature_schema: dict | None = None,
+ dependencies: list[DatasetDependency] | None = None,
+ description: str | None = None,
+ attrs: list[str] | None = None,
+ update_version: str | None = "patch",
  **kwargs,
  ) -> "Self":
  """Save the query as a dataset."""
+ # Get job from session to link dataset version to job
+ job = self.session.get_or_create_job()
+ job_id = job.id
+
+ project = project or self.catalog.metastore.default_project
  try:
- if name and version and self.catalog.get_dataset(name).has_version(version):
+ if (
+ name
+ and version
+ and self.catalog.get_dataset(
+ name,
+ namespace_name=project.namespace.name,
+ project_name=project.name,
+ ).has_version(version)
+ ):
  raise RuntimeError(f"Dataset {name} already has version {version}")
  except DatasetNotFoundError:
  pass
@@ -1713,19 +1970,18 @@ class DatasetQuery:

  dataset = self.catalog.create_dataset(
  name,
+ project,
  version=version,
  feature_schema=feature_schema,
  columns=columns,
  description=description,
- labels=labels,
+ attrs=attrs,
+ update_version=update_version,
+ job_id=job_id,
  **kwargs,
  )
  version = version or dataset.latest_version

- self.session.add_dataset_version(
- dataset=dataset, version=version, listing=kwargs.get("listing", False)
- )
-
  dr = self.catalog.warehouse.dataset_rows(dataset)

  self.catalog.warehouse.copy_table(dr.get_table(), query.select())
@@ -1735,15 +1991,41 @@ class DatasetQuery:
  )
  self.catalog.update_dataset_version_with_warehouse_info(dataset, version)

+ # Link this dataset version to the job that created it
+ self.catalog.metastore.link_dataset_version_to_job(
+ dataset.get_version(version).id, job_id, is_creator=True
+ )
+
+ if dependencies:
+ # overriding dependencies
+ self.dependencies = set()
+ for dep in dependencies:
+ self.dependencies.add(
+ (
+ self.catalog.get_dataset(
+ dep.name,
+ namespace_name=dep.namespace,
+ project_name=dep.project,
+ ),
+ dep.version,
+ )
+ )
+
  self._add_dependencies(dataset, version) # type: ignore [arg-type]
  finally:
  self.cleanup()
- return self.__class__(name=name, version=version, catalog=self.catalog)
+ return self.__class__(
+ name=name,
+ namespace_name=project.namespace.name,
+ project_name=project.name,
+ version=version,
+ catalog=self.catalog,
+ )

  @property
  def is_ordered(self) -> bool:
  return isinstance(self.last_step, SQLOrderBy)

  @property
- def last_step(self) -> Optional[Step]:
+ def last_step(self) -> Step | None:
  return self.steps[-1] if self.steps else None
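
A recurring theme in the hunks above (which appear to come from datachain/query/dataset.py) is per-step hashing: every Step gains hash_inputs()/hash(), QueryStep hashes the dataset URI, and DatasetQuery.hash() chains the starting step's digest with each subsequent step's digest in order. The following is a minimal, self-contained sketch of that chaining pattern only; ExampleStep, ExampleQuery, and the dataset URI are hypothetical stand-ins and not datachain's actual API.

import hashlib
from dataclasses import dataclass, field


def sha256_hex(data: bytes) -> str:
    return hashlib.sha256(data).hexdigest()


@dataclass(frozen=True)
class ExampleStep:
    """Hypothetical step: digest combines the class name with the step's inputs."""

    expression: str

    def hash_inputs(self) -> str:
        return sha256_hex(self.expression.encode())

    def hash(self) -> str:
        # Same shape as Step.hash() in the diff: "<class name>|<hash of inputs>"
        return sha256_hex(f"{type(self).__name__}|{self.hash_inputs()}".encode())


@dataclass
class ExampleQuery:
    """Hypothetical query: identity = starting dataset URI + ordered step digests."""

    starting_uri: str
    steps: list[ExampleStep] = field(default_factory=list)

    def hash(self) -> str:
        hasher = hashlib.sha256()
        hasher.update(sha256_hex(self.starting_uri.encode()).encode())
        for step in self.steps:  # ordering matters, as in DatasetQuery.hash()
            hasher.update(step.hash().encode())
        return hasher.hexdigest()


if __name__ == "__main__":
    q1 = ExampleQuery("ds://animals@v1.0.0", [ExampleStep("size > 100")])
    q2 = ExampleQuery("ds://animals@v1.0.0", [ExampleStep("size > 100")])
    q3 = ExampleQuery("ds://animals@v1.0.0", [ExampleStep("size > 200")])
    assert q1.hash() == q2.hash()  # identical inputs -> identical digest
    assert q1.hash() != q3.hash()  # different step inputs -> different digest

Because equal inputs produce equal digests, a chain of steps can be matched against previously computed results; the checkpoint.py and delta.py modules added in this release appear to build on this property.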