datachain 0.30.5__py3-none-any.whl → 0.39.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. datachain/__init__.py +4 -0
  2. datachain/asyn.py +11 -12
  3. datachain/cache.py +5 -5
  4. datachain/catalog/__init__.py +0 -2
  5. datachain/catalog/catalog.py +276 -354
  6. datachain/catalog/dependency.py +164 -0
  7. datachain/catalog/loader.py +8 -3
  8. datachain/checkpoint.py +43 -0
  9. datachain/cli/__init__.py +10 -17
  10. datachain/cli/commands/__init__.py +1 -8
  11. datachain/cli/commands/datasets.py +42 -27
  12. datachain/cli/commands/ls.py +15 -15
  13. datachain/cli/commands/show.py +2 -2
  14. datachain/cli/parser/__init__.py +3 -43
  15. datachain/cli/parser/job.py +1 -1
  16. datachain/cli/parser/utils.py +1 -2
  17. datachain/cli/utils.py +2 -15
  18. datachain/client/azure.py +2 -2
  19. datachain/client/fsspec.py +34 -23
  20. datachain/client/gcs.py +3 -3
  21. datachain/client/http.py +157 -0
  22. datachain/client/local.py +11 -7
  23. datachain/client/s3.py +3 -3
  24. datachain/config.py +4 -8
  25. datachain/data_storage/db_engine.py +12 -6
  26. datachain/data_storage/job.py +2 -0
  27. datachain/data_storage/metastore.py +716 -137
  28. datachain/data_storage/schema.py +20 -27
  29. datachain/data_storage/serializer.py +105 -15
  30. datachain/data_storage/sqlite.py +114 -114
  31. datachain/data_storage/warehouse.py +140 -48
  32. datachain/dataset.py +109 -89
  33. datachain/delta.py +117 -42
  34. datachain/diff/__init__.py +25 -33
  35. datachain/error.py +24 -0
  36. datachain/func/aggregate.py +9 -11
  37. datachain/func/array.py +12 -12
  38. datachain/func/base.py +7 -4
  39. datachain/func/conditional.py +9 -13
  40. datachain/func/func.py +63 -45
  41. datachain/func/numeric.py +5 -7
  42. datachain/func/string.py +2 -2
  43. datachain/hash_utils.py +123 -0
  44. datachain/job.py +11 -7
  45. datachain/json.py +138 -0
  46. datachain/lib/arrow.py +18 -15
  47. datachain/lib/audio.py +60 -59
  48. datachain/lib/clip.py +14 -13
  49. datachain/lib/convert/python_to_sql.py +6 -10
  50. datachain/lib/convert/values_to_tuples.py +151 -53
  51. datachain/lib/data_model.py +23 -19
  52. datachain/lib/dataset_info.py +7 -7
  53. datachain/lib/dc/__init__.py +2 -1
  54. datachain/lib/dc/csv.py +22 -26
  55. datachain/lib/dc/database.py +37 -34
  56. datachain/lib/dc/datachain.py +518 -324
  57. datachain/lib/dc/datasets.py +38 -30
  58. datachain/lib/dc/hf.py +16 -20
  59. datachain/lib/dc/json.py +17 -18
  60. datachain/lib/dc/listings.py +5 -8
  61. datachain/lib/dc/pandas.py +3 -6
  62. datachain/lib/dc/parquet.py +33 -21
  63. datachain/lib/dc/records.py +9 -13
  64. datachain/lib/dc/storage.py +103 -65
  65. datachain/lib/dc/storage_pattern.py +251 -0
  66. datachain/lib/dc/utils.py +17 -14
  67. datachain/lib/dc/values.py +3 -6
  68. datachain/lib/file.py +187 -50
  69. datachain/lib/hf.py +7 -5
  70. datachain/lib/image.py +13 -13
  71. datachain/lib/listing.py +5 -5
  72. datachain/lib/listing_info.py +1 -2
  73. datachain/lib/meta_formats.py +2 -3
  74. datachain/lib/model_store.py +20 -8
  75. datachain/lib/namespaces.py +59 -7
  76. datachain/lib/projects.py +51 -9
  77. datachain/lib/pytorch.py +31 -23
  78. datachain/lib/settings.py +188 -85
  79. datachain/lib/signal_schema.py +302 -64
  80. datachain/lib/text.py +8 -7
  81. datachain/lib/udf.py +103 -63
  82. datachain/lib/udf_signature.py +59 -34
  83. datachain/lib/utils.py +20 -0
  84. datachain/lib/video.py +3 -4
  85. datachain/lib/webdataset.py +31 -36
  86. datachain/lib/webdataset_laion.py +15 -16
  87. datachain/listing.py +12 -5
  88. datachain/model/bbox.py +3 -1
  89. datachain/namespace.py +22 -3
  90. datachain/node.py +6 -6
  91. datachain/nodes_thread_pool.py +0 -1
  92. datachain/plugins.py +24 -0
  93. datachain/project.py +4 -4
  94. datachain/query/batch.py +10 -12
  95. datachain/query/dataset.py +376 -194
  96. datachain/query/dispatch.py +112 -84
  97. datachain/query/metrics.py +3 -4
  98. datachain/query/params.py +2 -3
  99. datachain/query/queue.py +2 -1
  100. datachain/query/schema.py +7 -6
  101. datachain/query/session.py +190 -33
  102. datachain/query/udf.py +9 -6
  103. datachain/remote/studio.py +90 -53
  104. datachain/script_meta.py +12 -12
  105. datachain/sql/sqlite/base.py +37 -25
  106. datachain/sql/sqlite/types.py +1 -1
  107. datachain/sql/types.py +36 -5
  108. datachain/studio.py +49 -40
  109. datachain/toolkit/split.py +31 -10
  110. datachain/utils.py +39 -48
  111. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/METADATA +26 -38
  112. datachain-0.39.0.dist-info/RECORD +173 -0
  113. datachain/cli/commands/query.py +0 -54
  114. datachain/query/utils.py +0 -36
  115. datachain-0.30.5.dist-info/RECORD +0 -168
  116. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/WHEEL +0 -0
  117. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
  118. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
  119. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
@@ -1,37 +1,40 @@
  import copy
+ import hashlib
+ import logging
  import os
  import os.path
  import sys
  import warnings
- from collections.abc import Iterator, Sequence
+ from collections.abc import Callable, Iterator, Sequence
  from typing import (
  IO,
  TYPE_CHECKING,
  Any,
  BinaryIO,
- Callable,
  ClassVar,
  Literal,
- Optional,
  TypeVar,
- Union,
  cast,
  overload,
  )

  import sqlalchemy
- import ujson as json
  from pydantic import BaseModel
  from sqlalchemy.sql.elements import ColumnElement
  from tqdm import tqdm

- from datachain import semver
+ from datachain import json, semver
  from datachain.dataset import DatasetRecord
  from datachain.delta import delta_disabled
- from datachain.error import ProjectCreateNotAllowedError, ProjectNotFoundError
+ from datachain.error import (
+ JobAncestryDepthExceededError,
+ ProjectCreateNotAllowedError,
+ ProjectNotFoundError,
+ )
  from datachain.func import literal
  from datachain.func.base import Function
  from datachain.func.func import Func
+ from datachain.job import Job
  from datachain.lib.convert.python_to_sql import python_to_sql
  from datachain.lib.data_model import (
  DataModel,
@@ -40,11 +43,7 @@ from datachain.lib.data_model import (
  StandardType,
  dict_to_data_model,
  )
- from datachain.lib.file import (
- EXPORT_FILES_MAX_THREADS,
- ArrowRow,
- FileExporter,
- )
+ from datachain.lib.file import EXPORT_FILES_MAX_THREADS, ArrowRow, File, FileExporter
  from datachain.lib.file import ExportPlacement as FileExportPlacement
  from datachain.lib.model_store import ModelStore
  from datachain.lib.settings import Settings
@@ -52,11 +51,17 @@ from datachain.lib.signal_schema import SignalResolvingError, SignalSchema
  from datachain.lib.udf import Aggregator, BatchMapper, Generator, Mapper, UDFBase
  from datachain.lib.udf_signature import UdfSignature
  from datachain.lib.utils import DataChainColumnError, DataChainParamsError
+ from datachain.project import Project
  from datachain.query import Session
- from datachain.query.dataset import DatasetQuery, PartitionByType
+ from datachain.query.dataset import (
+ DatasetQuery,
+ PartitionByType,
+ RegenerateSystemColumns,
+ UnionSchemaMismatchError,
+ )
  from datachain.query.schema import DEFAULT_DELIMITER, Column
  from datachain.sql.functions import path as pathfunc
- from datachain.utils import batched_it, inside_notebook, row_to_nested_dict
+ from datachain.utils import batched_it, env2bool, inside_notebook, row_to_nested_dict

  from .database import DEFAULT_DATABASE_BATCH_SIZE
  from .utils import (
@@ -71,6 +76,8 @@ from .utils import (
  resolve_columns,
  )

+ logger = logging.getLogger("datachain")
+
  C = Column

  _T = TypeVar("_T")
@@ -82,19 +89,20 @@ if TYPE_CHECKING:
  import sqlite3

  import pandas as pd
+ from sqlalchemy.orm import Session as OrmSession
  from typing_extensions import ParamSpec, Self

  P = ParamSpec("P")

- ConnectionType = Union[
- str,
- sqlalchemy.engine.URL,
- sqlalchemy.engine.interfaces.Connectable,
- sqlalchemy.engine.Engine,
- sqlalchemy.engine.Connection,
- "sqlalchemy.orm.Session",
- sqlite3.Connection,
- ]
+ ConnectionType = (
+ str
+ | sqlalchemy.engine.URL
+ | sqlalchemy.engine.interfaces.Connectable
+ | sqlalchemy.engine.Engine
+ | sqlalchemy.engine.Connection
+ | OrmSession
+ | sqlite3.Connection
+ )


  T = TypeVar("T", bound="DataChain")
@@ -183,7 +191,7 @@ class DataChain:
  query: DatasetQuery,
  settings: Settings,
  signal_schema: SignalSchema,
- setup: Optional[dict] = None,
+ setup: dict | None = None,
  _sys: bool = False,
  ) -> None:
  """Don't instantiate this directly, use one of the from_XXX constructors."""
@@ -193,10 +201,11 @@ class DataChain:
  self._setup: dict = setup or {}
  self._sys = _sys
  self._delta = False
- self._delta_on: Optional[Union[str, Sequence[str]]] = None
- self._delta_result_on: Optional[Union[str, Sequence[str]]] = None
- self._delta_compare: Optional[Union[str, Sequence[str]]] = None
- self._delta_retry: Optional[Union[bool, str]] = None
+ self._delta_unsafe = False
+ self._delta_on: str | Sequence[str] | None = None
+ self._delta_result_on: str | Sequence[str] | None = None
+ self._delta_compare: str | Sequence[str] | None = None
+ self._delta_retry: bool | str | None = None

  def __repr__(self) -> str:
  """Return a string representation of the chain."""
@@ -210,12 +219,21 @@ class DataChain:
  self.print_schema(file=file)
  return file.getvalue()

+ def hash(self) -> str:
+ """
+ Calculates SHA hash of this chain. Hash calculation is fast and consistent.
+ It takes into account all the steps added to the chain and their inputs.
+ Order of the steps is important.
+ """
+ return self._query.hash()
+
  def _as_delta(
  self,
- on: Optional[Union[str, Sequence[str]]] = None,
- right_on: Optional[Union[str, Sequence[str]]] = None,
- compare: Optional[Union[str, Sequence[str]]] = None,
- delta_retry: Optional[Union[bool, str]] = None,
+ on: str | Sequence[str] | None = None,
+ right_on: str | Sequence[str] | None = None,
+ compare: str | Sequence[str] | None = None,
+ delta_retry: bool | str | None = None,
+ delta_unsafe: bool = False,
  ) -> "Self":
  """Marks this chain as delta, which means special delta process will be
  called on saving dataset for optimization"""
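A minimal sketch of the new `hash()` method added in the hunk above; the input values and the lambda are illustrative, and `read_values` is assumed to be available at the package top level as in current releases:

```py
import datachain as dc

chain = dc.read_values(num=[1, 2, 3]).map(square=lambda num: num * num, output=int)

# Same steps with the same inputs yield the same digest; reordering or editing
# a step changes it.
print(chain.hash())
```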
@@ -226,6 +244,7 @@ class DataChain:
226
244
  self._delta_result_on = right_on
227
245
  self._delta_compare = compare
228
246
  self._delta_retry = delta_retry
247
+ self._delta_unsafe = delta_unsafe
229
248
  return self
230
249
 
231
250
  @property
@@ -238,6 +257,10 @@ class DataChain:
238
257
  """Returns True if this chain is ran in "delta" update mode"""
239
258
  return self._delta
240
259
 
260
+ @property
261
+ def delta_unsafe(self) -> bool:
262
+ return self._delta_unsafe
263
+
241
264
  @property
242
265
  def schema(self) -> dict[str, DataType]:
243
266
  """Get schema of the chain."""
@@ -259,7 +282,7 @@ class DataChain:
259
282
 
260
283
  raise ValueError(f"Column with name {name} not found in the schema")
261
284
 
262
- def c(self, column: Union[str, Column]) -> Column:
285
+ def c(self, column: str | Column) -> Column:
263
286
  """Returns Column instance attached to the current chain."""
264
287
  c = self.column(column) if isinstance(column, str) else self.column(column.name)
265
288
  c.table = self._query.table
@@ -271,17 +294,17 @@ class DataChain:
271
294
  return self._query.session
272
295
 
273
296
  @property
274
- def name(self) -> Optional[str]:
297
+ def name(self) -> str | None:
275
298
  """Name of the underlying dataset, if there is one."""
276
299
  return self._query.name
277
300
 
278
301
  @property
279
- def version(self) -> Optional[str]:
302
+ def version(self) -> str | None:
280
303
  """Version of the underlying dataset, if there is one."""
281
304
  return self._query.version
282
305
 
283
306
  @property
284
- def dataset(self) -> Optional[DatasetRecord]:
307
+ def dataset(self) -> DatasetRecord | None:
285
308
  """Underlying dataset, if there is one."""
286
309
  if not self.name:
287
310
  return None
@@ -295,7 +318,7 @@ class DataChain:
295
318
  """Return `self.union(other)`."""
296
319
  return self.union(other)
297
320
 
298
- def print_schema(self, file: Optional[IO] = None) -> None:
321
+ def print_schema(self, file: IO | None = None) -> None:
299
322
  """Print schema of the chain."""
300
323
  self._effective_signals_schema.print_tree(file=file)
301
324
 
@@ -306,8 +329,8 @@ class DataChain:
306
329
  def _evolve(
307
330
  self,
308
331
  *,
309
- query: Optional[DatasetQuery] = None,
310
- settings: Optional[Settings] = None,
332
+ query: DatasetQuery | None = None,
333
+ settings: Settings | None = None,
311
334
  signal_schema=None,
312
335
  _sys=None,
313
336
  ) -> "Self":
@@ -328,46 +351,51 @@ class DataChain:
  right_on=self._delta_result_on,
  compare=self._delta_compare,
  delta_retry=self._delta_retry,
+ delta_unsafe=self._delta_unsafe,
  )

  return chain

  def settings(
  self,
- cache=None,
- parallel=None,
- workers=None,
- min_task_size=None,
- prefetch: Optional[int] = None,
- sys: Optional[bool] = None,
- namespace: Optional[str] = None,
- project: Optional[str] = None,
- batch_rows: Optional[int] = None,
+ cache: bool | None = None,
+ prefetch: bool | int | None = None,
+ parallel: bool | int | None = None,
+ workers: int | None = None,
+ namespace: str | None = None,
+ project: str | None = None,
+ min_task_size: int | None = None,
+ batch_size: int | None = None,
+ sys: bool | None = None,
  ) -> "Self":
- """Change settings for chain.
-
- This function changes specified settings without changing not specified ones.
- It returns chain, so, it can be chained later with next operation.
+ """
+ Set chain execution parameters. Returns the chain itself, allowing method
+ chaining for subsequent operations. To restore all settings to their default
+ values, use `reset_settings()`.

  Parameters:
- cache : data caching. (default=False)
- parallel : number of thread for processors. True is a special value to
- enable all available CPUs. (default=1)
- workers : number of distributed workers. Only for Studio mode. (default=1)
- min_task_size : minimum number of tasks. (default=1)
- prefetch : number of workers to use for downloading files in advance.
- This is enabled by default and uses 2 workers.
- To disable prefetching, set it to 0.
- namespace : namespace name.
- project : project name.
- batch_rows : row limit per insert to balance speed and memory usage.
- (default=2000)
+ cache: Enable files caching to speed up subsequent accesses to the same
+ files from the same or different chains. Defaults to False.
+ prefetch: Enable prefetching of files. This will download files in
+ advance in parallel. If an integer is provided, it specifies the number
+ of files to prefetch concurrently for each process on each worker.
+ Defaults to 2. Set to 0 or False to disable prefetching.
+ parallel: Number of processes to use for processing user-defined functions
+ (UDFs) in parallel. If an integer is provided, it specifies the number
+ of CPUs to use. If True, all available CPUs are used. Defaults to 1.
+ namespace: Namespace to use for the chain by default.
+ project: Project to use for the chain by default.
+ min_task_size: Minimum number of rows per worker/process for parallel
+ processing by UDFs. Defaults to 1.
+ batch_size: Number of rows per insert by UDF to fine tune and balance speed
+ and memory usage. This might be useful when processing large rows
+ or when running into memory issues. Defaults to 2000.

  Example:
  ```py
  chain = (
  chain
- .settings(cache=True, parallel=8, batch_rows=300)
+ .settings(cache=True, parallel=8, batch_size=300)
  .map(laion=process_webdataset(spec=WDSLaion), params="file")
  )
  ```
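The hunk above renames the `batch_rows` keyword of `settings()` to `batch_size` and tightens the other annotations. A migration sketch (the storage URI is a placeholder):

```py
import datachain as dc

chain = dc.read_storage("s3://my-bucket/images/")  # placeholder URI

# 0.30.x: chain.settings(cache=True, parallel=8, batch_rows=300)
# 0.39.x:
chain = chain.settings(cache=True, parallel=8, batch_size=300)
```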
@@ -377,20 +405,20 @@ class DataChain:
377
405
  settings = copy.copy(self._settings)
378
406
  settings.add(
379
407
  Settings(
380
- cache,
381
- parallel,
382
- workers,
383
- min_task_size,
384
- prefetch,
385
- namespace,
386
- project,
387
- batch_rows,
408
+ cache=cache,
409
+ prefetch=prefetch,
410
+ parallel=parallel,
411
+ workers=workers,
412
+ namespace=namespace,
413
+ project=project,
414
+ min_task_size=min_task_size,
415
+ batch_size=batch_size,
388
416
  )
389
417
  )
390
418
  return self._evolve(settings=settings, _sys=sys)
391
419
 
392
- def reset_settings(self, settings: Optional[Settings] = None) -> "Self":
393
- """Reset all settings to default values."""
420
+ def reset_settings(self, settings: Settings | None = None) -> "Self":
421
+ """Reset all chain settings to default values."""
394
422
  self._settings = settings if settings else Settings()
395
423
  return self
396
424
 
@@ -441,8 +469,8 @@ class DataChain:
441
469
  def explode(
442
470
  self,
443
471
  col: str,
444
- model_name: Optional[str] = None,
445
- column: Optional[str] = None,
472
+ model_name: str | None = None,
473
+ column: str | None = None,
446
474
  schema_sample_size: int = 1,
447
475
  ) -> "DataChain":
448
476
  """Explodes a column containing JSON objects (dict or str DataChain type) into
@@ -483,7 +511,7 @@ class DataChain:
483
511
 
484
512
  model = dict_to_data_model(model_name, output, original_names)
485
513
 
486
- def json_to_model(json_value: Union[str, dict]):
514
+ def json_to_model(json_value: str | dict):
487
515
  json_dict = (
488
516
  json.loads(json_value) if isinstance(json_value, str) else json_value
489
517
  )
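A small sketch of `explode()` based on the signature shown above; the JSON payloads and the generated model/column names are illustrative only:

```py
import datachain as dc

chain = dc.read_values(
    meta=['{"width": 640, "height": 480}', '{"width": 800, "height": 600}']
)

# Parses the JSON strings in `meta` and adds a typed `size` column backed by a
# generated `Size` data model inferred from the first two rows.
exploded = chain.explode("meta", model_name="Size", column="size", schema_sample_size=2)
```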
@@ -557,116 +585,258 @@ class DataChain:
557
585
  create=True,
558
586
  )
559
587
  return self._evolve(
560
- query=self._query.save(project=project, feature_schema=schema)
588
+ query=self._query.save(project=project, feature_schema=schema),
589
+ signal_schema=self.signals_schema | SignalSchema({"sys": Sys}),
561
590
  )
562
591
 
592
+ def _calculate_job_hash(self, job_id: str) -> str:
593
+ """
594
+ Calculates hash of the job at the place of this chain's save method.
595
+ Hash is calculated using previous job checkpoint hash (if exists) and
596
+ adding hash of this chain to produce new hash.
597
+ """
598
+ last_checkpoint = self.session.catalog.metastore.get_last_checkpoint(job_id)
599
+
600
+ return hashlib.sha256(
601
+ (bytes.fromhex(last_checkpoint.hash) if last_checkpoint else b"")
602
+ + bytes.fromhex(self.hash())
603
+ ).hexdigest()
604
+
563
605
  def save( # type: ignore[override]
564
606
  self,
565
607
  name: str,
566
- version: Optional[str] = None,
567
- description: Optional[str] = None,
568
- attrs: Optional[list[str]] = None,
569
- update_version: Optional[str] = "patch",
608
+ version: str | None = None,
609
+ description: str | None = None,
610
+ attrs: list[str] | None = None,
611
+ update_version: str | None = "patch",
570
612
  **kwargs,
571
613
  ) -> "DataChain":
572
614
  """Save to a Dataset. It returns the chain itself.
573
615
 
574
616
  Parameters:
575
- name : dataset name. It can be full name consisting of namespace and
576
- project, but it can also be just a regular dataset name in which
577
- case we are taking namespace and project from settings, if they
578
- are defined there, or default ones instead.
579
- version : version of a dataset. If version is not specified and dataset
617
+ name: dataset name. This can be either a fully qualified name, including
618
+ the namespace and project, or just a regular dataset name. In the latter
619
+ case, the namespace and project will be taken from the settings
620
+ (if specified) or from the default values otherwise.
621
+ version: version of a dataset. If version is not specified and dataset
580
622
  already exists, version patch increment will happen e.g 1.2.1 -> 1.2.2.
581
- description : description of a dataset.
582
- attrs : attributes of a dataset. They can be without value, e.g "NLP",
623
+ description: description of a dataset.
624
+ attrs: attributes of a dataset. They can be without value, e.g "NLP",
583
625
  or with a value, e.g "location=US".
584
626
  update_version: which part of the dataset version to automatically increase.
585
627
  Available values: `major`, `minor` or `patch`. Default is `patch`.
586
628
  """
629
+
587
630
  catalog = self.session.catalog
588
- if version is not None:
589
- semver.validate(version)
590
631
 
591
- if update_version is not None and update_version not in [
592
- "patch",
593
- "major",
594
- "minor",
595
- ]:
596
- raise ValueError(
597
- "update_version can have one of the following values: major, minor or"
598
- " patch"
599
- )
632
+ result = None # result chain that will be returned at the end
633
+
634
+ # Version validation
635
+ self._validate_version(version)
636
+ self._validate_update_version(update_version)
637
+
638
+ # get existing job if running in SaaS, or creating new one if running locally
639
+ job = self.session.get_or_create_job()
600
640
 
601
641
  namespace_name, project_name, name = catalog.get_full_dataset_name(
602
642
  name,
603
643
  namespace_name=self._settings.namespace,
604
644
  project_name=self._settings.project,
605
645
  )
646
+ project = self._get_or_create_project(namespace_name, project_name)
647
+
648
+ # Checkpoint handling
649
+ _hash, result = self._resolve_checkpoint(name, project, job, kwargs)
650
+ if bool(result):
651
+ # Checkpoint was found and reused
652
+ print(f"Checkpoint found for dataset '{name}', skipping creation")
653
+
654
+ # Schema preparation
655
+ schema = self.signals_schema.clone_without_sys_signals().serialize()
656
+
657
+ # Handle retry and delta functionality
658
+ if not result:
659
+ result = self._handle_delta(name, version, project, schema, kwargs)
660
+
661
+ if not result:
662
+ # calculate chain if we already don't have result from checkpoint or delta
663
+ result = self._evolve(
664
+ query=self._query.save(
665
+ name=name,
666
+ version=version,
667
+ project=project,
668
+ description=description,
669
+ attrs=attrs,
670
+ feature_schema=schema,
671
+ update_version=update_version,
672
+ **kwargs,
673
+ )
674
+ )
606
675
 
676
+ catalog.metastore.create_checkpoint(job.id, _hash) # type: ignore[arg-type]
677
+ return result
678
+
679
+ def _validate_version(self, version: str | None) -> None:
680
+ """Validate dataset version if provided."""
681
+ if version is not None:
682
+ semver.validate(version)
683
+
684
+ def _validate_update_version(self, update_version: str | None) -> None:
685
+ """Ensure update_version is one of: major, minor, patch."""
686
+ allowed = ["major", "minor", "patch"]
687
+ if update_version not in allowed:
688
+ raise ValueError(f"update_version must be one of {allowed}")
689
+
690
+ def _get_or_create_project(self, namespace: str, project_name: str) -> Project:
691
+ """Get project or raise if creation not allowed."""
607
692
  try:
608
- project = self.session.catalog.metastore.get_project(
693
+ return self.session.catalog.metastore.get_project(
609
694
  project_name,
610
- namespace_name,
695
+ namespace,
611
696
  create=is_studio(),
612
697
  )
613
698
  except ProjectNotFoundError as e:
614
- # not being able to create it as creation is not allowed
615
699
  raise ProjectCreateNotAllowedError("Creating project is not allowed") from e
616
700
 
617
- schema = self.signals_schema.clone_without_sys_signals().serialize()
701
+ def _resolve_checkpoint(
702
+ self,
703
+ name: str,
704
+ project: Project,
705
+ job: Job,
706
+ kwargs: dict,
707
+ ) -> tuple[str, "DataChain | None"]:
708
+ """Check if checkpoint exists and return cached dataset if possible."""
709
+ from .datasets import read_dataset
618
710
 
619
- # Handle retry and delta functionality
620
- if self.delta and name:
621
- from datachain.delta import delta_retry_update
711
+ metastore = self.session.catalog.metastore
712
+ checkpoints_reset = env2bool("DATACHAIN_CHECKPOINTS_RESET", undefined=True)
622
713
 
623
- # Delta chains must have delta_on defined (ensured by _as_delta method)
624
- assert self._delta_on is not None, "Delta chain must have delta_on defined"
714
+ _hash = self._calculate_job_hash(job.id)
625
715
 
626
- result_ds, dependencies, has_changes = delta_retry_update(
627
- self,
628
- namespace_name,
629
- project_name,
716
+ if (
717
+ job.parent_job_id
718
+ and not checkpoints_reset
719
+ and metastore.find_checkpoint(job.parent_job_id, _hash)
720
+ ):
721
+ # checkpoint found → find which dataset version to reuse
722
+
723
+ # Find dataset version that was created by any ancestor job
724
+ try:
725
+ dataset_version = metastore.get_dataset_version_for_job_ancestry(
726
+ name,
727
+ project.namespace.name,
728
+ project.name,
729
+ job.id,
730
+ )
731
+ except JobAncestryDepthExceededError:
732
+ raise JobAncestryDepthExceededError(
733
+ "Job continuation chain is too deep. "
734
+ "Please run the job from scratch without continuing from a "
735
+ "parent job."
736
+ ) from None
737
+
738
+ if not dataset_version:
739
+ logger.debug(
740
+ "Checkpoint found but no dataset version for '%s' "
741
+ "in job ancestry (job_id=%s). Creating new version.",
742
+ name,
743
+ job.id,
744
+ )
745
+ # Dataset version not found (e.g deleted by user) - skip
746
+ # checkpoint and recreate
747
+ return _hash, None
748
+
749
+ logger.debug(
750
+ "Reusing dataset version '%s' v%s from job ancestry "
751
+ "(job_id=%s, dataset_version_id=%s)",
630
752
  name,
631
- on=self._delta_on,
632
- right_on=self._delta_result_on,
633
- compare=self._delta_compare,
634
- delta_retry=self._delta_retry,
753
+ dataset_version.version,
754
+ job.id,
755
+ dataset_version.id,
635
756
  )
636
757
 
637
- if result_ds:
638
- return self._evolve(
639
- query=result_ds._query.save(
640
- name=name,
641
- version=version,
642
- project=project,
643
- feature_schema=schema,
644
- dependencies=dependencies,
645
- **kwargs,
646
- )
647
- )
758
+ # Read the specific version from ancestry
759
+ chain = read_dataset(
760
+ name,
761
+ namespace=project.namespace.name,
762
+ project=project.name,
763
+ version=dataset_version.version,
764
+ **kwargs,
765
+ )
648
766
 
649
- if not has_changes:
650
- # sources have not been changed so new version of resulting dataset
651
- # would be the same as previous one. To avoid duplicating exact
652
- # datasets, we won't create new version of it and we will return
653
- # current latest version instead.
654
- from .datasets import read_dataset
767
+ # Link current job to this dataset version (not creator).
768
+ # This also updates dataset_version.job_id.
769
+ metastore.link_dataset_version_to_job(
770
+ dataset_version.id,
771
+ job.id,
772
+ is_creator=False,
773
+ )
655
774
 
656
- return read_dataset(name, **kwargs)
775
+ return _hash, chain
657
776
 
658
- return self._evolve(
659
- query=self._query.save(
660
- name=name,
661
- version=version,
662
- project=project,
663
- description=description,
664
- attrs=attrs,
665
- feature_schema=schema,
666
- update_version=update_version,
777
+ return _hash, None
778
+
779
+ def _handle_delta(
780
+ self,
781
+ name: str,
782
+ version: str | None,
783
+ project: Project,
784
+ schema: dict,
785
+ kwargs: dict,
786
+ ) -> "DataChain | None":
787
+ """Try to save as a delta dataset.
788
+ Returns:
789
+ A DataChain if delta logic could handle it, otherwise None to fall back
790
+ to the regular save path (e.g., on first dataset creation).
791
+ """
792
+ from datachain.delta import delta_retry_update
793
+
794
+ from .datasets import read_dataset
795
+
796
+ if not self.delta or not name:
797
+ return None
798
+
799
+ assert self._delta_on is not None, "Delta chain must have delta_on defined"
800
+
801
+ result_ds, dependencies, has_changes = delta_retry_update(
802
+ self,
803
+ project.namespace.name,
804
+ project.name,
805
+ name,
806
+ on=self._delta_on,
807
+ right_on=self._delta_result_on,
808
+ compare=self._delta_compare,
809
+ delta_retry=self._delta_retry,
810
+ )
811
+
812
+ # Case 1: delta produced a new dataset
813
+ if result_ds:
814
+ return self._evolve(
815
+ query=result_ds._query.save(
816
+ name=name,
817
+ version=version,
818
+ project=project,
819
+ feature_schema=schema,
820
+ dependencies=dependencies,
821
+ **kwargs,
822
+ )
823
+ )
824
+
825
+ # Case 2: no changes → reuse last version
826
+ if not has_changes:
827
+ # sources have not been changed so new version of resulting dataset
828
+ # would be the same as previous one. To avoid duplicating exact
829
+ # datasets, we won't create new version of it and we will return
830
+ # current latest version instead.
831
+ return read_dataset(
832
+ name,
833
+ namespace=project.namespace.name,
834
+ project=project.name,
667
835
  **kwargs,
668
836
  )
669
- )
837
+
838
+ # Case 3: first creation of dataset
839
+ return None
670
840
 
671
841
  def apply(self, func, *args, **kwargs):
672
842
  """Apply any function to the chain.
@@ -693,10 +863,10 @@ class DataChain:
693
863
 
694
864
  def map(
695
865
  self,
696
- func: Optional[Callable] = None,
697
- params: Union[None, str, Sequence[str]] = None,
866
+ func: Callable | None = None,
867
+ params: str | Sequence[str] | None = None,
698
868
  output: OutputType = None,
699
- **signal_map,
869
+ **signal_map: Any,
700
870
  ) -> "Self":
701
871
  """Apply a function to each row to create new signals. The function should
702
872
  return a new object for each row. It returns a chain itself with new signals.
@@ -704,17 +874,17 @@ class DataChain:
704
874
  Input-output relationship: 1:1
705
875
 
706
876
  Parameters:
707
- func : Function applied to each row.
708
- params : List of column names used as input for the function. Default
877
+ func: Function applied to each row.
878
+ params: List of column names used as input for the function. Default
709
879
  is taken from function signature.
710
- output : Dictionary defining new signals and their corresponding types.
880
+ output: Dictionary defining new signals and their corresponding types.
711
881
  Default type is taken from function signature. Default can be also
712
882
  taken from kwargs - **signal_map (see below).
713
883
  If signal name is defined using signal_map (see below) only a single
714
884
  type value can be used.
715
- **signal_map : kwargs can be used to define `func` together with it's return
885
+ **signal_map: kwargs can be used to define `func` together with its return
716
886
  signal name in format of `map(my_sign=my_func)`. This helps define
717
- signal names and function in a nicer way.
887
+ signal names and functions in a nicer way.
718
888
 
719
889
  Example:
720
890
  Using signal_map and single type in output:
@@ -735,18 +905,19 @@ class DataChain:
735
905
  if (prefetch := self._settings.prefetch) is not None:
736
906
  udf_obj.prefetch = prefetch
737
907
 
908
+ sys_schema = SignalSchema({"sys": Sys})
738
909
  return self._evolve(
739
910
  query=self._query.add_signals(
740
- udf_obj.to_udf_wrapper(self._settings.batch_rows),
911
+ udf_obj.to_udf_wrapper(self._settings.batch_size),
741
912
  **self._settings.to_dict(),
742
913
  ),
743
- signal_schema=self.signals_schema | udf_obj.output,
914
+ signal_schema=sys_schema | self.signals_schema | udf_obj.output,
744
915
  )
745
916
 
746
917
  def gen(
747
918
  self,
748
- func: Optional[Union[Callable, Generator]] = None,
749
- params: Union[None, str, Sequence[str]] = None,
919
+ func: Callable | Generator | None = None,
920
+ params: str | Sequence[str] | None = None,
750
921
  output: OutputType = None,
751
922
  **signal_map,
752
923
  ) -> "Self":
@@ -775,19 +946,19 @@ class DataChain:
775
946
  udf_obj.prefetch = prefetch
776
947
  return self._evolve(
777
948
  query=self._query.generate(
778
- udf_obj.to_udf_wrapper(self._settings.batch_rows),
949
+ udf_obj.to_udf_wrapper(self._settings.batch_size),
779
950
  **self._settings.to_dict(),
780
951
  ),
781
- signal_schema=udf_obj.output,
952
+ signal_schema=SignalSchema({"sys": Sys}) | udf_obj.output,
782
953
  )
783
954
 
784
955
  @delta_disabled
785
956
  def agg(
786
957
  self,
787
958
  /,
788
- func: Optional[Callable] = None,
789
- partition_by: Optional[PartitionByType] = None,
790
- params: Union[None, str, Sequence[str]] = None,
959
+ func: Callable | None = None,
960
+ partition_by: PartitionByType | None = None,
961
+ params: str | Sequence[str] | None = None,
791
962
  output: OutputType = None,
792
963
  **signal_map: Callable,
793
964
  ) -> "Self":
@@ -911,17 +1082,17 @@ class DataChain:
911
1082
  udf_obj = self._udf_to_obj(Aggregator, func, params, output, signal_map)
912
1083
  return self._evolve(
913
1084
  query=self._query.generate(
914
- udf_obj.to_udf_wrapper(self._settings.batch_rows),
1085
+ udf_obj.to_udf_wrapper(self._settings.batch_size),
915
1086
  partition_by=processed_partition_by,
916
1087
  **self._settings.to_dict(),
917
1088
  ),
918
- signal_schema=udf_obj.output,
1089
+ signal_schema=SignalSchema({"sys": Sys}) | udf_obj.output,
919
1090
  )
920
1091
 
921
1092
  def batch_map(
922
1093
  self,
923
- func: Optional[Callable] = None,
924
- params: Union[None, str, Sequence[str]] = None,
1094
+ func: Callable | None = None,
1095
+ params: str | Sequence[str] | None = None,
925
1096
  output: OutputType = None,
926
1097
  batch: int = 1000,
927
1098
  **signal_map,
@@ -933,7 +1104,7 @@ class DataChain:
933
1104
  It accepts the same parameters plus an
934
1105
  additional parameter:
935
1106
 
936
- batch : Size of each batch passed to `func`. Defaults to 1000.
1107
+ batch: Size of each batch passed to `func`. Defaults to 1000.
937
1108
 
938
1109
  Example:
939
1110
  ```py
@@ -960,7 +1131,7 @@ class DataChain:
960
1131
 
961
1132
  return self._evolve(
962
1133
  query=self._query.add_signals(
963
- udf_obj.to_udf_wrapper(self._settings.batch_rows, batch=batch),
1134
+ udf_obj.to_udf_wrapper(self._settings.batch_size, batch=batch),
964
1135
  **self._settings.to_dict(),
965
1136
  ),
966
1137
  signal_schema=self.signals_schema | udf_obj.output,
@@ -969,8 +1140,8 @@ class DataChain:
969
1140
  def _udf_to_obj(
970
1141
  self,
971
1142
  target_class: type[UDFObjT],
972
- func: Optional[Union[Callable, UDFObjT]],
973
- params: Union[None, str, Sequence[str]],
1143
+ func: Callable | UDFObjT | None,
1144
+ params: str | Sequence[str] | None,
974
1145
  output: OutputType,
975
1146
  signal_map: dict[str, Callable],
976
1147
  ) -> UDFObjT:
@@ -981,11 +1152,7 @@ class DataChain:
981
1152
  sign = UdfSignature.parse(name, signal_map, func, params, output, is_generator)
982
1153
  DataModel.register(list(sign.output_schema.values.values()))
983
1154
 
984
- signals_schema = self.signals_schema
985
- if self._sys:
986
- signals_schema = SignalSchema({"sys": Sys}) | signals_schema
987
-
988
- params_schema = signals_schema.slice(
1155
+ params_schema = self.signals_schema.slice(
989
1156
  sign.params, self._setup, is_batch=is_batch
990
1157
  )
991
1158
 
@@ -1016,7 +1183,8 @@ class DataChain:
1016
1183
  the order of the records in the chain is important.
1017
1184
  Using `order_by` directly before `limit`, `to_list` and similar methods
1018
1185
  will give expected results.
1019
- See https://github.com/iterative/datachain/issues/477 for further details.
1186
+ See https://github.com/datachain-ai/datachain/issues/477
1187
+ for further details.
1020
1188
  """
1021
1189
  if descending:
1022
1190
  args = tuple(sqlalchemy.desc(a) for a in args)
@@ -1040,11 +1208,9 @@ class DataChain:
1040
1208
  )
1041
1209
  )
1042
1210
 
1043
- def select(self, *args: str, _sys: bool = True) -> "Self":
1211
+ def select(self, *args: str) -> "Self":
1044
1212
  """Select only a specified set of signals."""
1045
1213
  new_schema = self.signals_schema.resolve(*args)
1046
- if self._sys and _sys:
1047
- new_schema = SignalSchema({"sys": Sys}) | new_schema
1048
1214
  columns = new_schema.db_signals()
1049
1215
  return self._evolve(
1050
1216
  query=self._query.select(*columns), signal_schema=new_schema
@@ -1062,7 +1228,7 @@ class DataChain:
1062
1228
  def group_by( # noqa: C901, PLR0912
1063
1229
  self,
1064
1230
  *,
1065
- partition_by: Optional[Union[str, Func, Sequence[Union[str, Func]]]] = None,
1231
+ partition_by: str | Func | Sequence[str | Func] | None = None,
1066
1232
  **kwargs: Func,
1067
1233
  ) -> "Self":
1068
1234
  """Group rows by specified set of signals and return new signals
@@ -1301,9 +1467,9 @@ class DataChain:
1301
1467
  """Yields flattened rows of values as a tuple.
1302
1468
 
1303
1469
  Args:
1304
- row_factory : A callable to convert row to a custom format.
1305
- It should accept two arguments: a list of column names and
1306
- a tuple of row values.
1470
+ row_factory: A callable to convert row to a custom format.
1471
+ It should accept two arguments: a list of column names and
1472
+ a tuple of row values.
1307
1473
  include_hidden: Whether to include hidden signals from the schema.
1308
1474
  """
1309
1475
  db_signals = self._effective_signals_schema.db_signals(
@@ -1368,7 +1534,7 @@ class DataChain:
1368
1534
  """Convert every row to a dictionary."""
1369
1535
 
1370
1536
  def to_dict(cols: list[str], row: tuple[Any, ...]) -> dict[str, Any]:
1371
- return dict(zip(cols, row))
1537
+ return dict(zip(cols, row, strict=False))
1372
1538
 
1373
1539
  return self.results(row_factory=to_dict)
1374
1540
 
@@ -1426,7 +1592,7 @@ class DataChain:
1426
1592
  @overload
1427
1593
  def collect(self, *cols: str) -> Iterator[tuple[DataValue, ...]]: ...
1428
1594
 
1429
- def collect(self, *cols: str) -> Iterator[Union[DataValue, tuple[DataValue, ...]]]: # type: ignore[overload-overlap,misc]
1595
+ def collect(self, *cols: str) -> Iterator[DataValue | tuple[DataValue, ...]]: # type: ignore[overload-overlap,misc]
1430
1596
  """
1431
1597
  Deprecated. Use `to_iter` method instead.
1432
1598
  """
@@ -1491,8 +1657,8 @@ class DataChain:
1491
1657
  def merge(
1492
1658
  self,
1493
1659
  right_ds: "DataChain",
1494
- on: Union[MergeColType, Sequence[MergeColType]],
1495
- right_on: Optional[Union[MergeColType, Sequence[MergeColType]]] = None,
1660
+ on: MergeColType | Sequence[MergeColType],
1661
+ right_on: MergeColType | Sequence[MergeColType] | None = None,
1496
1662
  inner=False,
1497
1663
  full=False,
1498
1664
  rname="right_",
@@ -1560,8 +1726,8 @@ class DataChain:
1560
1726
 
1561
1727
  def _resolve(
1562
1728
  ds: DataChain,
1563
- col: Union[str, Function, sqlalchemy.ColumnElement],
1564
- side: Union[str, None],
1729
+ col: str | Function | sqlalchemy.ColumnElement,
1730
+ side: str | None,
1565
1731
  ):
1566
1732
  try:
1567
1733
  if isinstance(col, Function):
@@ -1574,7 +1740,7 @@ class DataChain:
1574
1740
  ops = [
1575
1741
  _resolve(self, left, "left")
1576
1742
  == _resolve(right_ds, right, "right" if right_on else None)
1577
- for left, right in zip(on, right_on or on)
1743
+ for left, right in zip(on, right_on or on, strict=False)
1578
1744
  ]
1579
1745
 
1580
1746
  if errors:
@@ -1583,16 +1749,17 @@ class DataChain:
1583
1749
  )
1584
1750
 
1585
1751
  query = self._query.join(
1586
- right_ds._query, sqlalchemy.and_(*ops), inner, full, rname + "{name}"
1752
+ right_ds._query, sqlalchemy.and_(*ops), inner, full, rname
1587
1753
  )
1588
1754
  query.feature_schema = None
1589
1755
  ds = self._evolve(query=query)
1590
1756
 
1757
+ # Note: merge drops sys signals from both sides, make sure to not include it
1758
+ # in the resulting schema
1591
1759
  signals_schema = self.signals_schema.clone_without_sys_signals()
1592
1760
  right_signals_schema = right_ds.signals_schema.clone_without_sys_signals()
1593
- ds.signals_schema = SignalSchema({"sys": Sys}) | signals_schema.merge(
1594
- right_signals_schema, rname
1595
- )
1761
+
1762
+ ds.signals_schema = signals_schema.merge(right_signals_schema, rname)
1596
1763
 
1597
1764
  return ds
1598
1765
 
@@ -1603,13 +1770,23 @@ class DataChain:
1603
1770
  Parameters:
1604
1771
  other: chain whose rows will be added to `self`.
1605
1772
  """
1773
+ self_schema = self.signals_schema
1774
+ other_schema = other.signals_schema
1775
+ missing_left, missing_right = self_schema.compare_signals(other_schema)
1776
+ if missing_left or missing_right:
1777
+ raise UnionSchemaMismatchError.from_column_sets(
1778
+ missing_left,
1779
+ missing_right,
1780
+ )
1781
+
1782
+ self.signals_schema = self_schema.clone_without_sys_signals()
1606
1783
  return self._evolve(query=self._query.union(other._query))
1607
1784
 
1608
1785
  def subtract( # type: ignore[override]
1609
1786
  self,
1610
1787
  other: "DataChain",
1611
- on: Optional[Union[str, Sequence[str]]] = None,
1612
- right_on: Optional[Union[str, Sequence[str]]] = None,
1788
+ on: str | Sequence[str] | None = None,
1789
+ right_on: str | Sequence[str] | None = None,
1613
1790
  ) -> "Self":
1614
1791
  """Remove rows that appear in another chain.
1615
1792
 
@@ -1666,6 +1843,7 @@ class DataChain:
1666
1843
  zip(
1667
1844
  self.signals_schema.resolve(*on).db_signals(),
1668
1845
  other.signals_schema.resolve(*right_on).db_signals(),
1846
+ strict=False,
1669
1847
  ) # type: ignore[arg-type]
1670
1848
  )
1671
1849
  return self._evolve(query=self._query.subtract(other._query, signals)) # type: ignore[arg-type]
@@ -1673,15 +1851,15 @@ class DataChain:
1673
1851
  def diff(
1674
1852
  self,
1675
1853
  other: "DataChain",
1676
- on: Union[str, Sequence[str]],
1677
- right_on: Optional[Union[str, Sequence[str]]] = None,
1678
- compare: Optional[Union[str, Sequence[str]]] = None,
1679
- right_compare: Optional[Union[str, Sequence[str]]] = None,
1854
+ on: str | Sequence[str],
1855
+ right_on: str | Sequence[str] | None = None,
1856
+ compare: str | Sequence[str] | None = None,
1857
+ right_compare: str | Sequence[str] | None = None,
1680
1858
  added: bool = True,
1681
1859
  deleted: bool = True,
1682
1860
  modified: bool = True,
1683
1861
  same: bool = False,
1684
- status_col: Optional[str] = None,
1862
+ status_col: str | None = None,
1685
1863
  ) -> "DataChain":
1686
1864
  """Calculate differences between two chains.
1687
1865
 
@@ -1742,12 +1920,12 @@ class DataChain:
1742
1920
  self,
1743
1921
  other: "DataChain",
1744
1922
  on: str = "file",
1745
- right_on: Optional[str] = None,
1923
+ right_on: str | None = None,
1746
1924
  added: bool = True,
1747
1925
  modified: bool = True,
1748
1926
  deleted: bool = False,
1749
1927
  same: bool = False,
1750
- status_col: Optional[str] = None,
1928
+ status_col: str | None = None,
1751
1929
  ) -> "DataChain":
1752
1930
  """Calculate differences between two chains containing files.
1753
1931
 
@@ -1845,12 +2023,15 @@ class DataChain:
1845
2023
  self,
1846
2024
  flatten: bool = False,
1847
2025
  include_hidden: bool = True,
2026
+ as_object: bool = False,
1848
2027
  ) -> "pd.DataFrame":
1849
2028
  """Return a pandas DataFrame from the chain.
1850
2029
 
1851
2030
  Parameters:
1852
2031
  flatten: Whether to use a multiindex or flatten column names.
1853
2032
  include_hidden: Whether to include hidden columns.
2033
+ as_object: Whether to emit a dataframe backed by Python objects
2034
+ rather than pandas-inferred dtypes.
1854
2035
 
1855
2036
  Returns:
1856
2037
  pd.DataFrame: A pandas DataFrame representation of the chain.
@@ -1860,12 +2041,18 @@ class DataChain:
1860
2041
  headers, max_length = self._effective_signals_schema.get_headers_with_length(
1861
2042
  include_hidden=include_hidden
1862
2043
  )
2044
+
2045
+ columns: list[str] | pd.MultiIndex
1863
2046
  if flatten or max_length < 2:
1864
2047
  columns = [".".join(filter(None, header)) for header in headers]
1865
2048
  else:
1866
2049
  columns = pd.MultiIndex.from_tuples(map(tuple, headers))
1867
2050
 
1868
2051
  results = self.results(include_hidden=include_hidden)
2052
+ if as_object:
2053
+ df = pd.DataFrame(results, columns=columns, dtype=object)
2054
+ df.where(pd.notna(df), None, inplace=True)
2055
+ return df
1869
2056
  return pd.DataFrame.from_records(results, columns=columns)
1870
2057
 
1871
2058
  def show(
@@ -1888,7 +2075,11 @@ class DataChain:
1888
2075
  import pandas as pd
1889
2076
 
1890
2077
  dc = self.limit(limit) if limit > 0 else self # type: ignore[misc]
1891
- df = dc.to_pandas(flatten, include_hidden=include_hidden)
2078
+ df = dc.to_pandas(
2079
+ flatten,
2080
+ include_hidden=include_hidden,
2081
+ as_object=True,
2082
+ )
1892
2083
 
1893
2084
  if df.empty:
1894
2085
  print("Empty result")
@@ -1947,20 +2138,20 @@ class DataChain:
1947
2138
  column: str = "",
1948
2139
  model_name: str = "",
1949
2140
  source: bool = True,
1950
- nrows: Optional[int] = None,
1951
- **kwargs,
2141
+ nrows: int | None = None,
2142
+ **kwargs: Any,
1952
2143
  ) -> "Self":
1953
2144
  """Generate chain from list of tabular files.
1954
2145
 
1955
2146
  Parameters:
1956
- output : Dictionary or feature class defining column names and their
2147
+ output: Dictionary or feature class defining column names and their
1957
2148
  corresponding types. List of column names is also accepted, in which
1958
2149
  case types will be inferred.
1959
- column : Generated column name.
1960
- model_name : Generated model name.
1961
- source : Whether to include info about the source file.
1962
- nrows : Optional row limit.
1963
- kwargs : Parameters to pass to pyarrow.dataset.dataset.
2150
+ column: Generated column name.
2151
+ model_name: Generated model name.
2152
+ source: Whether to include info about the source file.
2153
+ nrows: Optional row limit.
2154
+ kwargs: Parameters to pass to pyarrow.dataset.dataset.
1964
2155
 
1965
2156
  Example:
1966
2157
  Reading a json lines file:
@@ -2081,23 +2272,23 @@ class DataChain:
2081
2272
 
2082
2273
  def to_parquet(
2083
2274
  self,
2084
- path: Union[str, os.PathLike[str], BinaryIO],
2085
- partition_cols: Optional[Sequence[str]] = None,
2275
+ path: str | os.PathLike[str] | BinaryIO,
2276
+ partition_cols: Sequence[str] | None = None,
2086
2277
  chunk_size: int = DEFAULT_PARQUET_CHUNK_SIZE,
2087
- fs_kwargs: Optional[dict[str, Any]] = None,
2278
+ fs_kwargs: dict[str, Any] | None = None,
2088
2279
  **kwargs,
2089
2280
  ) -> None:
2090
2281
  """Save chain to parquet file with SignalSchema metadata.
2091
2282
 
2092
2283
  Parameters:
2093
- path : Path or a file-like binary object to save the file. This supports
2284
+ path: Path or a file-like binary object to save the file. This supports
2094
2285
  local paths as well as remote paths, such as s3:// or hf:// with fsspec.
2095
- partition_cols : Column names by which to partition the dataset.
2096
- chunk_size : The chunk size of results to read and convert to columnar
2286
+ partition_cols: Column names by which to partition the dataset.
2287
+ chunk_size: The chunk size of results to read and convert to columnar
2097
2288
  data, to avoid running out of memory.
2098
- fs_kwargs : Optional kwargs to pass to the fsspec filesystem, used only for
2099
- write, for fsspec-type URLs, such as s3:// or hf:// when
2100
- provided as the destination path.
2289
+ fs_kwargs: Optional kwargs forwarded to the underlying fsspec filesystem
2290
+ when writing (e.g., s3://, gs://, hf://), fsspec-specific options
2291
+ are supported.
2101
2292
  """
2102
2293
  import pyarrow as pa
2103
2294
  import pyarrow.parquet as pq
@@ -2141,7 +2332,7 @@ class DataChain:
2141
2332
  # pyarrow infers the best parquet schema from the python types of
2142
2333
  # the input data.
2143
2334
  table = pa.Table.from_pydict(
2144
- dict(zip(column_names, chunk)),
2335
+ dict(zip(column_names, chunk, strict=False)),
2145
2336
  schema=parquet_schema,
2146
2337
  )
2147
2338
 
@@ -2179,137 +2370,116 @@ class DataChain:
2179
2370
 
2180
2371
  def to_csv(
2181
2372
  self,
2182
- path: Union[str, os.PathLike[str]],
2373
+ path: str | os.PathLike[str],
2183
2374
  delimiter: str = ",",
2184
- fs_kwargs: Optional[dict[str, Any]] = None,
2375
+ fs_kwargs: dict[str, Any] | None = None,
2185
2376
  **kwargs,
2186
- ) -> None:
2187
- """Save chain to a csv (comma-separated values) file.
2377
+ ) -> File:
2378
+ """Save chain to a csv (comma-separated values) file and return the stored
2379
+ `File`.
2188
2380
 
2189
2381
  Parameters:
2190
- path : Path to save the file. This supports local paths as well as
2382
+ path: Path to save the file. This supports local paths as well as
2191
2383
  remote paths, such as s3:// or hf:// with fsspec.
2192
- delimiter : Delimiter to use for the resulting file.
2193
- fs_kwargs : Optional kwargs to pass to the fsspec filesystem, used only for
2194
- write, for fsspec-type URLs, such as s3:// or hf:// when
2195
- provided as the destination path.
2384
+ delimiter: Delimiter to use for the resulting file.
2385
+ fs_kwargs: Optional kwargs forwarded to the underlying fsspec filesystem
2386
+ when writing (e.g., s3://, gs://, hf://), fsspec-specific options
2387
+ are supported.
2388
+ Returns:
2389
+ File: The stored file with refreshed metadata (version, etag, size).
2196
2390
  """
2197
2391
  import csv
2198
2392
 
2199
- opener = open
2200
-
2201
- if isinstance(path, str) and "://" in path:
2202
- from datachain.client.fsspec import Client
2203
-
2204
- fs_kwargs = {
2205
- **self._query.catalog.client_config,
2206
- **(fs_kwargs or {}),
2207
- }
2208
-
2209
- client = Client.get_implementation(path)
2210
-
2211
- fsspec_fs = client.create_fs(**fs_kwargs)
2212
-
2213
- opener = fsspec_fs.open
2393
+ target = File.at(path, session=self.session)
2214
2394
 
2215
2395
  headers, _ = self._effective_signals_schema.get_headers_with_length()
2216
2396
  column_names = [".".join(filter(None, header)) for header in headers]
2217
2397
 
2218
- results_iter = self._leaf_values()
2219
-
2220
- with opener(path, "w", newline="") as f:
2398
+ with target.open("w", newline="", client_config=fs_kwargs) as f:
2221
2399
  writer = csv.writer(f, delimiter=delimiter, **kwargs)
2222
2400
  writer.writerow(column_names)
2223
-
2224
- for row in results_iter:
2401
+ for row in self._leaf_values():
2225
2402
  writer.writerow(row)
2226
2403
 
2404
+ return target
2405
+
2227
2406
  def to_json(
2228
2407
  self,
2229
- path: Union[str, os.PathLike[str]],
2230
- fs_kwargs: Optional[dict[str, Any]] = None,
2408
+ path: str | os.PathLike[str],
2409
+ fs_kwargs: dict[str, Any] | None = None,
2231
2410
  include_outer_list: bool = True,
2232
- ) -> None:
2233
- """Save chain to a JSON file.
2411
+ ) -> File:
2412
+ """Save chain to a JSON file and return the stored `File`.
2234
2413
 
2235
2414
  Parameters:
2236
- path : Path to save the file. This supports local paths as well as
2415
+ path: Path to save the file. This supports local paths as well as
2237
2416
  remote paths, such as s3:// or hf:// with fsspec.
2238
- fs_kwargs : Optional kwargs to pass to the fsspec filesystem, used only for
2239
- write, for fsspec-type URLs, such as s3:// or hf:// when
2240
- provided as the destination path.
2241
- include_outer_list : Sets whether to include an outer list for all rows.
2417
+ fs_kwargs: Optional kwargs forwarded to the underlying fsspec filesystem
2418
+ when writing (e.g., s3://, gs://, hf://), fsspec-specific options
2419
+ are supported.
2420
+ include_outer_list: Sets whether to include an outer list for all rows.
2242
2421
  Setting this to True makes the file valid JSON, while False instead
2243
2422
  writes in the JSON lines format.
2423
+ Returns:
2424
+ File: The stored file with refreshed metadata (version, etag, size).
2244
2425
  """
2245
- opener = open
2246
-
2247
- if isinstance(path, str) and "://" in path:
2248
- from datachain.client.fsspec import Client
2249
-
2250
- fs_kwargs = {
2251
- **self._query.catalog.client_config,
2252
- **(fs_kwargs or {}),
2253
- }
2254
-
2255
- client = Client.get_implementation(path)
2256
-
2257
- fsspec_fs = client.create_fs(**fs_kwargs)
2258
-
2259
- opener = fsspec_fs.open
2260
-
2426
+ target = File.at(path, session=self.session)
2261
2427
  headers, _ = self._effective_signals_schema.get_headers_with_length()
2262
- headers = [list(filter(None, header)) for header in headers]
2428
+ headers = [list(filter(None, h)) for h in headers]
2429
+ with target.open("wb", client_config=fs_kwargs) as f:
2430
+ self._write_json_stream(f, headers, include_outer_list)
2431
+ return target
2263
2432
 
2433
+ def _write_json_stream(
2434
+ self,
2435
+ f: IO[bytes],
2436
+ headers: list[list[str]],
2437
+ include_outer_list: bool,
2438
+ ) -> None:
2264
2439
  is_first = True
2265
-
2266
- with opener(path, "wb") as f:
2267
- if include_outer_list:
2268
- # This makes the file JSON instead of JSON lines.
2269
- f.write(b"[\n")
2270
- for row in self._leaf_values():
2271
- if not is_first:
2272
- if include_outer_list:
2273
- # This makes the file JSON instead of JSON lines.
2274
- f.write(b",\n")
2275
- else:
2276
- f.write(b"\n")
2277
- else:
2278
- is_first = False
2279
- f.write(
2280
- json.dumps(
2281
- row_to_nested_dict(headers, row), ensure_ascii=False
2282
- ).encode("utf-8")
2283
- )
2284
- if include_outer_list:
2285
- # This makes the file JSON instead of JSON lines.
2286
- f.write(b"\n]\n")
2440
+ if include_outer_list:
2441
+ f.write(b"[\n")
2442
+ for row in self._leaf_values():
2443
+ if not is_first:
2444
+ f.write(b",\n" if include_outer_list else b"\n")
2445
+ else:
2446
+ is_first = False
2447
+ f.write(
2448
+ json.dumps(
2449
+ row_to_nested_dict(headers, row),
2450
+ ensure_ascii=False,
2451
+ ).encode("utf-8")
2452
+ )
2453
+ if include_outer_list:
2454
+ f.write(b"\n]\n")
2287
2455
 
2288
2456
  def to_jsonl(
2289
2457
  self,
2290
- path: Union[str, os.PathLike[str]],
2291
- fs_kwargs: Optional[dict[str, Any]] = None,
2292
- ) -> None:
2458
+ path: str | os.PathLike[str],
2459
+ fs_kwargs: dict[str, Any] | None = None,
2460
+ ) -> File:
2293
2461
  """Save chain to a JSON lines file.
2294
2462
 
2295
2463
  Parameters:
2296
- path : Path to save the file. This supports local paths as well as
2464
+ path: Path to save the file. This supports local paths as well as
2297
2465
  remote paths, such as s3:// or hf:// with fsspec.
2298
- fs_kwargs : Optional kwargs to pass to the fsspec filesystem, used only for
2299
- write, for fsspec-type URLs, such as s3:// or hf:// when
2300
- provided as the destination path.
2466
+ fs_kwargs: Optional kwargs forwarded to the underlying fsspec filesystem
2467
+ when writing (e.g., s3://, gs://, hf://), fsspec-specific options
2468
+ are supported.
2469
+ Returns:
2470
+ File: The stored file with refreshed metadata (version, etag, size).
2301
2471
  """
2302
- self.to_json(path, fs_kwargs, include_outer_list=False)
2472
+ return self.to_json(path, fs_kwargs, include_outer_list=False)
2303
2473
 
2304
2474
  def to_database(
2305
2475
  self,
2306
2476
  table_name: str,
2307
2477
  connection: "ConnectionType",
2308
2478
  *,
2309
- batch_rows: int = DEFAULT_DATABASE_BATCH_SIZE,
2310
- on_conflict: Optional[str] = None,
2311
- conflict_columns: Optional[list[str]] = None,
2312
- column_mapping: Optional[dict[str, Optional[str]]] = None,
2479
+ batch_size: int = DEFAULT_DATABASE_BATCH_SIZE,
2480
+ on_conflict: str | None = None,
2481
+ conflict_columns: list[str] | None = None,
2482
+ column_mapping: dict[str, str | None] | None = None,
2313
2483
  ) -> int:
2314
2484
  """Save chain to a database table using a given database connection.
2315
2485
 
@@ -2328,7 +2498,7 @@ class DataChain:
2328
2498
  library. If a DBAPI2 object, only sqlite3 is supported. The user is
2329
2499
  responsible for engine disposal and connection closure for the
2330
2500
  SQLAlchemy connectable; str connections are closed automatically.
2331
- batch_rows: Number of rows to insert per batch for optimal performance.
2501
+ batch_size: Number of rows to insert per batch for optimal performance.
2332
2502
  Larger batches are faster but use more memory. Default: 10,000.
2333
2503
  on_conflict: Strategy for handling duplicate rows (requires table
2334
2504
  constraints):
@@ -2409,7 +2579,7 @@ class DataChain:
2409
2579
  self,
2410
2580
  table_name,
2411
2581
  connection,
2412
- batch_rows=batch_rows,
2582
+ batch_size=batch_size,
2413
2583
  on_conflict=on_conflict,
2414
2584
  conflict_columns=conflict_columns,
2415
2585
  column_mapping=column_mapping,
@@ -2545,13 +2715,13 @@ class DataChain:
2545
2715
 
2546
2716
  def to_storage(
2547
2717
  self,
2548
- output: Union[str, os.PathLike[str]],
2718
+ output: str | os.PathLike[str],
2549
2719
  signal: str = "file",
2550
2720
  placement: FileExportPlacement = "fullpath",
2551
2721
  link_type: Literal["copy", "symlink"] = "copy",
2552
- num_threads: Optional[int] = EXPORT_FILES_MAX_THREADS,
2553
- anon: Optional[bool] = None,
2554
- client_config: Optional[dict] = None,
2722
+ num_threads: int | None = EXPORT_FILES_MAX_THREADS,
2723
+ anon: bool | None = None,
2724
+ client_config: dict | None = None,
2555
2725
  ) -> None:
2556
2726
  """Export files from a specified signal to a directory. Files can be
2557
2727
  exported to a local or cloud directory.
@@ -2560,12 +2730,24 @@ class DataChain:
2560
2730
  output: Path to the target directory for exporting files.
2561
2731
  signal: Name of the signal to export files from.
2562
2732
  placement: The method to use for naming exported files.
2563
- The possible values are: "filename", "etag", "fullpath", and "checksum".
2733
+ The possible values are: "filename", "etag", "fullpath",
2734
+ "filepath", and "checksum".
2735
+ Example path translations for an object located at
2736
+ ``s3://bucket/data/img.jpg`` and exported to ``./out``:
2737
+
2738
+ - "filename" -> ``./out/img.jpg`` (no directories)
2739
+ - "filepath" -> ``./out/data/img.jpg`` (relative path kept)
2740
+ - "fullpath" -> ``./out/bucket/data/img.jpg`` (remote host kept)
2741
+ - "etag" -> ``./out/<etag>.jpg`` (unique name via object digest)
2742
+
2743
+ Local sources behave like "filepath" for "fullpath" placement.
2744
+ Relative destinations such as "." or ".." and absolute paths
2745
+ are supported for every strategy.
2564
2746
  link_type: Method to use for exporting files.
2565
2747
  Falls back to `'copy'` if symlinking fails.
2566
- num_threads : number of threads to use for exporting files.
2567
- By default it uses 5 threads.
2568
- anon: If True, we will treat cloud bucket as public one. Default behavior
2748
+ num_threads: number of threads to use for exporting files.
2749
+ By default, it uses 5 threads.
2750
+ anon: If True, we will treat cloud bucket as a public one. Default behavior
2569
2751
  depends on the previous session configuration (e.g. happens in the
2570
2752
  initial `read_storage`) and particular cloud storage client
2571
2753
  implementation (e.g. S3 fallbacks to anonymous access if no credentials
@@ -2614,8 +2796,20 @@ class DataChain:
  )

  def shuffle(self) -> "Self":
- """Shuffle the rows of the chain deterministically."""
- return self.order_by("sys.rand")
+ """Shuffle rows with a best-effort deterministic ordering.
+
+ This produces repeatable shuffles. Merge and union operations can
+ lead to non-deterministic results. Use order by or save a dataset
+ afterward to guarantee the same result.
+ """
+ query = self._query.clone(new_table=False)
+ query.steps.append(RegenerateSystemColumns(self._query.catalog))
+
+ chain = self._evolve(
+ query=query,
+ signal_schema=SignalSchema({"sys": Sys}) | self.signals_schema,
+ )
+ return chain.order_by("sys.rand")

  def sample(self, n: int) -> "Self":
  """Return a random sample from the chain.