datachain 0.14.2__py3-none-any.whl → 0.39.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. datachain/__init__.py +20 -0
  2. datachain/asyn.py +11 -12
  3. datachain/cache.py +7 -7
  4. datachain/catalog/__init__.py +2 -2
  5. datachain/catalog/catalog.py +621 -507
  6. datachain/catalog/dependency.py +164 -0
  7. datachain/catalog/loader.py +28 -18
  8. datachain/checkpoint.py +43 -0
  9. datachain/cli/__init__.py +24 -33
  10. datachain/cli/commands/__init__.py +1 -8
  11. datachain/cli/commands/datasets.py +83 -52
  12. datachain/cli/commands/ls.py +17 -17
  13. datachain/cli/commands/show.py +4 -4
  14. datachain/cli/parser/__init__.py +8 -74
  15. datachain/cli/parser/job.py +95 -3
  16. datachain/cli/parser/studio.py +11 -4
  17. datachain/cli/parser/utils.py +1 -2
  18. datachain/cli/utils.py +2 -15
  19. datachain/client/azure.py +4 -4
  20. datachain/client/fsspec.py +45 -28
  21. datachain/client/gcs.py +6 -6
  22. datachain/client/hf.py +29 -2
  23. datachain/client/http.py +157 -0
  24. datachain/client/local.py +15 -11
  25. datachain/client/s3.py +17 -9
  26. datachain/config.py +4 -8
  27. datachain/data_storage/db_engine.py +12 -6
  28. datachain/data_storage/job.py +5 -1
  29. datachain/data_storage/metastore.py +1252 -186
  30. datachain/data_storage/schema.py +58 -45
  31. datachain/data_storage/serializer.py +105 -15
  32. datachain/data_storage/sqlite.py +286 -127
  33. datachain/data_storage/warehouse.py +250 -113
  34. datachain/dataset.py +353 -148
  35. datachain/delta.py +391 -0
  36. datachain/diff/__init__.py +27 -29
  37. datachain/error.py +60 -0
  38. datachain/func/__init__.py +2 -1
  39. datachain/func/aggregate.py +66 -42
  40. datachain/func/array.py +242 -38
  41. datachain/func/base.py +7 -4
  42. datachain/func/conditional.py +110 -60
  43. datachain/func/func.py +96 -45
  44. datachain/func/numeric.py +55 -38
  45. datachain/func/path.py +32 -20
  46. datachain/func/random.py +2 -2
  47. datachain/func/string.py +67 -37
  48. datachain/func/window.py +7 -8
  49. datachain/hash_utils.py +123 -0
  50. datachain/job.py +11 -7
  51. datachain/json.py +138 -0
  52. datachain/lib/arrow.py +58 -22
  53. datachain/lib/audio.py +245 -0
  54. datachain/lib/clip.py +14 -13
  55. datachain/lib/convert/flatten.py +5 -3
  56. datachain/lib/convert/python_to_sql.py +6 -10
  57. datachain/lib/convert/sql_to_python.py +8 -0
  58. datachain/lib/convert/values_to_tuples.py +156 -51
  59. datachain/lib/data_model.py +42 -20
  60. datachain/lib/dataset_info.py +36 -8
  61. datachain/lib/dc/__init__.py +8 -2
  62. datachain/lib/dc/csv.py +25 -28
  63. datachain/lib/dc/database.py +398 -0
  64. datachain/lib/dc/datachain.py +1289 -425
  65. datachain/lib/dc/datasets.py +320 -38
  66. datachain/lib/dc/hf.py +38 -24
  67. datachain/lib/dc/json.py +29 -32
  68. datachain/lib/dc/listings.py +112 -8
  69. datachain/lib/dc/pandas.py +16 -12
  70. datachain/lib/dc/parquet.py +35 -23
  71. datachain/lib/dc/records.py +31 -23
  72. datachain/lib/dc/storage.py +154 -64
  73. datachain/lib/dc/storage_pattern.py +251 -0
  74. datachain/lib/dc/utils.py +24 -16
  75. datachain/lib/dc/values.py +8 -9
  76. datachain/lib/file.py +622 -89
  77. datachain/lib/hf.py +69 -39
  78. datachain/lib/image.py +14 -14
  79. datachain/lib/listing.py +14 -11
  80. datachain/lib/listing_info.py +1 -2
  81. datachain/lib/meta_formats.py +3 -4
  82. datachain/lib/model_store.py +39 -7
  83. datachain/lib/namespaces.py +125 -0
  84. datachain/lib/projects.py +130 -0
  85. datachain/lib/pytorch.py +32 -21
  86. datachain/lib/settings.py +192 -56
  87. datachain/lib/signal_schema.py +427 -104
  88. datachain/lib/tar.py +1 -2
  89. datachain/lib/text.py +8 -7
  90. datachain/lib/udf.py +164 -76
  91. datachain/lib/udf_signature.py +60 -35
  92. datachain/lib/utils.py +118 -4
  93. datachain/lib/video.py +17 -9
  94. datachain/lib/webdataset.py +61 -56
  95. datachain/lib/webdataset_laion.py +15 -16
  96. datachain/listing.py +22 -10
  97. datachain/model/bbox.py +3 -1
  98. datachain/model/ultralytics/bbox.py +16 -12
  99. datachain/model/ultralytics/pose.py +16 -12
  100. datachain/model/ultralytics/segment.py +16 -12
  101. datachain/namespace.py +84 -0
  102. datachain/node.py +6 -6
  103. datachain/nodes_thread_pool.py +0 -1
  104. datachain/plugins.py +24 -0
  105. datachain/project.py +78 -0
  106. datachain/query/batch.py +40 -41
  107. datachain/query/dataset.py +604 -322
  108. datachain/query/dispatch.py +261 -154
  109. datachain/query/metrics.py +4 -6
  110. datachain/query/params.py +2 -3
  111. datachain/query/queue.py +3 -12
  112. datachain/query/schema.py +11 -6
  113. datachain/query/session.py +200 -33
  114. datachain/query/udf.py +34 -2
  115. datachain/remote/studio.py +171 -69
  116. datachain/script_meta.py +12 -12
  117. datachain/semver.py +68 -0
  118. datachain/sql/__init__.py +2 -0
  119. datachain/sql/functions/array.py +33 -1
  120. datachain/sql/postgresql_dialect.py +9 -0
  121. datachain/sql/postgresql_types.py +21 -0
  122. datachain/sql/sqlite/__init__.py +5 -1
  123. datachain/sql/sqlite/base.py +102 -29
  124. datachain/sql/sqlite/types.py +8 -13
  125. datachain/sql/types.py +70 -15
  126. datachain/studio.py +223 -46
  127. datachain/toolkit/split.py +31 -10
  128. datachain/utils.py +101 -59
  129. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/METADATA +77 -22
  130. datachain-0.39.0.dist-info/RECORD +173 -0
  131. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/WHEEL +1 -1
  132. datachain/cli/commands/query.py +0 -53
  133. datachain/query/utils.py +0 -42
  134. datachain-0.14.2.dist-info/RECORD +0 -158
  135. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
  136. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
  137. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
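The largest single change in this release is `datachain/lib/dc/datachain.py` (+1289 / -425), whose diff follows. For orientation, here is a minimal, hypothetical sketch of the user-facing API after the upgrade, based only on the signatures visible in the diff below; the storage URI and column names are made up, and `read_storage` is assumed as the chain constructor (it is not part of the diff shown):

```py
import datachain as dc

chain = (
    dc.read_storage("s3://example-bucket/images/")      # hypothetical source
    .settings(cache=True, parallel=8, batch_size=300)   # batch_size is new in settings()
    .map(size_kb=lambda file: file.size / 1024, output=float)
)

# save() now requires a name, uses semver dataset versions ("1.2.3"),
# and takes attrs= instead of labels=
chain.save("my_dataset", attrs=["demo"], update_version="patch")

# collect() is deprecated in favor of to_iter(), which always yields tuples
for (path, size_kb) in chain.to_iter("file.path", "size_kb"):
    print(path, size_kb)

# an unnamed save() is replaced by persist() for temporary datasets
tmp = chain.persist()
```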
@@ -1,51 +1,69 @@
1
1
  import copy
2
+ import hashlib
3
+ import logging
2
4
  import os
3
5
  import os.path
4
6
  import sys
5
7
  import warnings
6
- from collections.abc import Iterator, Sequence
8
+ from collections.abc import Callable, Iterator, Sequence
7
9
  from typing import (
8
10
  IO,
9
11
  TYPE_CHECKING,
10
12
  Any,
11
13
  BinaryIO,
12
- Callable,
13
14
  ClassVar,
14
15
  Literal,
15
- Optional,
16
16
  TypeVar,
17
- Union,
17
+ cast,
18
18
  overload,
19
19
  )
20
20
 
21
- import orjson
22
21
  import sqlalchemy
23
22
  from pydantic import BaseModel
23
+ from sqlalchemy.sql.elements import ColumnElement
24
24
  from tqdm import tqdm
25
25
 
26
+ from datachain import json, semver
26
27
  from datachain.dataset import DatasetRecord
28
+ from datachain.delta import delta_disabled
29
+ from datachain.error import (
30
+ JobAncestryDepthExceededError,
31
+ ProjectCreateNotAllowedError,
32
+ ProjectNotFoundError,
33
+ )
27
34
  from datachain.func import literal
28
35
  from datachain.func.base import Function
29
36
  from datachain.func.func import Func
37
+ from datachain.job import Job
30
38
  from datachain.lib.convert.python_to_sql import python_to_sql
31
- from datachain.lib.data_model import DataModel, DataType, DataValue, dict_to_data_model
32
- from datachain.lib.file import (
33
- EXPORT_FILES_MAX_THREADS,
34
- ArrowRow,
35
- FileExporter,
39
+ from datachain.lib.data_model import (
40
+ DataModel,
41
+ DataType,
42
+ DataValue,
43
+ StandardType,
44
+ dict_to_data_model,
36
45
  )
46
+ from datachain.lib.file import EXPORT_FILES_MAX_THREADS, ArrowRow, File, FileExporter
37
47
  from datachain.lib.file import ExportPlacement as FileExportPlacement
48
+ from datachain.lib.model_store import ModelStore
38
49
  from datachain.lib.settings import Settings
39
- from datachain.lib.signal_schema import SignalSchema
50
+ from datachain.lib.signal_schema import SignalResolvingError, SignalSchema
40
51
  from datachain.lib.udf import Aggregator, BatchMapper, Generator, Mapper, UDFBase
41
52
  from datachain.lib.udf_signature import UdfSignature
42
53
  from datachain.lib.utils import DataChainColumnError, DataChainParamsError
54
+ from datachain.project import Project
43
55
  from datachain.query import Session
44
- from datachain.query.dataset import DatasetQuery, PartitionByType
45
- from datachain.query.schema import DEFAULT_DELIMITER, Column, ColumnMeta
56
+ from datachain.query.dataset import (
57
+ DatasetQuery,
58
+ PartitionByType,
59
+ RegenerateSystemColumns,
60
+ UnionSchemaMismatchError,
61
+ )
62
+ from datachain.query.schema import DEFAULT_DELIMITER, Column
46
63
  from datachain.sql.functions import path as pathfunc
47
- from datachain.utils import batched_it, inside_notebook, row_to_nested_dict
64
+ from datachain.utils import batched_it, env2bool, inside_notebook, row_to_nested_dict
48
65
 
66
+ from .database import DEFAULT_DATABASE_BATCH_SIZE
49
67
  from .utils import (
50
68
  DatasetMergeError,
51
69
  DatasetPrepareError,
@@ -54,9 +72,12 @@ from .utils import (
54
72
  Sys,
55
73
  _get_merge_error_str,
56
74
  _validate_merge_on,
75
+ is_studio,
57
76
  resolve_columns,
58
77
  )
59
78
 
79
+ logger = logging.getLogger("datachain")
80
+
60
81
  C = Column
61
82
 
62
83
  _T = TypeVar("_T")
@@ -65,11 +86,27 @@ UDFObjT = TypeVar("UDFObjT", bound=UDFBase)
65
86
  DEFAULT_PARQUET_CHUNK_SIZE = 100_000
66
87
 
67
88
  if TYPE_CHECKING:
89
+ import sqlite3
90
+
68
91
  import pandas as pd
92
+ from sqlalchemy.orm import Session as OrmSession
69
93
  from typing_extensions import ParamSpec, Self
70
94
 
71
95
  P = ParamSpec("P")
72
96
 
97
+ ConnectionType = (
98
+ str
99
+ | sqlalchemy.engine.URL
100
+ | sqlalchemy.engine.interfaces.Connectable
101
+ | sqlalchemy.engine.Engine
102
+ | sqlalchemy.engine.Connection
103
+ | OrmSession
104
+ | sqlite3.Connection
105
+ )
106
+
107
+
108
+ T = TypeVar("T", bound="DataChain")
109
+
73
110
 
74
111
  class DataChain:
75
112
  """DataChain - a data structure for batch data processing and evaluation.
@@ -133,7 +170,7 @@ class DataChain:
133
170
  .choices[0]
134
171
  .message.content,
135
172
  )
136
- .save()
173
+ .persist()
137
174
  )
138
175
 
139
176
  try:
@@ -154,7 +191,7 @@ class DataChain:
154
191
  query: DatasetQuery,
155
192
  settings: Settings,
156
193
  signal_schema: SignalSchema,
157
- setup: Optional[dict] = None,
194
+ setup: dict | None = None,
158
195
  _sys: bool = False,
159
196
  ) -> None:
160
197
  """Don't instantiate this directly, use one of the from_XXX constructors."""
@@ -163,6 +200,12 @@ class DataChain:
163
200
  self.signals_schema = signal_schema
164
201
  self._setup: dict = setup or {}
165
202
  self._sys = _sys
203
+ self._delta = False
204
+ self._delta_unsafe = False
205
+ self._delta_on: str | Sequence[str] | None = None
206
+ self._delta_result_on: str | Sequence[str] | None = None
207
+ self._delta_compare: str | Sequence[str] | None = None
208
+ self._delta_retry: bool | str | None = None
166
209
 
167
210
  def __repr__(self) -> str:
168
211
  """Return a string representation of the chain."""
@@ -176,6 +219,48 @@ class DataChain:
176
219
  self.print_schema(file=file)
177
220
  return file.getvalue()
178
221
 
222
+ def hash(self) -> str:
223
+ """
224
+ Calculates SHA hash of this chain. Hash calculation is fast and consistent.
225
+ It takes into account all the steps added to the chain and their inputs.
226
+ Order of the steps is important.
227
+ """
228
+ return self._query.hash()
229
+
230
+ def _as_delta(
231
+ self,
232
+ on: str | Sequence[str] | None = None,
233
+ right_on: str | Sequence[str] | None = None,
234
+ compare: str | Sequence[str] | None = None,
235
+ delta_retry: bool | str | None = None,
236
+ delta_unsafe: bool = False,
237
+ ) -> "Self":
238
+ """Marks this chain as delta, which means special delta process will be
239
+ called on saving dataset for optimization"""
240
+ if on is None:
241
+ raise ValueError("'delta on' fields must be defined")
242
+ self._delta = True
243
+ self._delta_on = on
244
+ self._delta_result_on = right_on
245
+ self._delta_compare = compare
246
+ self._delta_retry = delta_retry
247
+ self._delta_unsafe = delta_unsafe
248
+ return self
249
+
250
+ @property
251
+ def empty(self) -> bool:
252
+ """Returns True if chain has zero number of rows"""
253
+ return not bool(self.count())
254
+
255
+ @property
256
+ def delta(self) -> bool:
257
+ """Returns True if this chain is ran in "delta" update mode"""
258
+ return self._delta
259
+
260
+ @property
261
+ def delta_unsafe(self) -> bool:
262
+ return self._delta_unsafe
263
+
179
264
  @property
180
265
  def schema(self) -> dict[str, DataType]:
181
266
  """Get schema of the chain."""
@@ -197,7 +282,7 @@ class DataChain:
197
282
 
198
283
  raise ValueError(f"Column with name {name} not found in the schema")
199
284
 
200
- def c(self, column: Union[str, Column]) -> Column:
285
+ def c(self, column: str | Column) -> Column:
201
286
  """Returns Column instance attached to the current chain."""
202
287
  c = self.column(column) if isinstance(column, str) else self.column(column.name)
203
288
  c.table = self._query.table
@@ -209,27 +294,31 @@ class DataChain:
209
294
  return self._query.session
210
295
 
211
296
  @property
212
- def name(self) -> Optional[str]:
297
+ def name(self) -> str | None:
213
298
  """Name of the underlying dataset, if there is one."""
214
299
  return self._query.name
215
300
 
216
301
  @property
217
- def version(self) -> Optional[int]:
302
+ def version(self) -> str | None:
218
303
  """Version of the underlying dataset, if there is one."""
219
304
  return self._query.version
220
305
 
221
306
  @property
222
- def dataset(self) -> Optional[DatasetRecord]:
307
+ def dataset(self) -> DatasetRecord | None:
223
308
  """Underlying dataset, if there is one."""
224
309
  if not self.name:
225
310
  return None
226
- return self.session.catalog.get_dataset(self.name)
311
+ return self.session.catalog.get_dataset(
312
+ self.name,
313
+ namespace_name=self._query.project.namespace.name,
314
+ project_name=self._query.project.name,
315
+ )
227
316
 
228
317
  def __or__(self, other: "Self") -> "Self":
229
318
  """Return `self.union(other)`."""
230
319
  return self.union(other)
231
320
 
232
- def print_schema(self, file: Optional[IO] = None) -> None:
321
+ def print_schema(self, file: IO | None = None) -> None:
233
322
  """Print schema of the chain."""
234
323
  self._effective_signals_schema.print_tree(file=file)
235
324
 
@@ -240,8 +329,8 @@ class DataChain:
240
329
  def _evolve(
241
330
  self,
242
331
  *,
243
- query: Optional[DatasetQuery] = None,
244
- settings: Optional[Settings] = None,
332
+ query: DatasetQuery | None = None,
333
+ settings: Settings | None = None,
245
334
  signal_schema=None,
246
335
  _sys=None,
247
336
  ) -> "Self":
@@ -253,39 +342,60 @@ class DataChain:
253
342
  signal_schema = copy.deepcopy(self.signals_schema)
254
343
  if _sys is None:
255
344
  _sys = self._sys
256
- return type(self)(
345
+ chain = type(self)(
257
346
  query, settings, signal_schema=signal_schema, setup=self._setup, _sys=_sys
258
347
  )
348
+ if self.delta:
349
+ chain = chain._as_delta(
350
+ on=self._delta_on,
351
+ right_on=self._delta_result_on,
352
+ compare=self._delta_compare,
353
+ delta_retry=self._delta_retry,
354
+ delta_unsafe=self._delta_unsafe,
355
+ )
356
+
357
+ return chain
259
358
 
260
359
  def settings(
261
360
  self,
262
- cache=None,
263
- parallel=None,
264
- workers=None,
265
- min_task_size=None,
266
- prefetch: Optional[int] = None,
267
- sys: Optional[bool] = None,
361
+ cache: bool | None = None,
362
+ prefetch: bool | int | None = None,
363
+ parallel: bool | int | None = None,
364
+ workers: int | None = None,
365
+ namespace: str | None = None,
366
+ project: str | None = None,
367
+ min_task_size: int | None = None,
368
+ batch_size: int | None = None,
369
+ sys: bool | None = None,
268
370
  ) -> "Self":
269
- """Change settings for chain.
270
-
271
- This function changes specified settings without changing not specified ones.
272
- It returns chain, so, it can be chained later with next operation.
371
+ """
372
+ Set chain execution parameters. Returns the chain itself, allowing method
373
+ chaining for subsequent operations. To restore all settings to their default
374
+ values, use `reset_settings()`.
273
375
 
274
376
  Parameters:
275
- cache : data caching (default=False)
276
- parallel : number of thread for processors. True is a special value to
277
- enable all available CPUs (default=1)
278
- workers : number of distributed workers. Only for Studio mode. (default=1)
279
- min_task_size : minimum number of tasks (default=1)
280
- prefetch: number of workers to use for downloading files in advance.
281
- This is enabled by default and uses 2 workers.
282
- To disable prefetching, set it to 0.
377
+ cache: Enable files caching to speed up subsequent accesses to the same
378
+ files from the same or different chains. Defaults to False.
379
+ prefetch: Enable prefetching of files. This will download files in
380
+ advance in parallel. If an integer is provided, it specifies the number
381
+ of files to prefetch concurrently for each process on each worker.
382
+ Defaults to 2. Set to 0 or False to disable prefetching.
383
+ parallel: Number of processes to use for processing user-defined functions
384
+ (UDFs) in parallel. If an integer is provided, it specifies the number
385
+ of CPUs to use. If True, all available CPUs are used. Defaults to 1.
386
+ namespace: Namespace to use for the chain by default.
387
+ project: Project to use for the chain by default.
388
+ min_task_size: Minimum number of rows per worker/process for parallel
389
+ processing by UDFs. Defaults to 1.
390
+ batch_size: Number of rows per insert by UDF to fine tune and balance speed
391
+ and memory usage. This might be useful when processing large rows
392
+ or when running into memory issues. Defaults to 2000.
283
393
 
284
394
  Example:
285
395
  ```py
286
396
  chain = (
287
397
  chain
288
- .settings(cache=True, parallel=8)
398
+ .settings(cache=True, parallel=8, batch_size=300)
289
399
  .map(laion=process_webdataset(spec=WDSLaion), params="file")
290
400
  )
291
401
  ```
@@ -293,22 +403,25 @@ class DataChain:
293
403
  if sys is None:
294
404
  sys = self._sys
295
405
  settings = copy.copy(self._settings)
296
- settings.add(Settings(cache, parallel, workers, min_task_size, prefetch))
406
+ settings.add(
407
+ Settings(
408
+ cache=cache,
409
+ prefetch=prefetch,
410
+ parallel=parallel,
411
+ workers=workers,
412
+ namespace=namespace,
413
+ project=project,
414
+ min_task_size=min_task_size,
415
+ batch_size=batch_size,
416
+ )
417
+ )
297
418
  return self._evolve(settings=settings, _sys=sys)
298
419
 
299
- def reset_settings(self, settings: Optional[Settings] = None) -> "Self":
300
- """Reset all settings to default values."""
420
+ def reset_settings(self, settings: Settings | None = None) -> "Self":
421
+ """Reset all chain settings to default values."""
301
422
  self._settings = settings if settings else Settings()
302
423
  return self
303
424
 
304
- def reset_schema(self, signals_schema: SignalSchema) -> "Self":
305
- self.signals_schema = signals_schema
306
- return self
307
-
308
- def add_schema(self, signals_schema: SignalSchema) -> "Self":
309
- self.signals_schema |= signals_schema
310
- return self
311
-
312
425
  @classmethod
313
426
  def from_storage(
314
427
  cls,
@@ -356,8 +469,8 @@ class DataChain:
356
469
  def explode(
357
470
  self,
358
471
  col: str,
359
- model_name: Optional[str] = None,
360
- object_name: Optional[str] = None,
472
+ model_name: str | None = None,
473
+ column: str | None = None,
361
474
  schema_sample_size: int = 1,
362
475
  ) -> "DataChain":
363
476
  """Explodes a column containing JSON objects (dict or str DataChain type) into
@@ -368,7 +481,7 @@ class DataChain:
368
481
  col: the name of the column containing JSON to be exploded.
369
482
  model_name: optional generated model name. By default generates the name
370
483
  automatically.
371
- object_name: optional generated object column name. By default generates the
484
+ column: optional generated column name. By default generates the
372
485
  name automatically.
373
486
  schema_sample_size: the number of rows to use for inferring the schema of
374
487
  the JSON (in case some fields are optional and it's not enough to
@@ -377,16 +490,14 @@ class DataChain:
377
490
  Returns:
378
491
  DataChain: A new DataChain instance with the new set of columns.
379
492
  """
380
- import json
381
-
382
493
  import pyarrow as pa
383
494
 
384
495
  from datachain.lib.arrow import schema_to_output
385
496
 
386
- json_values = list(self.limit(schema_sample_size).collect(col))
497
+ json_values = self.limit(schema_sample_size).to_list(col)
387
498
  json_dicts = [
388
499
  json.loads(json_value) if isinstance(json_value, str) else json_value
389
- for json_value in json_values
500
+ for (json_value,) in json_values
390
501
  ]
391
502
 
392
503
  if any(not isinstance(json_dict, dict) for json_dict in json_dicts):
@@ -400,16 +511,16 @@ class DataChain:
400
511
 
401
512
  model = dict_to_data_model(model_name, output, original_names)
402
513
 
403
- def json_to_model(json_value: Union[str, dict]):
514
+ def json_to_model(json_value: str | dict):
404
515
  json_dict = (
405
516
  json.loads(json_value) if isinstance(json_value, str) else json_value
406
517
  )
407
518
  return model.model_validate(json_dict)
408
519
 
409
- if not object_name:
410
- object_name = f"{col}_expl"
520
+ if not column:
521
+ column = f"{col}_expl"
411
522
 
412
- return self.map(json_to_model, params=col, output={object_name: model})
523
+ return self.map(json_to_model, params=col, output={column: model})
413
524
 
414
525
  @classmethod
415
526
  def datasets(
@@ -443,35 +554,290 @@ class DataChain:
443
554
  )
444
555
  return listings(*args, **kwargs)
445
556
 
557
+ @property
558
+ def namespace_name(self) -> str:
559
+ """Current namespace name in which the chain is running"""
560
+ return (
561
+ self._settings.namespace
562
+ or self.session.catalog.metastore.default_namespace_name
563
+ )
564
+
565
+ @property
566
+ def project_name(self) -> str:
567
+ """Current project name in which the chain is running"""
568
+ return (
569
+ self._settings.project
570
+ or self.session.catalog.metastore.default_project_name
571
+ )
572
+
573
+ def persist(self) -> "Self":
574
+ """Saves temporary chain that will be removed after the process ends.
575
+ Temporary datasets are useful for optimization, for example when we have
576
+ multiple chains starting with identical sub-chain. We can then persist that
577
+ common chain and use it to calculate other chains, to avoid re-calculation
578
+ every time.
579
+ It returns the chain itself.
580
+ """
581
+ schema = self.signals_schema.clone_without_sys_signals().serialize()
582
+ project = self.session.catalog.metastore.get_project(
583
+ self.project_name,
584
+ self.namespace_name,
585
+ create=True,
586
+ )
587
+ return self._evolve(
588
+ query=self._query.save(project=project, feature_schema=schema),
589
+ signal_schema=self.signals_schema | SignalSchema({"sys": Sys}),
590
+ )
591
+
592
+ def _calculate_job_hash(self, job_id: str) -> str:
593
+ """
594
+ Calculates hash of the job at the place of this chain's save method.
595
+ Hash is calculated using previous job checkpoint hash (if exists) and
596
+ adding hash of this chain to produce new hash.
597
+ """
598
+ last_checkpoint = self.session.catalog.metastore.get_last_checkpoint(job_id)
599
+
600
+ return hashlib.sha256(
601
+ (bytes.fromhex(last_checkpoint.hash) if last_checkpoint else b"")
602
+ + bytes.fromhex(self.hash())
603
+ ).hexdigest()
604
+
446
605
  def save( # type: ignore[override]
447
606
  self,
448
- name: Optional[str] = None,
449
- version: Optional[int] = None,
450
- description: Optional[str] = None,
451
- labels: Optional[list[str]] = None,
607
+ name: str,
608
+ version: str | None = None,
609
+ description: str | None = None,
610
+ attrs: list[str] | None = None,
611
+ update_version: str | None = "patch",
452
612
  **kwargs,
453
- ) -> "Self":
613
+ ) -> "DataChain":
454
614
  """Save to a Dataset. It returns the chain itself.
455
615
 
456
616
  Parameters:
457
- name : dataset name. Empty name saves to a temporary dataset that will be
458
- removed after process ends. Temp dataset are useful for optimization.
459
- version : version of a dataset. Default - the last version that exist.
460
- description : description of a dataset.
461
- labels : labels of a dataset.
617
+ name: dataset name. This can be either a fully qualified name, including
618
+ the namespace and project, or just a regular dataset name. In the latter
619
+ case, the namespace and project will be taken from the settings
620
+ (if specified) or from the default values otherwise.
621
+ version: version of a dataset. If version is not specified and dataset
622
+ already exists, version patch increment will happen e.g 1.2.1 -> 1.2.2.
623
+ description: description of a dataset.
624
+ attrs: attributes of a dataset. They can be without value, e.g "NLP",
625
+ or with a value, e.g "location=US".
626
+ update_version: which part of the dataset version to automatically increase.
627
+ Available values: `major`, `minor` or `patch`. Default is `patch`.
462
628
  """
629
+
630
+ catalog = self.session.catalog
631
+
632
+ result = None # result chain that will be returned at the end
633
+
634
+ # Version validation
635
+ self._validate_version(version)
636
+ self._validate_update_version(update_version)
637
+
638
+ # get existing job if running in SaaS, or creating new one if running locally
639
+ job = self.session.get_or_create_job()
640
+
641
+ namespace_name, project_name, name = catalog.get_full_dataset_name(
642
+ name,
643
+ namespace_name=self._settings.namespace,
644
+ project_name=self._settings.project,
645
+ )
646
+ project = self._get_or_create_project(namespace_name, project_name)
647
+
648
+ # Checkpoint handling
649
+ _hash, result = self._resolve_checkpoint(name, project, job, kwargs)
650
+ if bool(result):
651
+ # Checkpoint was found and reused
652
+ print(f"Checkpoint found for dataset '{name}', skipping creation")
653
+
654
+ # Schema preparation
463
655
  schema = self.signals_schema.clone_without_sys_signals().serialize()
464
- return self._evolve(
465
- query=self._query.save(
466
- name=name,
467
- version=version,
468
- description=description,
469
- labels=labels,
470
- feature_schema=schema,
656
+
657
+ # Handle retry and delta functionality
658
+ if not result:
659
+ result = self._handle_delta(name, version, project, schema, kwargs)
660
+
661
+ if not result:
662
+ # calculate chain if we already don't have result from checkpoint or delta
663
+ result = self._evolve(
664
+ query=self._query.save(
665
+ name=name,
666
+ version=version,
667
+ project=project,
668
+ description=description,
669
+ attrs=attrs,
670
+ feature_schema=schema,
671
+ update_version=update_version,
672
+ **kwargs,
673
+ )
674
+ )
675
+
676
+ catalog.metastore.create_checkpoint(job.id, _hash) # type: ignore[arg-type]
677
+ return result
678
+
679
+ def _validate_version(self, version: str | None) -> None:
680
+ """Validate dataset version if provided."""
681
+ if version is not None:
682
+ semver.validate(version)
683
+
684
+ def _validate_update_version(self, update_version: str | None) -> None:
685
+ """Ensure update_version is one of: major, minor, patch."""
686
+ allowed = ["major", "minor", "patch"]
687
+ if update_version not in allowed:
688
+ raise ValueError(f"update_version must be one of {allowed}")
689
+
690
+ def _get_or_create_project(self, namespace: str, project_name: str) -> Project:
691
+ """Get project or raise if creation not allowed."""
692
+ try:
693
+ return self.session.catalog.metastore.get_project(
694
+ project_name,
695
+ namespace,
696
+ create=is_studio(),
697
+ )
698
+ except ProjectNotFoundError as e:
699
+ raise ProjectCreateNotAllowedError("Creating project is not allowed") from e
700
+
701
+ def _resolve_checkpoint(
702
+ self,
703
+ name: str,
704
+ project: Project,
705
+ job: Job,
706
+ kwargs: dict,
707
+ ) -> tuple[str, "DataChain | None"]:
708
+ """Check if checkpoint exists and return cached dataset if possible."""
709
+ from .datasets import read_dataset
710
+
711
+ metastore = self.session.catalog.metastore
712
+ checkpoints_reset = env2bool("DATACHAIN_CHECKPOINTS_RESET", undefined=True)
713
+
714
+ _hash = self._calculate_job_hash(job.id)
715
+
716
+ if (
717
+ job.parent_job_id
718
+ and not checkpoints_reset
719
+ and metastore.find_checkpoint(job.parent_job_id, _hash)
720
+ ):
721
+ # checkpoint found → find which dataset version to reuse
722
+
723
+ # Find dataset version that was created by any ancestor job
724
+ try:
725
+ dataset_version = metastore.get_dataset_version_for_job_ancestry(
726
+ name,
727
+ project.namespace.name,
728
+ project.name,
729
+ job.id,
730
+ )
731
+ except JobAncestryDepthExceededError:
732
+ raise JobAncestryDepthExceededError(
733
+ "Job continuation chain is too deep. "
734
+ "Please run the job from scratch without continuing from a "
735
+ "parent job."
736
+ ) from None
737
+
738
+ if not dataset_version:
739
+ logger.debug(
740
+ "Checkpoint found but no dataset version for '%s' "
741
+ "in job ancestry (job_id=%s). Creating new version.",
742
+ name,
743
+ job.id,
744
+ )
745
+ # Dataset version not found (e.g deleted by user) - skip
746
+ # checkpoint and recreate
747
+ return _hash, None
748
+
749
+ logger.debug(
750
+ "Reusing dataset version '%s' v%s from job ancestry "
751
+ "(job_id=%s, dataset_version_id=%s)",
752
+ name,
753
+ dataset_version.version,
754
+ job.id,
755
+ dataset_version.id,
756
+ )
757
+
758
+ # Read the specific version from ancestry
759
+ chain = read_dataset(
760
+ name,
761
+ namespace=project.namespace.name,
762
+ project=project.name,
763
+ version=dataset_version.version,
471
764
  **kwargs,
472
765
  )
766
+
767
+ # Link current job to this dataset version (not creator).
768
+ # This also updates dataset_version.job_id.
769
+ metastore.link_dataset_version_to_job(
770
+ dataset_version.id,
771
+ job.id,
772
+ is_creator=False,
773
+ )
774
+
775
+ return _hash, chain
776
+
777
+ return _hash, None
778
+
779
+ def _handle_delta(
780
+ self,
781
+ name: str,
782
+ version: str | None,
783
+ project: Project,
784
+ schema: dict,
785
+ kwargs: dict,
786
+ ) -> "DataChain | None":
787
+ """Try to save as a delta dataset.
788
+ Returns:
789
+ A DataChain if delta logic could handle it, otherwise None to fall back
790
+ to the regular save path (e.g., on first dataset creation).
791
+ """
792
+ from datachain.delta import delta_retry_update
793
+
794
+ from .datasets import read_dataset
795
+
796
+ if not self.delta or not name:
797
+ return None
798
+
799
+ assert self._delta_on is not None, "Delta chain must have delta_on defined"
800
+
801
+ result_ds, dependencies, has_changes = delta_retry_update(
802
+ self,
803
+ project.namespace.name,
804
+ project.name,
805
+ name,
806
+ on=self._delta_on,
807
+ right_on=self._delta_result_on,
808
+ compare=self._delta_compare,
809
+ delta_retry=self._delta_retry,
473
810
  )
474
811
 
812
+ # Case 1: delta produced a new dataset
813
+ if result_ds:
814
+ return self._evolve(
815
+ query=result_ds._query.save(
816
+ name=name,
817
+ version=version,
818
+ project=project,
819
+ feature_schema=schema,
820
+ dependencies=dependencies,
821
+ **kwargs,
822
+ )
823
+ )
824
+
825
+ # Case 2: no changes → reuse last version
826
+ if not has_changes:
827
+ # sources have not been changed so new version of resulting dataset
828
+ # would be the same as previous one. To avoid duplicating exact
829
+ # datasets, we won't create new version of it and we will return
830
+ # current latest version instead.
831
+ return read_dataset(
832
+ name,
833
+ namespace=project.namespace.name,
834
+ project=project.name,
835
+ **kwargs,
836
+ )
837
+
838
+ # Case 3: first creation of dataset
839
+ return None
840
+
475
841
  def apply(self, func, *args, **kwargs):
476
842
  """Apply any function to the chain.
477
843
 
@@ -497,10 +863,10 @@ class DataChain:
497
863
 
498
864
  def map(
499
865
  self,
500
- func: Optional[Callable] = None,
501
- params: Union[None, str, Sequence[str]] = None,
866
+ func: Callable | None = None,
867
+ params: str | Sequence[str] | None = None,
502
868
  output: OutputType = None,
503
- **signal_map,
869
+ **signal_map: Any,
504
870
  ) -> "Self":
505
871
  """Apply a function to each row to create new signals. The function should
506
872
  return a new object for each row. It returns a chain itself with new signals.
@@ -508,17 +874,17 @@ class DataChain:
508
874
  Input-output relationship: 1:1
509
875
 
510
876
  Parameters:
511
- func : Function applied to each row.
512
- params : List of column names used as input for the function. Default
877
+ func: Function applied to each row.
878
+ params: List of column names used as input for the function. Default
513
879
  is taken from function signature.
514
- output : Dictionary defining new signals and their corresponding types.
880
+ output: Dictionary defining new signals and their corresponding types.
515
881
  Default type is taken from function signature. Default can be also
516
882
  taken from kwargs - **signal_map (see below).
517
883
  If signal name is defined using signal_map (see below) only a single
518
884
  type value can be used.
519
- **signal_map : kwargs can be used to define `func` together with it's return
885
+ **signal_map: kwargs can be used to define `func` together with its return
520
886
  signal name in format of `map(my_sign=my_func)`. This helps define
521
- signal names and function in a nicer way.
887
+ signal names and functions in a nicer way.
522
888
 
523
889
  Example:
524
890
  Using signal_map and single type in output:
@@ -539,18 +905,19 @@ class DataChain:
539
905
  if (prefetch := self._settings.prefetch) is not None:
540
906
  udf_obj.prefetch = prefetch
541
907
 
908
+ sys_schema = SignalSchema({"sys": Sys})
542
909
  return self._evolve(
543
910
  query=self._query.add_signals(
544
- udf_obj.to_udf_wrapper(),
911
+ udf_obj.to_udf_wrapper(self._settings.batch_size),
545
912
  **self._settings.to_dict(),
546
913
  ),
547
- signal_schema=self.signals_schema | udf_obj.output,
914
+ signal_schema=sys_schema | self.signals_schema | udf_obj.output,
548
915
  )
549
916
 
550
917
  def gen(
551
918
  self,
552
- func: Optional[Union[Callable, Generator]] = None,
553
- params: Union[None, str, Sequence[str]] = None,
919
+ func: Callable | Generator | None = None,
920
+ params: str | Sequence[str] | None = None,
554
921
  output: OutputType = None,
555
922
  **signal_map,
556
923
  ) -> "Self":
@@ -579,19 +946,21 @@ class DataChain:
579
946
  udf_obj.prefetch = prefetch
580
947
  return self._evolve(
581
948
  query=self._query.generate(
582
- udf_obj.to_udf_wrapper(),
949
+ udf_obj.to_udf_wrapper(self._settings.batch_size),
583
950
  **self._settings.to_dict(),
584
951
  ),
585
- signal_schema=udf_obj.output,
952
+ signal_schema=SignalSchema({"sys": Sys}) | udf_obj.output,
586
953
  )
587
954
 
955
+ @delta_disabled
588
956
  def agg(
589
957
  self,
590
- func: Optional[Callable] = None,
591
- partition_by: Optional[PartitionByType] = None,
592
- params: Union[None, str, Sequence[str]] = None,
958
+ /,
959
+ func: Callable | None = None,
960
+ partition_by: PartitionByType | None = None,
961
+ params: str | Sequence[str] | None = None,
593
962
  output: OutputType = None,
594
- **signal_map,
963
+ **signal_map: Callable,
595
964
  ) -> "Self":
596
965
  """Aggregate rows using `partition_by` statement and apply a function to the
597
966
  groups of aggregated rows. The function needs to return new objects for each
@@ -601,12 +970,28 @@ class DataChain:
601
970
 
602
971
  This method bears similarity to `gen()` and `map()`, employing a comparable set
603
972
  of parameters, yet differs in two crucial aspects:
973
+
604
974
  1. The `partition_by` parameter: This specifies the column name or a list of
605
975
  column names that determine the grouping criteria for aggregation.
606
976
  2. Group-based UDF function input: Instead of individual rows, the function
607
- receives a list all rows within each group defined by `partition_by`.
977
+ receives a list of all rows within each group defined by `partition_by`.
978
+
979
+ If `partition_by` is not set or is an empty list, all rows will be placed
980
+ into a single group.
981
+
982
+ Parameters:
983
+ func: Function applied to each group of rows.
984
+ partition_by: Column name(s) to group by. If None, all rows go
985
+ into one group.
986
+ params: List of column names used as input for the function. Default is
987
+ taken from function signature.
988
+ output: Dictionary defining new signals and their corresponding types.
989
+ Default type is taken from function signature.
990
+ **signal_map: kwargs can be used to define `func` together with its return
991
+ signal name in format of `agg(result_column=my_func)`.
608
992
 
609
993
  Examples:
994
+ Basic aggregation with lambda function:
610
995
  ```py
611
996
  chain = chain.agg(
612
997
  total=lambda category, amount: [sum(amount)],
@@ -617,7 +1002,6 @@ class DataChain:
617
1002
  ```
618
1003
 
619
1004
  An alternative syntax, when you need to specify a more complex function:
620
-
621
1005
  ```py
622
1006
  # It automatically resolves which columns to pass to the function
623
1007
  # by looking at the function signature.
@@ -635,21 +1019,80 @@ class DataChain:
635
1019
  )
636
1020
  chain.save("new_dataset")
637
1021
  ```
1022
+
1023
+ Using complex signals for partitioning (`File` or any Pydantic `BaseModel`):
1024
+ ```py
1025
+ def my_agg(files: list[File]) -> Iterator[tuple[File, int]]:
1026
+ yield files[0], sum(f.size for f in files)
1027
+
1028
+ chain = chain.agg(
1029
+ my_agg,
1030
+ params=("file",),
1031
+ output={"file": File, "total": int},
1032
+ partition_by="file", # Column referring to all sub-columns of File
1033
+ )
1034
+ chain.save("new_dataset")
1035
+ ```
1036
+
1037
+ Aggregating all rows into a single group (when `partition_by` is not set):
1038
+ ```py
1039
+ chain = chain.agg(
1040
+ total_size=lambda file, size: [sum(size)],
1041
+ output=int,
1042
+ # No partition_by specified - all rows go into one group
1043
+ )
1044
+ chain.save("new_dataset")
1045
+ ```
1046
+
1047
+ Multiple partition columns:
1048
+ ```py
1049
+ chain = chain.agg(
1050
+ total=lambda category, subcategory, amount: [sum(amount)],
1051
+ output=float,
1052
+ partition_by=["category", "subcategory"],
1053
+ )
1054
+ chain.save("new_dataset")
1055
+ ```
638
1056
  """
1057
+ if partition_by is not None:
1058
+ # Convert string partition_by parameters to Column objects
1059
+ if isinstance(partition_by, (str, Function, ColumnElement)):
1060
+ list_partition_by = [partition_by]
1061
+ else:
1062
+ list_partition_by = list(partition_by)
1063
+
1064
+ processed_partition_columns: list[ColumnElement] = []
1065
+ for col in list_partition_by:
1066
+ if isinstance(col, str):
1067
+ columns = self.signals_schema.db_signals(name=col, as_columns=True)
1068
+ if not columns:
1069
+ raise SignalResolvingError([col], "is not found")
1070
+ processed_partition_columns.extend(cast("list[Column]", columns))
1071
+ elif isinstance(col, Function):
1072
+ column = col.get_column(self.signals_schema)
1073
+ processed_partition_columns.append(column)
1074
+ else:
1075
+ # Assume it's already a ColumnElement
1076
+ processed_partition_columns.append(col)
1077
+
1078
+ processed_partition_by = processed_partition_columns
1079
+ else:
1080
+ processed_partition_by = []
1081
+
639
1082
  udf_obj = self._udf_to_obj(Aggregator, func, params, output, signal_map)
640
1083
  return self._evolve(
641
1084
  query=self._query.generate(
642
- udf_obj.to_udf_wrapper(),
643
- partition_by=partition_by,
1085
+ udf_obj.to_udf_wrapper(self._settings.batch_size),
1086
+ partition_by=processed_partition_by,
644
1087
  **self._settings.to_dict(),
645
1088
  ),
646
- signal_schema=udf_obj.output,
1089
+ signal_schema=SignalSchema({"sys": Sys}) | udf_obj.output,
647
1090
  )
648
1091
 
649
1092
  def batch_map(
650
1093
  self,
651
- func: Optional[Callable] = None,
652
- params: Union[None, str, Sequence[str]] = None,
1094
+ func: Callable | None = None,
1095
+ params: str | Sequence[str] | None = None,
653
1096
  output: OutputType = None,
654
1097
  batch: int = 1000,
655
1098
  **signal_map,
@@ -661,7 +1104,7 @@ class DataChain:
661
1104
  It accepts the same parameters plus an
662
1105
  additional parameter:
663
1106
 
664
- batch : Size of each batch passed to `func`. Defaults to 1000.
1107
+ batch: Size of each batch passed to `func`. Defaults to 1000.
665
1108
 
666
1109
  Example:
667
1110
  ```py
@@ -671,11 +1114,24 @@ class DataChain:
671
1114
  )
672
1115
  chain.save("new_dataset")
673
1116
  ```
1117
+
1118
+ .. deprecated:: 0.29.0
1119
+ This method is deprecated and will be removed in a future version.
1120
+ Use `agg()` instead, which provides the similar functionality.
674
1121
  """
1122
+ import warnings
1123
+
1124
+ warnings.warn(
1125
+ "batch_map() is deprecated and will be removed in a future version. "
1126
+ "Use agg() instead, which provides the similar functionality.",
1127
+ DeprecationWarning,
1128
+ stacklevel=2,
1129
+ )
675
1130
  udf_obj = self._udf_to_obj(BatchMapper, func, params, output, signal_map)
1131
+
676
1132
  return self._evolve(
677
1133
  query=self._query.add_signals(
678
- udf_obj.to_udf_wrapper(batch),
1134
+ udf_obj.to_udf_wrapper(self._settings.batch_size, batch=batch),
679
1135
  **self._settings.to_dict(),
680
1136
  ),
681
1137
  signal_schema=self.signals_schema | udf_obj.output,
@@ -684,8 +1140,8 @@ class DataChain:
684
1140
  def _udf_to_obj(
685
1141
  self,
686
1142
  target_class: type[UDFObjT],
687
- func: Optional[Union[Callable, UDFObjT]],
688
- params: Union[None, str, Sequence[str]],
1143
+ func: Callable | UDFObjT | None,
1144
+ params: str | Sequence[str] | None,
689
1145
  output: OutputType,
690
1146
  signal_map: dict[str, Callable],
691
1147
  ) -> UDFObjT:
@@ -696,11 +1152,7 @@ class DataChain:
696
1152
  sign = UdfSignature.parse(name, signal_map, func, params, output, is_generator)
697
1153
  DataModel.register(list(sign.output_schema.values.values()))
698
1154
 
699
- signals_schema = self.signals_schema
700
- if self._sys:
701
- signals_schema = SignalSchema({"sys": Sys}) | signals_schema
702
-
703
- params_schema = signals_schema.slice(
1155
+ params_schema = self.signals_schema.slice(
704
1156
  sign.params, self._setup, is_batch=is_batch
705
1157
  )
706
1158
 
@@ -710,7 +1162,7 @@ class DataChain:
710
1162
  query_func = getattr(self._query, method_name)
711
1163
 
712
1164
  new_schema = self.signals_schema.resolve(*args)
713
- columns = [C(col) for col in new_schema.db_signals()]
1165
+ columns = new_schema.db_signals(as_columns=True)
714
1166
  return query_func(*columns, **kwargs)
715
1167
 
716
1168
  @resolve_columns
@@ -729,15 +1181,17 @@ class DataChain:
729
1181
  Order is not guaranteed when steps are added after an `order_by` statement.
730
1182
  I.e. when using `read_dataset` an `order_by` statement should be used if
731
1183
  the order of the records in the chain is important.
732
- Using `order_by` directly before `limit`, `collect` and `collect_flatten`
1184
+ Using `order_by` directly before `limit`, `to_list` and similar methods
733
1185
  will give expected results.
734
- See https://github.com/iterative/datachain/issues/477 for further details.
1186
+ See https://github.com/datachain-ai/datachain/issues/477
1187
+ for further details.
735
1188
  """
736
1189
  if descending:
737
1190
  args = tuple(sqlalchemy.desc(a) for a in args)
738
1191
 
739
1192
  return self._evolve(query=self._query.order_by(*args))
740
1193
 
1194
+ @delta_disabled
741
1195
  def distinct(self, arg: str, *args: str) -> "Self": # type: ignore[override]
742
1196
  """Removes duplicate rows based on uniqueness of some input column(s)
743
1197
  i.e if rows are found with the same value of input column(s), only one
@@ -745,7 +1199,7 @@ class DataChain:
745
1199
 
746
1200
  Example:
747
1201
  ```py
748
- dc.distinct("file.parent", "file.name")
1202
+ dc.distinct("file.path")
749
1203
  ```
750
1204
  """
751
1205
  return self._evolve(
@@ -754,11 +1208,9 @@ class DataChain:
754
1208
  )
755
1209
  )
756
1210
 
757
- def select(self, *args: str, _sys: bool = True) -> "Self":
1211
+ def select(self, *args: str) -> "Self":
758
1212
  """Select only a specified set of signals."""
759
1213
  new_schema = self.signals_schema.resolve(*args)
760
- if self._sys and _sys:
761
- new_schema = SignalSchema({"sys": Sys}) | new_schema
762
1214
  columns = new_schema.db_signals()
763
1215
  return self._evolve(
764
1216
  query=self._query.select(*columns), signal_schema=new_schema
@@ -772,10 +1224,11 @@ class DataChain:
772
1224
  query=self._query.select(*columns), signal_schema=new_schema
773
1225
  )
774
1226
 
775
- def group_by(
1227
+ @delta_disabled # type: ignore[arg-type]
1228
+ def group_by( # noqa: C901, PLR0912
776
1229
  self,
777
1230
  *,
778
- partition_by: Optional[Union[str, Func, Sequence[Union[str, Func]]]] = None,
1231
+ partition_by: str | Func | Sequence[str | Func] | None = None,
779
1232
  **kwargs: Func,
780
1233
  ) -> "Self":
781
1234
  """Group rows by specified set of signals and return new signals
@@ -791,6 +1244,15 @@ class DataChain:
791
1244
  partition_by=("file_source", "file_ext"),
792
1245
  )
793
1246
  ```
1247
+
1248
+ Using complex signals:
1249
+ ```py
1250
+ chain = chain.group_by(
1251
+ total_size=func.sum("file.size"),
1252
+ count=func.count(),
1253
+ partition_by="file", # Uses column name, expands to File's unique keys
1254
+ )
1255
+ ```
794
1256
  """
795
1257
  if partition_by is None:
796
1258
  partition_by = []
@@ -801,20 +1263,61 @@ class DataChain:
801
1263
  signal_columns: list[Column] = []
802
1264
  schema_fields: dict[str, DataType] = {}
803
1265
  keep_columns: list[str] = []
1266
+ partial_fields: list[str] = [] # Track specific fields for partial creation
1267
+ schema_partition_by: list[str] = []
804
1268
 
805
- # validate partition_by columns and add them to the schema
806
1269
  for col in partition_by:
807
1270
  if isinstance(col, str):
808
- col_db_name = ColumnMeta.to_db_name(col)
809
- col_type = self.signals_schema.get_column_type(col_db_name)
810
- column = Column(col_db_name, python_to_sql(col_type))
811
- if col not in keep_columns:
812
- keep_columns.append(col)
1271
+ columns = self.signals_schema.db_signals(name=col, as_columns=True)
1272
+ if not columns:
1273
+ raise SignalResolvingError([col], "is not found")
1274
+ partition_by_columns.extend(cast("list[Column]", columns))
1275
+
1276
+ # For nested field references (e.g., "nested.level1.name"),
1277
+ # we need to distinguish between:
1278
+ # 1. References to fields within a complex signal (create partials)
1279
+ # 2. Deep nested references that should be flattened
1280
+ if "." in col:
1281
+ # Split the column reference to analyze it
1282
+ parts = col.split(".")
1283
+ parent_signal = parts[0]
1284
+ parent_type = self.signals_schema.values.get(parent_signal)
1285
+
1286
+ if ModelStore.is_partial(parent_type):
1287
+ if parent_signal not in keep_columns:
1288
+ keep_columns.append(parent_signal)
1289
+ partial_fields.append(col)
1290
+ schema_partition_by.append(col)
1291
+ else:
1292
+ # BaseModel or other - add flattened columns directly
1293
+ for column in cast("list[Column]", columns):
1294
+ col_type = self.signals_schema.get_column_type(column.name)
1295
+ schema_fields[column.name] = col_type
1296
+ schema_partition_by.append(col)
1297
+ else:
1298
+ # simple signal - but we need to check if it's a complex signal
1299
+ # complex signal - only include the columns used for partitioning
1300
+ col_type = self.signals_schema.get_column_type(
1301
+ col, with_subtree=True
1302
+ )
1303
+ if isinstance(col_type, type) and issubclass(col_type, BaseModel):
1304
+ # Complex signal - add only the partitioning columns
1305
+ for column in cast("list[Column]", columns):
1306
+ col_type = self.signals_schema.get_column_type(column.name)
1307
+ schema_fields[column.name] = col_type
1308
+ schema_partition_by.append(col)
1309
+ # Simple signal - keep the entire signal
1310
+ else:
1311
+ if col not in keep_columns:
1312
+ keep_columns.append(col)
1313
+ schema_partition_by.append(col)
813
1314
  elif isinstance(col, Function):
814
1315
  column = col.get_column(self.signals_schema)
815
1316
  col_db_name = column.name
816
1317
  col_type = column.type.python_type
817
1318
  schema_fields[col_db_name] = col_type
1319
+ partition_by_columns.append(column)
1320
+ signal_columns.append(column)
818
1321
  else:
819
1322
  raise DataChainColumnError(
820
1323
  col,
@@ -823,9 +1326,7 @@ class DataChain:
823
1326
  " but expected str or Function"
824
1327
  ),
825
1328
  )
826
- partition_by_columns.append(column)
827
1329
 
828
- # validate signal columns and add them to the schema
829
1330
  if not kwargs:
830
1331
  raise ValueError("At least one column should be provided for group_by")
831
1332
  for col_name, func in kwargs.items():
@@ -838,9 +1339,9 @@ class DataChain:
838
1339
  signal_columns.append(column)
839
1340
  schema_fields[col_name] = func.get_result_type(self.signals_schema)
840
1341
 
841
- signal_schema = SignalSchema(schema_fields)
842
- if keep_columns:
843
- signal_schema |= self.signals_schema.to_partial(*keep_columns)
1342
+ signal_schema = self.signals_schema.group_by(
1343
+ schema_partition_by, signal_columns
1344
+ )
844
1345
 
845
1346
  return self._evolve(
846
1347
  query=self._query.group_by(signal_columns, partition_by_columns),
@@ -848,17 +1349,13 @@ class DataChain:
848
1349
  )
849
1350
 
850
1351
  def mutate(self, **kwargs) -> "Self":
851
- """Create new signals based on existing signals.
852
-
853
- This method cannot modify existing columns. If you need to modify an
854
- existing column, use a different name for the new column and then use
855
- `select()` to choose which columns to keep.
1352
+ """Create or modify signals based on existing signals.
856
1353
 
857
1354
  This method is vectorized and more efficient compared to map(), and it does not
858
1355
  extract or download any data from the internal database. However, it can only
859
1356
  utilize predefined built-in functions and their combinations.
860
1357
 
861
- The supported functions:
1358
+ Supported functions:
862
1359
  Numerical: +, -, *, /, rand(), avg(), count(), func(),
863
1360
  greatest(), least(), max(), min(), sum()
864
1361
  String: length(), split(), replace(), regexp_replace()
@@ -871,7 +1368,7 @@ class DataChain:
871
1368
  ```py
872
1369
  dc.mutate(
873
1370
  area=Column("image.height") * Column("image.width"),
874
- extension=file_ext(Column("file.name")),
1371
+ extension=file_ext(Column("file.path")),
875
1372
  dist=cosine_distance(embedding_text, embedding_image)
876
1373
  )
877
1374
  ```
@@ -885,13 +1382,20 @@ class DataChain:
885
1382
  ```
886
1383
 
887
1384
  This method can be also used to rename signals. If the Column("name") provided
888
- as value for the new signal - the old column will be dropped. Otherwise a new
889
- column is created.
1385
+ as value for the new signal - the old signal will be dropped. Otherwise a new
1386
+ signal is created. Exception, if the old signal is nested one (e.g.
1387
+ `C("file.path")`), it will be kept to keep the object intact.
890
1388
 
891
1389
  Example:
892
1390
  ```py
893
1391
  dc.mutate(
894
- newkey=Column("oldkey")
1392
+ newkey=Column("oldkey") # drops oldkey
1393
+ )
1394
+ ```
1395
+
1396
+ ```py
1397
+ dc.mutate(
1398
+ size=Column("file.size") # keeps `file.size`
895
1399
  )
896
1400
  ```
897
1401
  """
@@ -926,49 +1430,52 @@ class DataChain:
926
1430
  # adding new signal
927
1431
  mutated[name] = value
928
1432
 
1433
+ new_schema = schema.mutate(kwargs)
929
1434
  return self._evolve(
930
- query=self._query.mutate(**mutated), signal_schema=schema.mutate(kwargs)
1435
+ query=self._query.mutate(new_schema=new_schema, **mutated),
1436
+ signal_schema=new_schema,
931
1437
  )
932
1438
 
933
1439
  @property
934
1440
  def _effective_signals_schema(self) -> "SignalSchema":
935
- """Effective schema used for user-facing API like collect, to_pandas, etc."""
1441
+ """Effective schema used for user-facing API like to_list, to_pandas, etc."""
936
1442
  signals_schema = self.signals_schema
937
1443
  if not self._sys:
938
1444
  return signals_schema.clone_without_sys_signals()
939
1445
  return signals_schema
940
1446
 
941
1447
  @overload
942
- def collect_flatten(self) -> Iterator[tuple[Any, ...]]: ...
1448
+ def _leaf_values(self) -> Iterator[tuple[Any, ...]]: ...
943
1449
 
944
1450
  @overload
945
- def collect_flatten(self, *, include_hidden: bool) -> Iterator[tuple[Any, ...]]: ...
1451
+ def _leaf_values(self, *, include_hidden: bool) -> Iterator[tuple[Any, ...]]: ...
946
1452
 
947
1453
  @overload
948
- def collect_flatten(
1454
+ def _leaf_values(
949
1455
  self, *, row_factory: Callable[[list[str], tuple[Any, ...]], _T]
950
1456
  ) -> Iterator[_T]: ...
951
1457
 
952
1458
  @overload
953
- def collect_flatten(
1459
+ def _leaf_values(
954
1460
  self,
955
1461
  *,
956
1462
  row_factory: Callable[[list[str], tuple[Any, ...]], _T],
957
1463
  include_hidden: bool,
958
1464
  ) -> Iterator[_T]: ...
959
1465
 
960
- def collect_flatten(self, *, row_factory=None, include_hidden: bool = True):
1466
+ def _leaf_values(self, *, row_factory=None, include_hidden: bool = True):
961
1467
  """Yields flattened rows of values as a tuple.
962
1468
 
963
1469
  Args:
964
- row_factory : A callable to convert row to a custom format.
965
- It should accept two arguments: a list of column names and
966
- a tuple of row values.
1470
+ row_factory: A callable to convert row to a custom format.
1471
+ It should accept two arguments: a list of column names and
1472
+ a tuple of row values.
967
1473
  include_hidden: Whether to include hidden signals from the schema.
968
1474
  """
969
1475
  db_signals = self._effective_signals_schema.db_signals(
970
1476
  include_hidden=include_hidden
971
1477
  )
1478
+
972
1479
  with self._query.ordered_select(*db_signals).as_iterable() as rows:
973
1480
  if row_factory:
974
1481
  rows = (row_factory(db_signals, r) for r in rows) # type: ignore[assignment]
@@ -985,7 +1492,7 @@ class DataChain:
985
1492
  headers, _ = self._effective_signals_schema.get_headers_with_length()
986
1493
  column_names = [".".join(filter(None, header)) for header in headers]
987
1494
 
988
- results_iter = self.collect_flatten()
1495
+ results_iter = self._leaf_values()
989
1496
 
990
1497
  def column_chunks() -> Iterator[list[list[Any]]]:
991
1498
  for chunk_iter in batched_it(results_iter, chunk_size):
@@ -1018,55 +1525,51 @@ class DataChain:
1018
1525
 
1019
1526
  def results(self, *, row_factory=None, include_hidden=True):
1020
1527
  if row_factory is None:
1021
- return list(self.collect_flatten(include_hidden=include_hidden))
1528
+ return list(self._leaf_values(include_hidden=include_hidden))
1022
1529
  return list(
1023
- self.collect_flatten(row_factory=row_factory, include_hidden=include_hidden)
1530
+ self._leaf_values(row_factory=row_factory, include_hidden=include_hidden)
1024
1531
  )
1025
1532
 
1026
1533
  def to_records(self) -> list[dict[str, Any]]:
1027
1534
  """Convert every row to a dictionary."""
1028
1535
 
1029
1536
  def to_dict(cols: list[str], row: tuple[Any, ...]) -> dict[str, Any]:
1030
- return dict(zip(cols, row))
1537
+ return dict(zip(cols, row, strict=False))
1031
1538
 
1032
1539
  return self.results(row_factory=to_dict)
1033
1540
 
1034
- @overload
1035
- def collect(self) -> Iterator[tuple[DataValue, ...]]: ...
1036
-
1037
- @overload
1038
- def collect(self, col: str) -> Iterator[DataValue]: ...
1039
-
1040
- @overload
1041
- def collect(self, *cols: str) -> Iterator[tuple[DataValue, ...]]: ...
1042
-
1043
- def collect(self, *cols: str) -> Iterator[Union[DataValue, tuple[DataValue, ...]]]: # type: ignore[overload-overlap,misc]
1541
+ def to_iter(self, *cols: str) -> Iterator[tuple[DataValue, ...]]:
1044
1542
  """Yields rows of values, optionally limited to the specified columns.
1045
1543
 
1046
1544
  Args:
1047
1545
  *cols: Limit to the specified columns. By default, all columns are selected.
1048
1546
 
1049
1547
  Yields:
1050
- (DataType): Yields a single item if a column is selected.
1051
- (tuple[DataType, ...]): Yields a tuple of items if multiple columns are
1052
- selected.
1548
+ (tuple[DataType, ...]): Yields a tuple of items for each row.
1053
1549
 
1054
1550
  Example:
1055
1551
  Iterating over all rows:
1056
1552
  ```py
1057
- for row in dc.collect():
1553
+ for row in ds.to_iter():
1554
+ print(row)
1555
+ ```
1556
+
1557
+ DataChain is iterable and can be used in a for loop directly, which is
1558
+ equivalent to `ds.to_iter()`:
1559
+ ```py
1560
+ for row in ds:
1058
1561
  print(row)
1059
1562
  ```
1060
1563
 
1061
1564
  Iterating over all rows with selected columns:
1062
1565
  ```py
1063
- for name, size in dc.collect("file.name", "file.size"):
1566
+ for name, size in ds.to_iter("file.path", "file.size"):
1064
1567
  print(name, size)
1065
1568
  ```
1066
1569
 
1067
1570
  Iterating over a single column:
1068
1571
  ```py
1069
- for file in dc.collect("file.name"):
1572
+ for (file,) in ds.to_iter("file.path"):
1070
1573
  print(file)
1071
1574
  ```
1072
1575
  """
@@ -1078,7 +1581,31 @@ class DataChain:
1078
1581
  ret = signals_schema.row_to_features(
1079
1582
  row, catalog=chain.session.catalog, cache=chain._settings.cache
1080
1583
  )
1081
- yield ret[0] if len(cols) == 1 else tuple(ret)
1584
+ yield tuple(ret)
1585
+
1586
+ @overload
1587
+ def collect(self) -> Iterator[tuple[DataValue, ...]]: ...
1588
+
1589
+ @overload
1590
+ def collect(self, col: str) -> Iterator[DataValue]: ...
1591
+
1592
+ @overload
1593
+ def collect(self, *cols: str) -> Iterator[tuple[DataValue, ...]]: ...
1594
+
1595
+ def collect(self, *cols: str) -> Iterator[DataValue | tuple[DataValue, ...]]: # type: ignore[overload-overlap,misc]
1596
+ """
1597
+ Deprecated. Use `to_iter` method instead.
1598
+ """
1599
+ warnings.warn(
1600
+ "Method `collect` is deprecated. Use `to_iter` method instead.",
1601
+ DeprecationWarning,
1602
+ stacklevel=2,
1603
+ )
1604
+
1605
+ if len(cols) == 1:
1606
+ yield from [item[0] for item in self.to_iter(*cols)]
1607
+ else:
1608
+ yield from self.to_iter(*cols)
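Since `collect` is now only a deprecation shim over `to_iter`, a hedged migration sketch (the bucket URL and column name are illustrative):

```py
import datachain as dc

chain = dc.read_storage("s3://mybucket")

# Deprecated: chain.collect("file.path") yielded bare values for a single
# column and now emits a DeprecationWarning. to_iter() always yields tuples:
for (path,) in chain.to_iter("file.path"):
    print(path)

# For a flat list of one column, to_values() is the shortcut:
paths = chain.to_values("file.path")
```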
1082
1609
 
1083
1610
  def to_pytorch(
1084
1611
  self,
@@ -1112,7 +1639,7 @@ class DataChain:
1112
1639
  if self._query.attached:
1113
1640
  chain = self
1114
1641
  else:
1115
- chain = self.save()
1642
+ chain = self.persist()
1116
1643
  assert chain.name is not None # for mypy
1117
1644
  return PytorchDataset(
1118
1645
  chain.name,
@@ -1126,15 +1653,12 @@ class DataChain:
1126
1653
  remove_prefetched=remove_prefetched,
1127
1654
  )
1128
1655
 
1129
- def remove_file_signals(self) -> "Self":
1130
- schema = self.signals_schema.clone_without_file_signals()
1131
- return self.select(*schema.values.keys())
1132
-
1656
+ @delta_disabled
1133
1657
  def merge(
1134
1658
  self,
1135
1659
  right_ds: "DataChain",
1136
- on: Union[MergeColType, Sequence[MergeColType]],
1137
- right_on: Optional[Union[MergeColType, Sequence[MergeColType]]] = None,
1660
+ on: MergeColType | Sequence[MergeColType],
1661
+ right_on: MergeColType | Sequence[MergeColType] | None = None,
1138
1662
  inner=False,
1139
1663
  full=False,
1140
1664
  rname="right_",
@@ -1202,8 +1726,8 @@ class DataChain:
1202
1726
 
1203
1727
  def _resolve(
1204
1728
  ds: DataChain,
1205
- col: Union[str, Function, sqlalchemy.ColumnElement],
1206
- side: Union[str, None],
1729
+ col: str | Function | sqlalchemy.ColumnElement,
1730
+ side: str | None,
1207
1731
  ):
1208
1732
  try:
1209
1733
  if isinstance(col, Function):
@@ -1216,7 +1740,7 @@ class DataChain:
1216
1740
  ops = [
1217
1741
  _resolve(self, left, "left")
1218
1742
  == _resolve(right_ds, right, "right" if right_on else None)
1219
- for left, right in zip(on, right_on or on)
1743
+ for left, right in zip(on, right_on or on, strict=False)
1220
1744
  ]
1221
1745
 
1222
1746
  if errors:
@@ -1225,32 +1749,44 @@ class DataChain:
1225
1749
  )
1226
1750
 
1227
1751
  query = self._query.join(
1228
- right_ds._query, sqlalchemy.and_(*ops), inner, full, rname + "{name}"
1752
+ right_ds._query, sqlalchemy.and_(*ops), inner, full, rname
1229
1753
  )
1230
1754
  query.feature_schema = None
1231
1755
  ds = self._evolve(query=query)
1232
1756
 
1757
+ # Note: merge drops sys signals from both sides, make sure to not include it
1758
+ # in the resulting schema
1233
1759
  signals_schema = self.signals_schema.clone_without_sys_signals()
1234
1760
  right_signals_schema = right_ds.signals_schema.clone_without_sys_signals()
1235
- ds.signals_schema = SignalSchema({"sys": Sys}) | signals_schema.merge(
1236
- right_signals_schema, rname
1237
- )
1761
+
1762
+ ds.signals_schema = signals_schema.merge(right_signals_schema, rname)
1238
1763
 
1239
1764
  return ds
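A short sketch of the resulting `merge` behaviour, assuming two chains that share an `id` column (`read_csv`, the URLs and the column names are illustrative):

```py
import datachain as dc

meta = dc.read_csv("s3://mybucket/meta.csv")    # id, label, ...
preds = dc.read_csv("s3://mybucket/preds.csv")  # id, score, ...

joined = meta.merge(preds, on="id", rname="right_")
# Right-hand columns that clash with the left side get the `rname` prefix;
# per the note above, `sys` signals are dropped from the merged schema.
```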
1240
1765
 
1766
+ @delta_disabled
1241
1767
  def union(self, other: "Self") -> "Self":
1242
1768
  """Return the set union of the two datasets.
1243
1769
 
1244
1770
  Parameters:
1245
1771
  other: chain whose rows will be added to `self`.
1246
1772
  """
1773
+ self_schema = self.signals_schema
1774
+ other_schema = other.signals_schema
1775
+ missing_left, missing_right = self_schema.compare_signals(other_schema)
1776
+ if missing_left or missing_right:
1777
+ raise UnionSchemaMismatchError.from_column_sets(
1778
+ missing_left,
1779
+ missing_right,
1780
+ )
1781
+
1782
+ self.signals_schema = self_schema.clone_without_sys_signals()
1247
1783
  return self._evolve(query=self._query.union(other._query))
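With the added schema comparison, `union` now fails fast instead of silently misaligning columns. A sketch, assuming a `dc.read_values` helper for small in-memory chains (illustrative):

```py
import datachain as dc

a = dc.read_values(num=[1, 2, 3])
b = dc.read_values(num=[4, 5])
combined = a.union(b)  # matching signals on both sides: rows are appended

c = dc.read_values(other=["x"])
# a.union(c) would raise UnionSchemaMismatchError, since "num" is missing
# on one side and "other" on the other.
```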
1248
1784
 
1249
1785
  def subtract( # type: ignore[override]
1250
1786
  self,
1251
1787
  other: "DataChain",
1252
- on: Optional[Union[str, Sequence[str]]] = None,
1253
- right_on: Optional[Union[str, Sequence[str]]] = None,
1788
+ on: str | Sequence[str] | None = None,
1789
+ right_on: str | Sequence[str] | None = None,
1254
1790
  ) -> "Self":
1255
1791
  """Remove rows that appear in another chain.
1256
1792
 
@@ -1307,58 +1843,51 @@ class DataChain:
1307
1843
  zip(
1308
1844
  self.signals_schema.resolve(*on).db_signals(),
1309
1845
  other.signals_schema.resolve(*right_on).db_signals(),
1846
+ strict=False,
1310
1847
  ) # type: ignore[arg-type]
1311
1848
  )
1312
1849
  return self._evolve(query=self._query.subtract(other._query, signals)) # type: ignore[arg-type]
1313
1850
 
1314
- def compare(
1851
+ def diff(
1315
1852
  self,
1316
1853
  other: "DataChain",
1317
- on: Union[str, Sequence[str]],
1318
- right_on: Optional[Union[str, Sequence[str]]] = None,
1319
- compare: Optional[Union[str, Sequence[str]]] = None,
1320
- right_compare: Optional[Union[str, Sequence[str]]] = None,
1854
+ on: str | Sequence[str],
1855
+ right_on: str | Sequence[str] | None = None,
1856
+ compare: str | Sequence[str] | None = None,
1857
+ right_compare: str | Sequence[str] | None = None,
1321
1858
  added: bool = True,
1322
1859
  deleted: bool = True,
1323
1860
  modified: bool = True,
1324
1861
  same: bool = False,
1325
- status_col: Optional[str] = None,
1862
+ status_col: str | None = None,
1326
1863
  ) -> "DataChain":
1327
- """Comparing two chains by identifying rows that are added, deleted, modified
1328
- or same. Result is the new chain that has additional column with possible
1329
- values: `A`, `D`, `M`, `U` representing added, deleted, modified and same
1330
- rows respectively. Note that if only one "status" is asked, by setting proper
1331
- flags, this additional column is not created as it would have only one value
1332
- for all rows. Beside additional diff column, new chain has schema of the chain
1333
- on which method was called.
1864
+ """Calculate differences between two chains.
1865
+
1866
+ This method identifies records that are added, deleted, modified, or unchanged
1867
+ between two chains. When `status_col` is set, it adds a status column with
1868
+ values: A=added, D=deleted, M=modified, S=same.
1334
1869
 
1335
1870
  Parameters:
1336
- other: Chain to calculate diff from.
1337
- on: Column or list of columns to match on. If both chains have the
1338
- same columns then this column is enough for the match. Otherwise,
1339
- `right_on` parameter has to specify the columns for the other chain.
1340
- This value is used to find corresponding row in other dataset. If not
1341
- found there, row is considered as added (or removed if vice versa), and
1342
- if found then row can be either modified or same.
1343
- right_on: Optional column or list of columns
1344
- for the `other` to match.
1345
- compare: Column or list of columns to compare on. If both chains have
1346
- the same columns then this column is enough for the compare. Otherwise,
1347
- `right_compare` parameter has to specify the columns for the other
1348
- chain. This value is used to see if row is modified or same. If
1349
- not set, all columns will be used for comparison
1350
- right_compare: Optional column or list of columns
1351
- for the `other` to compare to.
1352
- added (bool): Whether to return added rows in resulting chain.
1353
- deleted (bool): Whether to return deleted rows in resulting chain.
1354
- modified (bool): Whether to return modified rows in resulting chain.
1355
- same (bool): Whether to return unchanged rows in resulting chain.
1356
- status_col (str): Name of the new column that is created in resulting chain
1357
- representing diff status.
1871
+ other: Chain to compare against.
1872
+ on: Column(s) to match records between chains.
1873
+ right_on: Column(s) in the other chain to match against. Defaults to `on`.
1874
+ compare: Column(s) to check for changes.
1875
+ If not specified, all columns are used.
1876
+ right_compare: Column(s) in the other chain to compare against.
1877
+ Defaults to values of `compare`.
1878
+ added (bool): Include records that exist in this chain but not in the other.
1879
+ deleted (bool): Include records that exist only in the other chain.
1880
+ modified (bool): Include records that exist in both
1881
+ but have different values.
1882
+ same (bool): Include records that are identical in both chains.
1883
+ status_col (str): Name for the status column showing differences.
1884
+
1885
+ Default behavior: shows added, deleted, and modified records, but excludes
1886
+ unchanged records (same=False); a status column is only added when `status_col` is set.
1358
1887
 
1359
1888
  Example:
1360
1889
  ```py
1361
- res = persons.compare(
1890
+ res = persons.diff(
1362
1891
  new_persons,
1363
1892
  on=["id"],
1364
1893
  right_on=["other_id"],
@@ -1387,42 +1916,40 @@ class DataChain:
1387
1916
  status_col=status_col,
1388
1917
  )
1389
1918
 
1390
- def diff(
1919
+ def file_diff(
1391
1920
  self,
1392
1921
  other: "DataChain",
1393
1922
  on: str = "file",
1394
- right_on: Optional[str] = None,
1923
+ right_on: str | None = None,
1395
1924
  added: bool = True,
1396
1925
  modified: bool = True,
1397
1926
  deleted: bool = False,
1398
1927
  same: bool = False,
1399
- status_col: Optional[str] = None,
1928
+ status_col: str | None = None,
1400
1929
  ) -> "DataChain":
1401
- """Similar to `.compare()`, which is more generic method to calculate difference
1402
- between two chains. Unlike `.compare()`, this method works only on those chains
1403
- that have `File` object, or it's derivatives, in it. File `source` and `path`
1404
- are used for matching, and file `version` and `etag` for comparing, while in
1405
- `.compare()` user needs to provide arbitrary columns for matching and comparing.
1930
+ """Calculate differences between two chains containing files.
1931
+
1932
+ This method is specifically designed for file chains. It uses file `source`
1933
+ and `path` to match files, and file `version` and `etag` to detect changes.
1406
1934
 
1407
1935
  Parameters:
1408
- other: Chain to calculate diff from.
1409
- on: File signal to match on. If both chains have the
1410
- same file signal then this column is enough for the match. Otherwise,
1411
- `right_on` parameter has to specify the file signal for the other chain.
1412
- This value is used to find corresponding row in other dataset. If not
1413
- found there, row is considered as added (or removed if vice versa), and
1414
- if found then row can be either modified or same.
1415
- right_on: Optional file signal for the `other` to match.
1416
- added (bool): Whether to return added rows in resulting chain.
1417
- deleted (bool): Whether to return deleted rows in resulting chain.
1418
- modified (bool): Whether to return modified rows in resulting chain.
1419
- same (bool): Whether to return unchanged rows in resulting chain.
1420
- status_col (str): Optional name of the new column that is created in
1421
- resulting chain representing diff status.
1936
+ other: Chain to compare against.
1937
+ on: File column name in this chain. Default is "file".
1938
+ right_on: File column name in the other chain. Defaults to `on`.
1939
+ added (bool): Include files that exist in this chain but not in the other.
1940
+ deleted (bool): Include files that exist only in the other chain.
1941
+ modified (bool): Include files that exist in both but have different
1942
+ versions/etags.
1943
+ same (bool): Include files that are identical in both chains.
1944
+ status_col (str): Name for the status column showing differences
1945
+ (A=added, D=deleted, M=modified, S=same).
1946
+
1947
+ Default behavior: includes only added and modified files (added=True,
1948
+ modified=True), which is useful for incremental processing.
1422
1949
 
1423
1950
  Example:
1424
1951
  ```py
1425
- diff = images.diff(
1952
+ diff = images.file_diff(
1426
1953
  new_images,
1427
1954
  on="file",
1428
1955
  right_on="other_file",
@@ -1447,7 +1974,7 @@ class DataChain:
1447
1974
  compare_cols = get_file_signals(on, compare_file_signals)
1448
1975
  right_compare_cols = get_file_signals(right_on, compare_file_signals)
1449
1976
 
1450
- return self.compare(
1977
+ return self.diff(
1451
1978
  other,
1452
1979
  on_cols,
1453
1980
  right_on=right_on_cols,
@@ -1492,47 +2019,67 @@ class DataChain:
1492
2019
  )
1493
2020
  return read_pandas(*args, **kwargs)
1494
2021
 
1495
- def to_pandas(self, flatten=False, include_hidden=True) -> "pd.DataFrame":
2022
+ def to_pandas(
2023
+ self,
2024
+ flatten: bool = False,
2025
+ include_hidden: bool = True,
2026
+ as_object: bool = False,
2027
+ ) -> "pd.DataFrame":
1496
2028
  """Return a pandas DataFrame from the chain.
1497
2029
 
1498
2030
  Parameters:
1499
- flatten : Whether to use a multiindex or flatten column names.
1500
- include_hidden : Whether to include hidden columns.
2031
+ flatten: Whether to use a multiindex or flatten column names.
2032
+ include_hidden: Whether to include hidden columns.
2033
+ as_object: Whether to emit a dataframe backed by Python objects
2034
+ rather than pandas-inferred dtypes.
2035
+
2036
+ Returns:
2037
+ pd.DataFrame: A pandas DataFrame representation of the chain.
1501
2038
  """
1502
2039
  import pandas as pd
1503
2040
 
1504
2041
  headers, max_length = self._effective_signals_schema.get_headers_with_length(
1505
2042
  include_hidden=include_hidden
1506
2043
  )
2044
+
2045
+ columns: list[str] | pd.MultiIndex
1507
2046
  if flatten or max_length < 2:
1508
2047
  columns = [".".join(filter(None, header)) for header in headers]
1509
2048
  else:
1510
2049
  columns = pd.MultiIndex.from_tuples(map(tuple, headers))
1511
2050
 
1512
2051
  results = self.results(include_hidden=include_hidden)
2052
+ if as_object:
2053
+ df = pd.DataFrame(results, columns=columns, dtype=object)
2054
+ df.where(pd.notna(df), None, inplace=True)
2055
+ return df
1513
2056
  return pd.DataFrame.from_records(results, columns=columns)
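A usage sketch for the new `as_object` flag (the bucket URL is illustrative). With `as_object=True` the frame keeps plain Python objects and `None` for missing values instead of pandas-inferred dtypes and `NaN`, which is what `show()` relies on below:

```py
import datachain as dc

chain = dc.read_storage("s3://mybucket")

df = chain.to_pandas(flatten=True)        # pandas-inferred dtypes
df_obj = chain.to_pandas(as_object=True)  # object-backed, None for missing
print(df.dtypes)
print(df_obj.dtypes)
```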
1514
2057
 
1515
2058
  def show(
1516
2059
  self,
1517
2060
  limit: int = 20,
1518
- flatten=False,
1519
- transpose=False,
1520
- truncate=True,
1521
- include_hidden=False,
2061
+ flatten: bool = False,
2062
+ transpose: bool = False,
2063
+ truncate: bool = True,
2064
+ include_hidden: bool = False,
1522
2065
  ) -> None:
1523
2066
  """Show a preview of the chain results.
1524
2067
 
1525
2068
  Parameters:
1526
- limit : How many rows to show.
1527
- flatten : Whether to use a multiindex or flatten column names.
1528
- transpose : Whether to transpose rows and columns.
1529
- truncate : Whether or not to truncate the contents of columns.
1530
- include_hidden : Whether to include hidden columns.
2069
+ limit: How many rows to show.
2070
+ flatten: Whether to use a multiindex or flatten column names.
2071
+ transpose: Whether to transpose rows and columns.
2072
+ truncate: Whether or not to truncate the contents of columns.
2073
+ include_hidden: Whether to include hidden columns.
1531
2074
  """
1532
2075
  import pandas as pd
1533
2076
 
1534
2077
  dc = self.limit(limit) if limit > 0 else self # type: ignore[misc]
1535
- df = dc.to_pandas(flatten, include_hidden=include_hidden)
2078
+ df = dc.to_pandas(
2079
+ flatten,
2080
+ include_hidden=include_hidden,
2081
+ as_object=True,
2082
+ )
1536
2083
 
1537
2084
  if df.empty:
1538
2085
  print("Empty result")
@@ -1588,23 +2135,23 @@ class DataChain:
1588
2135
  def parse_tabular(
1589
2136
  self,
1590
2137
  output: OutputType = None,
1591
- object_name: str = "",
2138
+ column: str = "",
1592
2139
  model_name: str = "",
1593
2140
  source: bool = True,
1594
- nrows: Optional[int] = None,
1595
- **kwargs,
2141
+ nrows: int | None = None,
2142
+ **kwargs: Any,
1596
2143
  ) -> "Self":
1597
2144
  """Generate chain from list of tabular files.
1598
2145
 
1599
2146
  Parameters:
1600
- output : Dictionary or feature class defining column names and their
2147
+ output: Dictionary or feature class defining column names and their
1601
2148
  corresponding types. List of column names is also accepted, in which
1602
2149
  case types will be inferred.
1603
- object_name : Generated object column name.
1604
- model_name : Generated model name.
1605
- source : Whether to include info about the source file.
1606
- nrows : Optional row limit.
1607
- kwargs : Parameters to pass to pyarrow.dataset.dataset.
2150
+ column: Generated column name.
2151
+ model_name: Generated model name.
2152
+ source: Whether to include info about the source file.
2153
+ nrows: Optional row limit.
2154
+ kwargs: Parameters to pass to pyarrow.dataset.dataset.
1608
2155
 
1609
2156
  Example:
1610
2157
  Reading a json lines file:
@@ -1619,24 +2166,33 @@ class DataChain:
1619
2166
  import datachain as dc
1620
2167
 
1621
2168
  chain = dc.read_storage("s3://mybucket")
1622
- chain = chain.filter(dc.C("file.name").glob("*.jsonl"))
2169
+ chain = chain.filter(dc.C("file.path").glob("*.jsonl"))
1623
2170
  chain = chain.parse_tabular(format="json")
1624
2171
  ```
1625
2172
  """
1626
2173
  from pyarrow.dataset import CsvFileFormat, JsonFileFormat
1627
2174
 
1628
- from datachain.lib.arrow import ArrowGenerator, infer_schema, schema_to_output
2175
+ from datachain.lib.arrow import (
2176
+ ArrowGenerator,
2177
+ fix_pyarrow_format,
2178
+ infer_schema,
2179
+ schema_to_output,
2180
+ )
1629
2181
 
1630
- if nrows:
1631
- format = kwargs.get("format")
1632
- if format not in ["csv", "json"] and not isinstance(
1633
- format, (CsvFileFormat, JsonFileFormat)
1634
- ):
1635
- raise DatasetPrepareError(
1636
- self.name,
1637
- "error in `parse_tabular` - "
1638
- "`nrows` only supported for csv and json formats.",
1639
- )
2182
+ parse_options = kwargs.pop("parse_options", None)
2183
+ if format := kwargs.get("format"):
2184
+ kwargs["format"] = fix_pyarrow_format(format, parse_options)
2185
+
2186
+ if (
2187
+ nrows
2188
+ and format not in ["csv", "json"]
2189
+ and not isinstance(format, (CsvFileFormat, JsonFileFormat))
2190
+ ):
2191
+ raise DatasetPrepareError(
2192
+ self.name,
2193
+ "error in `parse_tabular` - "
2194
+ "`nrows` only supported for csv and json formats.",
2195
+ )
1640
2196
 
1641
2197
  if "file" not in self.schema or not self.count():
1642
2198
  raise DatasetPrepareError(self.name, "no files to parse.")
@@ -1645,20 +2201,20 @@ class DataChain:
1645
2201
  col_names = output if isinstance(output, Sequence) else None
1646
2202
  if col_names or not output:
1647
2203
  try:
1648
- schema = infer_schema(self, **kwargs)
2204
+ schema = infer_schema(self, **kwargs, parse_options=parse_options)
1649
2205
  output, _ = schema_to_output(schema, col_names)
1650
2206
  except ValueError as e:
1651
2207
  raise DatasetPrepareError(self.name, e) from e
1652
2208
 
1653
2209
  if isinstance(output, dict):
1654
- model_name = model_name or object_name or ""
2210
+ model_name = model_name or column or ""
1655
2211
  model = dict_to_data_model(model_name, output)
1656
2212
  output = model
1657
2213
  else:
1658
2214
  model = output # type: ignore[assignment]
1659
2215
 
1660
- if object_name:
1661
- output = {object_name: model} # type: ignore[dict-item]
2216
+ if column:
2217
+ output = {column: model} # type: ignore[dict-item]
1662
2218
  elif isinstance(output, type(BaseModel)):
1663
2219
  output = {
1664
2220
  name: info.annotation # type: ignore[misc]
@@ -1671,7 +2227,15 @@ class DataChain:
1671
2227
  # disable prefetch if nrows is set
1672
2228
  settings = {"prefetch": 0} if nrows else {}
1673
2229
  return self.settings(**settings).gen( # type: ignore[arg-type]
1674
- ArrowGenerator(schema, model, source, nrows, **kwargs), output=output
2230
+ ArrowGenerator(
2231
+ schema,
2232
+ model,
2233
+ source,
2234
+ nrows,
2235
+ parse_options=parse_options,
2236
+ **kwargs,
2237
+ ),
2238
+ output=output,
1675
2239
  )
1676
2240
 
1677
2241
  @classmethod
@@ -1708,23 +2272,23 @@ class DataChain:
1708
2272
 
1709
2273
  def to_parquet(
1710
2274
  self,
1711
- path: Union[str, os.PathLike[str], BinaryIO],
1712
- partition_cols: Optional[Sequence[str]] = None,
2275
+ path: str | os.PathLike[str] | BinaryIO,
2276
+ partition_cols: Sequence[str] | None = None,
1713
2277
  chunk_size: int = DEFAULT_PARQUET_CHUNK_SIZE,
1714
- fs_kwargs: Optional[dict[str, Any]] = None,
2278
+ fs_kwargs: dict[str, Any] | None = None,
1715
2279
  **kwargs,
1716
2280
  ) -> None:
1717
2281
  """Save chain to parquet file with SignalSchema metadata.
1718
2282
 
1719
2283
  Parameters:
1720
- path : Path or a file-like binary object to save the file. This supports
2284
+ path: Path or a file-like binary object to save the file. This supports
1721
2285
  local paths as well as remote paths, such as s3:// or hf:// with fsspec.
1722
- partition_cols : Column names by which to partition the dataset.
1723
- chunk_size : The chunk size of results to read and convert to columnar
2286
+ partition_cols: Column names by which to partition the dataset.
2287
+ chunk_size: The chunk size of results to read and convert to columnar
1724
2288
  data, to avoid running out of memory.
1725
- fs_kwargs : Optional kwargs to pass to the fsspec filesystem, used only for
1726
- write, for fsspec-type URLs, such as s3:// or hf:// when
1727
- provided as the destination path.
2289
+ fs_kwargs: Optional kwargs forwarded to the underlying fsspec filesystem
2290
+ when writing to fsspec-style URLs (e.g., s3://, gs://, hf://);
2291
+ fsspec-specific options are supported.
1728
2292
  """
1729
2293
  import pyarrow as pa
1730
2294
  import pyarrow.parquet as pq
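For completeness, a short `to_parquet` sketch covering local and fsspec destinations as documented above (paths and options are illustrative):

```py
import datachain as dc

chain = dc.read_storage("s3://mybucket/images/")

# Local parquet file; the signal schema is embedded as table metadata.
chain.to_parquet("images.parquet")

# Remote destination via fsspec, passing filesystem options through fs_kwargs.
chain.to_parquet("s3://other-bucket/images.parquet", fs_kwargs={"anon": False})
```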
@@ -1754,9 +2318,9 @@ class DataChain:
1754
2318
  fsspec_fs = client.create_fs(**fs_kwargs)
1755
2319
 
1756
2320
  _partition_cols = list(partition_cols) if partition_cols else None
1757
- signal_schema_metadata = orjson.dumps(
1758
- self._effective_signals_schema.serialize()
1759
- )
2321
+ signal_schema_metadata = json.dumps(
2322
+ self._effective_signals_schema.serialize(), ensure_ascii=False
2323
+ ).encode("utf-8")
1760
2324
 
1761
2325
  column_names, column_chunks = self.to_columnar_data_with_names(chunk_size)
1762
2326
 
@@ -1768,7 +2332,7 @@ class DataChain:
1768
2332
  # pyarrow infers the best parquet schema from the python types of
1769
2333
  # the input data.
1770
2334
  table = pa.Table.from_pydict(
1771
- dict(zip(column_names, chunk)),
2335
+ dict(zip(column_names, chunk, strict=False)),
1772
2336
  schema=parquet_schema,
1773
2337
  )
1774
2338
 
@@ -1806,123 +2370,220 @@ class DataChain:
1806
2370
 
1807
2371
  def to_csv(
1808
2372
  self,
1809
- path: Union[str, os.PathLike[str]],
2373
+ path: str | os.PathLike[str],
1810
2374
  delimiter: str = ",",
1811
- fs_kwargs: Optional[dict[str, Any]] = None,
2375
+ fs_kwargs: dict[str, Any] | None = None,
1812
2376
  **kwargs,
1813
- ) -> None:
1814
- """Save chain to a csv (comma-separated values) file.
2377
+ ) -> File:
2378
+ """Save chain to a csv (comma-separated values) file and return the stored
2379
+ `File`.
1815
2380
 
1816
2381
  Parameters:
1817
- path : Path to save the file. This supports local paths as well as
2382
+ path: Path to save the file. This supports local paths as well as
1818
2383
  remote paths, such as s3:// or hf:// with fsspec.
1819
- delimiter : Delimiter to use for the resulting file.
1820
- fs_kwargs : Optional kwargs to pass to the fsspec filesystem, used only for
1821
- write, for fsspec-type URLs, such as s3:// or hf:// when
1822
- provided as the destination path.
2384
+ delimiter: Delimiter to use for the resulting file.
2385
+ fs_kwargs: Optional kwargs forwarded to the underlying fsspec filesystem
2386
+ when writing to fsspec-style URLs (e.g., s3://, gs://, hf://);
2387
+ fsspec-specific options are supported.
2388
+ Returns:
2389
+ File: The stored file with refreshed metadata (version, etag, size).
1823
2390
  """
1824
2391
  import csv
1825
2392
 
1826
- opener = open
1827
-
1828
- if isinstance(path, str) and "://" in path:
1829
- from datachain.client.fsspec import Client
1830
-
1831
- fs_kwargs = {
1832
- **self._query.catalog.client_config,
1833
- **(fs_kwargs or {}),
1834
- }
1835
-
1836
- client = Client.get_implementation(path)
1837
-
1838
- fsspec_fs = client.create_fs(**fs_kwargs)
1839
-
1840
- opener = fsspec_fs.open
2393
+ target = File.at(path, session=self.session)
1841
2394
 
1842
2395
  headers, _ = self._effective_signals_schema.get_headers_with_length()
1843
2396
  column_names = [".".join(filter(None, header)) for header in headers]
1844
2397
 
1845
- results_iter = self.collect_flatten()
1846
-
1847
- with opener(path, "w", newline="") as f:
2398
+ with target.open("w", newline="", client_config=fs_kwargs) as f:
1848
2399
  writer = csv.writer(f, delimiter=delimiter, **kwargs)
1849
2400
  writer.writerow(column_names)
1850
-
1851
- for row in results_iter:
2401
+ for row in self._leaf_values():
1852
2402
  writer.writerow(row)
1853
2403
 
2404
+ return target
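Since `to_csv` now returns the stored `File`, the export can be inspected or reused directly; a sketch (paths are illustrative):

```py
import datachain as dc

chain = dc.read_storage("s3://mybucket")

exported = chain.to_csv("s3://mybucket/exports/files.csv", delimiter=";")
print(exported.path, exported.size, exported.etag)  # refreshed metadata
```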
2405
+
1854
2406
  def to_json(
1855
2407
  self,
1856
- path: Union[str, os.PathLike[str]],
1857
- fs_kwargs: Optional[dict[str, Any]] = None,
2408
+ path: str | os.PathLike[str],
2409
+ fs_kwargs: dict[str, Any] | None = None,
1858
2410
  include_outer_list: bool = True,
1859
- ) -> None:
1860
- """Save chain to a JSON file.
2411
+ ) -> File:
2412
+ """Save chain to a JSON file and return the stored `File`.
1861
2413
 
1862
2414
  Parameters:
1863
- path : Path to save the file. This supports local paths as well as
2415
+ path: Path to save the file. This supports local paths as well as
1864
2416
  remote paths, such as s3:// or hf:// with fsspec.
1865
- fs_kwargs : Optional kwargs to pass to the fsspec filesystem, used only for
1866
- write, for fsspec-type URLs, such as s3:// or hf:// when
1867
- provided as the destination path.
1868
- include_outer_list : Sets whether to include an outer list for all rows.
2417
+ fs_kwargs: Optional kwargs forwarded to the underlying fsspec filesystem
2418
+ when writing to fsspec-style URLs (e.g., s3://, gs://, hf://);
2419
+ fsspec-specific options are supported.
2420
+ include_outer_list: Sets whether to include an outer list for all rows.
1869
2421
  Setting this to True makes the file valid JSON, while False instead
1870
2422
  writes in the JSON lines format.
2423
+ Returns:
2424
+ File: The stored file with refreshed metadata (version, etag, size).
1871
2425
  """
1872
- opener = open
1873
-
1874
- if isinstance(path, str) and "://" in path:
1875
- from datachain.client.fsspec import Client
1876
-
1877
- fs_kwargs = {
1878
- **self._query.catalog.client_config,
1879
- **(fs_kwargs or {}),
1880
- }
1881
-
1882
- client = Client.get_implementation(path)
1883
-
1884
- fsspec_fs = client.create_fs(**fs_kwargs)
1885
-
1886
- opener = fsspec_fs.open
1887
-
2426
+ target = File.at(path, session=self.session)
1888
2427
  headers, _ = self._effective_signals_schema.get_headers_with_length()
1889
- headers = [list(filter(None, header)) for header in headers]
2428
+ headers = [list(filter(None, h)) for h in headers]
2429
+ with target.open("wb", client_config=fs_kwargs) as f:
2430
+ self._write_json_stream(f, headers, include_outer_list)
2431
+ return target
1890
2432
 
2433
+ def _write_json_stream(
2434
+ self,
2435
+ f: IO[bytes],
2436
+ headers: list[list[str]],
2437
+ include_outer_list: bool,
2438
+ ) -> None:
1891
2439
  is_first = True
1892
-
1893
- with opener(path, "wb") as f:
1894
- if include_outer_list:
1895
- # This makes the file JSON instead of JSON lines.
1896
- f.write(b"[\n")
1897
- for row in self.collect_flatten():
1898
- if not is_first:
1899
- if include_outer_list:
1900
- # This makes the file JSON instead of JSON lines.
1901
- f.write(b",\n")
1902
- else:
1903
- f.write(b"\n")
1904
- else:
1905
- is_first = False
1906
- f.write(orjson.dumps(row_to_nested_dict(headers, row)))
1907
- if include_outer_list:
1908
- # This makes the file JSON instead of JSON lines.
1909
- f.write(b"\n]\n")
2440
+ if include_outer_list:
2441
+ f.write(b"[\n")
2442
+ for row in self._leaf_values():
2443
+ if not is_first:
2444
+ f.write(b",\n" if include_outer_list else b"\n")
2445
+ else:
2446
+ is_first = False
2447
+ f.write(
2448
+ json.dumps(
2449
+ row_to_nested_dict(headers, row),
2450
+ ensure_ascii=False,
2451
+ ).encode("utf-8")
2452
+ )
2453
+ if include_outer_list:
2454
+ f.write(b"\n]\n")
1910
2455
 
1911
2456
  def to_jsonl(
1912
2457
  self,
1913
- path: Union[str, os.PathLike[str]],
1914
- fs_kwargs: Optional[dict[str, Any]] = None,
1915
- ) -> None:
2458
+ path: str | os.PathLike[str],
2459
+ fs_kwargs: dict[str, Any] | None = None,
2460
+ ) -> File:
1916
2461
  """Save chain to a JSON lines file.
1917
2462
 
1918
2463
  Parameters:
1919
- path : Path to save the file. This supports local paths as well as
2464
+ path: Path to save the file. This supports local paths as well as
1920
2465
  remote paths, such as s3:// or hf:// with fsspec.
1921
- fs_kwargs : Optional kwargs to pass to the fsspec filesystem, used only for
1922
- write, for fsspec-type URLs, such as s3:// or hf:// when
1923
- provided as the destination path.
2466
+ fs_kwargs: Optional kwargs forwarded to the underlying fsspec filesystem
2467
+ when writing to fsspec-style URLs (e.g., s3://, gs://, hf://);
2468
+ fsspec-specific options are supported.
2469
+ Returns:
2470
+ File: The stored file with refreshed metadata (version, etag, size).
1924
2471
  """
1925
- self.to_json(path, fs_kwargs, include_outer_list=False)
2472
+ return self.to_json(path, fs_kwargs, include_outer_list=False)
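Both JSON writers go through the same streaming helper and now return the stored `File` as well; a sketch of the two output modes (paths are illustrative):

```py
import datachain as dc

chain = dc.read_storage("s3://mybucket")

json_file = chain.to_json("rows.json")     # a single JSON array ([...])
jsonl_file = chain.to_jsonl("rows.jsonl")  # one JSON object per line
print(json_file.size, jsonl_file.size)
```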
2473
+
2474
+ def to_database(
2475
+ self,
2476
+ table_name: str,
2477
+ connection: "ConnectionType",
2478
+ *,
2479
+ batch_size: int = DEFAULT_DATABASE_BATCH_SIZE,
2480
+ on_conflict: str | None = None,
2481
+ conflict_columns: list[str] | None = None,
2482
+ column_mapping: dict[str, str | None] | None = None,
2483
+ ) -> int:
2484
+ """Save chain to a database table using a given database connection.
2485
+
2486
+ This method exports all DataChain records to a database table, creating the
2487
+ table if it doesn't exist and appending data if it does. The table schema
2488
+ is automatically inferred from the DataChain's signal schema.
2489
+
2490
+ For PostgreSQL, tables are created in the schema specified by the connection's
2491
+ search_path (defaults to 'public'). Use URL parameters to target specific
2492
+ schemas.
2493
+
2494
+ Parameters:
2495
+ table_name: Name of the database table to create/write to.
2496
+ connection: SQLAlchemy connectable, str, or a sqlite3 connection
2497
+ Using SQLAlchemy makes it possible to use any DB supported by that
2498
+ library. If a DBAPI2 object, only sqlite3 is supported. The user is
2499
+ responsible for engine disposal and connection closure for the
2500
+ SQLAlchemy connectable; str connections are closed automatically.
2501
+ batch_size: Number of rows to insert per batch for optimal performance.
2502
+ Larger batches are faster but use more memory. Default: 10,000.
2503
+ on_conflict: Strategy for handling duplicate rows (requires table
2504
+ constraints):
2505
+ - None: Raise error (`sqlalchemy.exc.IntegrityError`) on conflict
2506
+ (default)
2507
+ - "ignore": Skip duplicate rows silently
2508
+ - "update": Update existing rows with new values
2509
+ conflict_columns: List of column names that form a unique constraint
2510
+ for conflict resolution. Required when on_conflict='update' and
2511
+ using PostgreSQL.
2512
+ column_mapping: Optional mapping to rename or skip columns:
2513
+ - Dict mapping DataChain column names to database column names
2514
+ - Set values to None to skip columns entirely, or use `defaultdict` to
2515
+ skip all columns except those specified.
2516
+
2517
+ Returns:
2518
+ int: Number of rows affected (inserted/updated). -1 if DB driver doesn't
2519
+ support telemetry.
2520
+
2521
+ Examples:
2522
+ Basic usage with PostgreSQL:
2523
+ ```py
2524
+ import datachain as dc
2525
+
2526
+ rows_affected = (dc
2527
+ .read_storage("s3://my-bucket/")
2528
+ .to_database("files_table", "postgresql://user:pass@localhost/mydb")
2529
+ )
2530
+ print(f"Inserted/updated {rows_affected} rows")
2531
+ ```
2532
+
2533
+ Using SQLite with connection string:
2534
+ ```py
2535
+ rows_affected = chain.to_database("my_table", "sqlite:///data.db")
2536
+ print(f"Affected {rows_affected} rows")
2537
+ ```
2538
+
2539
+ Column mapping and renaming:
2540
+ ```py
2541
+ mapping = {
2542
+ "user.id": "id",
2543
+ "user.name": "name",
2544
+ "user.password": None # Skip this column
2545
+ }
2546
+ chain.to_database("users", engine, column_mapping=mapping)
2547
+ ```
2548
+
2549
+ Handling conflicts (requires PRIMARY KEY or UNIQUE constraints):
2550
+ ```py
2551
+ # Skip duplicates
2552
+ chain.to_database("my_table", engine, on_conflict="ignore")
2553
+
2554
+ # Update existing records
2555
+ chain.to_database(
2556
+ "my_table", engine, on_conflict="update", conflict_columns=["id"]
2557
+ )
2558
+ ```
2559
+
2560
+ Working with different databases:
2561
+ ```py
2562
+ # MySQL
2563
+ mysql_engine = sa.create_engine("mysql+pymysql://user:pass@host/db")
2564
+ chain.to_database("mysql_table", mysql_engine)
2565
+
2566
+ # SQLite in-memory
2567
+ chain.to_database("temp_table", "sqlite:///:memory:")
2568
+ ```
2569
+
2570
+ PostgreSQL with schema support:
2571
+ ```py
2572
+ pg_url = "postgresql://user:pass@host/db?options=-c search_path=analytics"
2573
+ chain.to_database("processed_data", pg_url)
2574
+ ```
2575
+ """
2576
+ from .database import to_database
2577
+
2578
+ return to_database(
2579
+ self,
2580
+ table_name,
2581
+ connection,
2582
+ batch_size=batch_size,
2583
+ on_conflict=on_conflict,
2584
+ conflict_columns=conflict_columns,
2585
+ column_mapping=column_mapping,
2586
+ )
1926
2587
 
1927
2588
  @classmethod
1928
2589
  def from_records(
@@ -1940,28 +2601,85 @@ class DataChain:
1940
2601
  )
1941
2602
  return read_records(*args, **kwargs)
1942
2603
 
1943
- def sum(self, fr: DataType): # type: ignore[override]
1944
- """Compute the sum of a column."""
1945
- return self._extend_to_data_model("sum", fr)
2604
+ def sum(self, col: str) -> StandardType: # type: ignore[override]
2605
+ """Compute the sum of a column.
2606
+
2607
+ Parameters:
2608
+ col: The column to compute the sum for.
2609
+
2610
+ Returns:
2611
+ The sum of the column values.
2612
+
2613
+ Example:
2614
+ ```py
2615
+ total_size = chain.sum("file.size")
2616
+ print(f"Total size: {total_size}")
2617
+ ```
2618
+ """
2619
+ return self._extend_to_data_model("sum", col)
2620
+
2621
+ def avg(self, col: str) -> StandardType: # type: ignore[override]
2622
+ """Compute the average of a column.
2623
+
2624
+ Parameters:
2625
+ col: The column to compute the average for.
2626
+
2627
+ Returns:
2628
+ The average of the column values.
2629
+
2630
+ Example:
2631
+ ```py
2632
+ average_size = chain.avg("file.size")
2633
+ print(f"Average size: {average_size}")
2634
+ ```
2635
+ """
2636
+ return self._extend_to_data_model("avg", col)
2637
+
2638
+ def min(self, col: str) -> StandardType: # type: ignore[override]
2639
+ """Compute the minimum of a column.
1946
2640
 
1947
- def avg(self, fr: DataType): # type: ignore[override]
1948
- """Compute the average of a column."""
1949
- return self._extend_to_data_model("avg", fr)
2641
+ Parameters:
2642
+ col: The column to compute the minimum for.
1950
2643
 
1951
- def min(self, fr: DataType): # type: ignore[override]
1952
- """Compute the minimum of a column."""
1953
- return self._extend_to_data_model("min", fr)
2644
+ Returns:
2645
+ The minimum value in the column.
1954
2646
 
1955
- def max(self, fr: DataType): # type: ignore[override]
1956
- """Compute the maximum of a column."""
1957
- return self._extend_to_data_model("max", fr)
2647
+ Example:
2648
+ ```py
2649
+ min_size = chain.min("file.size")
2650
+ print(f"Minimum size: {min_size}")
2651
+ ```
2652
+ """
2653
+ return self._extend_to_data_model("min", col)
2654
+
2655
+ def max(self, col: str) -> StandardType: # type: ignore[override]
2656
+ """Compute the maximum of a column.
2657
+
2658
+ Parameters:
2659
+ col: The column to compute the maximum for.
2660
+
2661
+ Returns:
2662
+ The maximum value in the column.
2663
+
2664
+ Example:
2665
+ ```py
2666
+ max_size = chain.max("file.size")
2667
+ print(f"Maximum size: {max_size}")
2668
+ ```
2669
+ """
2670
+ return self._extend_to_data_model("max", col)
1958
2671
 
1959
2672
  def setup(self, **kwargs) -> "Self":
1960
2673
  """Setup variables to pass to UDF functions.
1961
2674
 
1962
- Use before running map/gen/agg/batch_map to save an object and pass it as an
2675
+ Use before running map/gen/agg to save an object and pass it as an
1963
2676
  argument to the UDF.
1964
2677
 
2678
+ The value must be a callable (a `lambda: <value>` syntax can be used to quickly
2679
+ create one) that returns the object to be passed to the UDF. It is evaluated
2680
+ lazily when the UDF runs; if multiple machines are used, the callable runs on
2681
+ a worker machine.
2682
+
1965
2683
  Example:
1966
2684
  ```py
1967
2685
  import anthropic
@@ -1971,7 +2689,11 @@ class DataChain:
1971
2689
  (
1972
2690
  dc.read_storage(DATA, type="text")
1973
2691
  .settings(parallel=4, cache=True)
2692
+
2693
+ # Setup Anthropic client and pass it to the UDF below automatically
2694
+ # The value is callable (see the note above)
1974
2695
  .setup(client=lambda: anthropic.Anthropic(api_key=API_KEY))
2696
+
1975
2697
  .map(
1976
2698
  claude=lambda client, file: client.messages.create(
1977
2699
  model=MODEL,
@@ -1993,13 +2715,13 @@ class DataChain:
1993
2715
 
1994
2716
  def to_storage(
1995
2717
  self,
1996
- output: Union[str, os.PathLike[str]],
2718
+ output: str | os.PathLike[str],
1997
2719
  signal: str = "file",
1998
2720
  placement: FileExportPlacement = "fullpath",
1999
2721
  link_type: Literal["copy", "symlink"] = "copy",
2000
- num_threads: Optional[int] = EXPORT_FILES_MAX_THREADS,
2001
- anon: bool = False,
2002
- client_config: Optional[dict] = None,
2722
+ num_threads: int | None = EXPORT_FILES_MAX_THREADS,
2723
+ anon: bool | None = None,
2724
+ client_config: dict | None = None,
2003
2725
  ) -> None:
2004
2726
  """Export files from a specified signal to a directory. Files can be
2005
2727
  exported to a local or cloud directory.
@@ -2008,12 +2730,28 @@ class DataChain:
2008
2730
  output: Path to the target directory for exporting files.
2009
2731
  signal: Name of the signal to export files from.
2010
2732
  placement: The method to use for naming exported files.
2011
- The possible values are: "filename", "etag", "fullpath", and "checksum".
2733
+ The possible values are: "filename", "etag", "fullpath",
2734
+ "filepath", and "checksum".
2735
+ Example path translations for an object located at
2736
+ ``s3://bucket/data/img.jpg`` and exported to ``./out``:
2737
+
2738
+ - "filename" -> ``./out/img.jpg`` (no directories)
2739
+ - "filepath" -> ``./out/data/img.jpg`` (relative path kept)
2740
+ - "fullpath" -> ``./out/bucket/data/img.jpg`` (remote host kept)
2741
+ - "etag" -> ``./out/<etag>.jpg`` (unique name via object digest)
2742
+
2743
+ Local sources behave like "filepath" for "fullpath" placement.
2744
+ Relative destinations such as "." or ".." and absolute paths
2745
+ are supported for every strategy.
2012
2746
  link_type: Method to use for exporting files.
2013
2747
  Falls back to `'copy'` if symlinking fails.
2014
- num_threads : number of threads to use for exporting files.
2015
- By default it uses 5 threads.
2016
- anon: If true, we will treat cloud bucket as public one
2748
+ num_threads: number of threads to use for exporting files.
2749
+ By default, it uses 5 threads.
2750
+ anon: If True, we will treat cloud bucket as a public one. Default behavior
2751
+ depends on the previous session configuration (e.g. as set in the
2752
+ initial `read_storage`) and the particular cloud storage client
2753
+ implementation (e.g. S3 falls back to anonymous access if no credentials
2754
+ were found).
2017
2755
  client_config: Optional configuration for the destination storage client
2018
2756
 
2019
2757
  Example:
@@ -2025,21 +2763,23 @@ class DataChain:
2025
2763
  ds.to_storage("gs://mybucket", placement="filename")
2026
2764
  ```
2027
2765
  """
2766
+ chain = self.persist()
2767
+ count = chain.count()
2768
+
2028
2769
  if placement == "filename" and (
2029
- self._query.distinct(pathfunc.name(C(f"{signal}__path"))).count()
2030
- != self._query.count()
2770
+ chain._query.distinct(pathfunc.name(C(f"{signal}__path"))).count() != count
2031
2771
  ):
2032
2772
  raise ValueError("Files with the same name found")
2033
2773
 
2034
- if anon:
2035
- client_config = (client_config or {}) | {"anon": True}
2774
+ if anon is not None:
2775
+ client_config = (client_config or {}) | {"anon": anon}
2036
2776
 
2037
2777
  progress_bar = tqdm(
2038
2778
  desc=f"Exporting files to {output}: ",
2039
2779
  unit=" files",
2040
2780
  unit_scale=True,
2041
2781
  unit_divisor=10,
2042
- total=self.count(),
2782
+ total=count,
2043
2783
  leave=False,
2044
2784
  )
2045
2785
  file_exporter = FileExporter(
@@ -2050,20 +2790,36 @@ class DataChain:
2050
2790
  max_threads=num_threads or 1,
2051
2791
  client_config=client_config,
2052
2792
  )
2053
- file_exporter.run(self.collect(signal), progress_bar)
2793
+ file_exporter.run(
2794
+ (rows[0] for rows in chain.to_iter(signal)),
2795
+ progress_bar,
2796
+ )
2054
2797
 
2055
2798
  def shuffle(self) -> "Self":
2056
- """Shuffle the rows of the chain deterministically."""
2057
- return self.order_by("sys.rand")
2799
+ """Shuffle rows with a best-effort deterministic ordering.
2800
+
2801
+ This produces repeatable shuffles. Merge and union operations can
2802
+ lead to non-deterministic results. Use `order_by` or save the dataset
2803
+ afterward to guarantee the same result.
2804
+ """
2805
+ query = self._query.clone(new_table=False)
2806
+ query.steps.append(RegenerateSystemColumns(self._query.catalog))
2058
2807
 
2059
- def sample(self, n) -> "Self":
2808
+ chain = self._evolve(
2809
+ query=query,
2810
+ signal_schema=SignalSchema({"sys": Sys}) | self.signals_schema,
2811
+ )
2812
+ return chain.order_by("sys.rand")
2813
+
2814
+ def sample(self, n: int) -> "Self":
2060
2815
  """Return a random sample from the chain.
2061
2816
 
2062
2817
  Parameters:
2063
- n (int): Number of samples to draw.
2818
+ n: Number of samples to draw.
2064
2819
 
2065
- NOTE: Samples are not deterministic, and streamed/paginated queries or
2066
- multiple workers will draw samples with replacement.
2820
+ Note:
2821
+ Samples are not deterministic, and streamed/paginated queries or
2822
+ multiple workers will draw samples with replacement.
2067
2823
  """
2068
2824
  return self._evolve(query=self._query.sample(n))
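A sketch contrasting the two randomisation helpers above: `shuffle` aims for a repeatable ordering over all rows, while `sample` draws a non-deterministic subset (the bucket URL is illustrative):

```py
import datachain as dc

chain = dc.read_storage("s3://mybucket")

shuffled = chain.shuffle()   # best-effort deterministic ordering
subset = chain.sample(100)   # 100 random rows, not repeatable
print(subset.count())
```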
2069
2825
 
@@ -2078,27 +2834,62 @@ class DataChain:
2078
2834
 
2079
2835
  Using glob to match patterns
2080
2836
  ```py
2081
- dc.filter(C("file.name").glob("*.jpg"))
2837
+ dc.filter(C("file.path").glob("*.jpg"))
2838
+ ```
2839
+
2840
+ Using in to match lists
2841
+ ```py
2842
+ ids = [1,2,3]
2843
+ dc.filter(C("experiment_id").in_(ids))
2082
2844
  ```
2083
2845
 
2084
2846
  Using `datachain.func`
2085
2847
  ```py
2086
2848
  from datachain.func import string
2087
- dc.filter(string.length(C("file.name")) > 5)
2849
+ dc.filter(string.length(C("file.path")) > 5)
2088
2850
  ```
2089
2851
 
2090
2852
  Combining filters with "or"
2091
2853
  ```py
2092
- dc.filter(C("file.name").glob("cat*") | C("file.name").glob("dog*))
2854
+ dc.filter(
2855
+ C("file.path").glob("cat*") |
2856
+ C("file.path").glob("dog*")
2857
+ )
2858
+ ```
2859
+
2860
+ ```py
2861
+ dc.filter(dc.func.or_(
2862
+ C("file.path").glob("cat*"),
2863
+ C("file.path").glob("dog*")
2864
+ ))
2093
2865
  ```
2094
2866
 
2095
2867
  Combining filters with "and"
2096
2868
  ```py
2097
2869
  dc.filter(
2098
- C("file.name").glob("*.jpg) &
2099
- (string.length(C("file.name")) > 5)
2870
+ C("file.path").glob("*.jpg"),
2871
+ string.length(C("file.path")) > 5
2872
+ )
2873
+ ```
2874
+
2875
+ ```py
2876
+ dc.filter(
2877
+ C("file.path").glob("*.jpg") &
2878
+ (string.length(C("file.path")) > 5)
2100
2879
  )
2101
2880
  ```
2881
+
2882
+ ```py
2883
+ dc.filter(dc.func.and_(
2884
+ C("file.path").glob("*.jpg"),
2885
+ string.length(C("file.path")) > 5
2886
+ ))
2887
+ ```
2888
+
2889
+ Combining filters with "not"
2890
+ ```py
2891
+ dc.filter(~(C("file.path").glob("*.jpg")))
2892
+ ```
2102
2893
  """
2103
2894
  return self._evolve(query=self._query.filter(*args))
2104
2895
 
@@ -2135,6 +2926,10 @@ class DataChain:
2135
2926
  def chunk(self, index: int, total: int) -> "Self":
2136
2927
  """Split a chain into smaller chunks for e.g. parallelization.
2137
2928
 
2929
+ Parameters:
2930
+ index: The index of the chunk (0-indexed).
2931
+ total: The total number of chunks.
2932
+
2138
2933
  Example:
2139
2934
  ```py
2140
2935
  import datachain as dc
@@ -2149,3 +2944,72 @@ class DataChain:
2149
2944
  Use 0/3, 1/3 and 2/3, not 1/3, 2/3 and 3/3.
2150
2945
  """
2151
2946
  return self._evolve(query=self._query.chunk(index, total))
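Only the tail of the `chunk` example is visible in this hunk, so here is a hedged sketch of the 0-indexed usage it describes (`read_dataset` and the dataset name are assumptions for illustration):

```py
import datachain as dc

total = 3  # number of workers
for index in range(total):  # 0-based: 0, 1, 2 (not 1..3)
    part = dc.read_dataset("my_dataset").chunk(index, total)
    print(index, part.count())  # each worker processes its own slice
```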
2947
+
2948
+ def to_list(self, *cols: str) -> list[tuple[DataValue, ...]]:
2949
+ """Returns a list of rows of values, optionally limited to the specified
2950
+ columns.
2951
+
2952
+ Parameters:
2953
+ *cols: Limit to the specified columns. By default, all columns are selected.
2954
+
2955
+ Returns:
2956
+ list[tuple[DataType, ...]]: Returns a list of tuples of items for each row.
2957
+
2958
+ Example:
2959
+ Getting all rows as a list:
2960
+ ```py
2961
+ rows = dc.to_list()
2962
+ print(rows)
2963
+ ```
2964
+
2965
+ Getting all rows with selected columns as a list:
2966
+ ```py
2967
+ name_size_pairs = dc.to_list("file.path", "file.size")
2968
+ print(name_size_pairs)
2969
+ ```
2970
+
2971
+ Getting a single column as a list:
2972
+ ```py
2973
+ files = dc.to_list("file.path")
2974
+ print(files) # Returns list of 1-tuples
2975
+ ```
2976
+ """
2977
+ return list(self.to_iter(*cols))
2978
+
2979
+ def to_values(self, col: str) -> list[DataValue]:
2980
+ """Returns a flat list of values from a single column.
2981
+
2982
+ Parameters:
2983
+ col: The name of the column to extract values from.
2984
+
2985
+ Returns:
2986
+ list[DataValue]: Returns a flat list of values from the specified column.
2987
+
2988
+ Example:
2989
+ Getting all values from a single column:
2990
+ ```py
2991
+ file_paths = dc.to_values("file.path")
2992
+ print(file_paths) # Returns list of strings
2993
+ ```
2994
+
2995
+ Getting all file sizes:
2996
+ ```py
2997
+ sizes = dc.to_values("file.size")
2998
+ print(sizes) # Returns list of integers
2999
+ ```
3000
+ """
3001
+ return [row[0] for row in self.to_list(col)]
3002
+
3003
+ def __iter__(self) -> Iterator[tuple[DataValue, ...]]:
3004
+ """Make DataChain objects iterable.
3005
+
3006
+ Yields:
3007
+ (tuple[DataValue, ...]): Yields tuples of all column values for each row.
3008
+
3009
+ Example:
3010
+ ```py
3011
+ for row in chain:
3012
+ print(row)
3013
+ ```
3014
+ """
3015
+ return self.to_iter()