datachain 0.17.1__py3-none-any.whl → 0.18.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/catalog/catalog.py +6 -0
- datachain/cli/parser/job.py +7 -0
- datachain/data_storage/warehouse.py +1 -1
- datachain/dataset.py +7 -10
- datachain/delta.py +119 -0
- datachain/diff/__init__.py +10 -4
- datachain/lib/dc/datachain.py +89 -2
- datachain/lib/dc/datasets.py +41 -1
- datachain/lib/dc/storage.py +45 -11
- datachain/lib/signal_schema.py +12 -6
- datachain/query/dataset.py +27 -10
- datachain/remote/studio.py +2 -0
- datachain/studio.py +3 -0
- {datachain-0.17.1.dist-info → datachain-0.18.0.dist-info}/METADATA +2 -2
- {datachain-0.17.1.dist-info → datachain-0.18.0.dist-info}/RECORD +19 -18
- {datachain-0.17.1.dist-info → datachain-0.18.0.dist-info}/WHEEL +1 -1
- {datachain-0.17.1.dist-info → datachain-0.18.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.17.1.dist-info → datachain-0.18.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.17.1.dist-info → datachain-0.18.0.dist-info}/top_level.txt +0 -0
datachain/catalog/catalog.py
CHANGED

@@ -779,6 +779,7 @@ class Catalog:
         uuid: Optional[str] = None,
         description: Optional[str] = None,
         attrs: Optional[list[str]] = None,
+        update_version: Optional[str] = "patch",
     ) -> "DatasetRecord":
         """
         Creates new dataset of a specific version.

@@ -795,6 +796,11 @@ class Catalog:
         try:
            dataset = self.get_dataset(name)
            default_version = dataset.next_version_patch
+           if update_version == "major":
+               default_version = dataset.next_version_major
+           if update_version == "minor":
+               default_version = dataset.next_version_minor
+
            if (description or attrs) and (
                dataset.description != description or dataset.attrs != attrs
            ):
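The new `update_version` argument selects which semver component the next default version bumps. A minimal standalone sketch of that selection (plain Python, not the actual `DatasetRecord` properties):

```python
# Illustrative only: mirrors the next_version_major/minor/patch choice above,
# assuming versions are plain "MAJOR.MINOR.PATCH" strings.
def next_version(latest: str, update_version: str = "patch") -> str:
    major, minor, patch = map(int, latest.split("."))
    if update_version == "major":
        return f"{major + 1}.0.0"
    if update_version == "minor":
        return f"{major}.{minor + 1}.0"
    return f"{major}.{minor}.{patch + 1}"

assert next_version("1.2.3") == "1.2.4"           # default: patch
assert next_version("1.2.3", "minor") == "1.3.0"
assert next_version("1.2.3", "major") == "2.0.0"
```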
datachain/cli/parser/job.py
CHANGED

@@ -82,6 +82,13 @@ def add_jobs_parser(subparsers, parent_parser) -> None:
         nargs="+",
         help="Python package requirements",
     )
+    studio_run_parser.add_argument(
+        "--priority",
+        type=int,
+        default=5,
+        help="Priority for the job in range 0-5. "
+        "Lower value is higher priority (default: 5)",
+    )
 
     studio_ls_help = "List jobs in Studio"
     studio_ls_description = "List jobs in Studio."
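A standalone sketch of the new flag's parsing behavior (a plain argparse reproduction, not the real datachain parser):

```python
import argparse

# Reproduces the `--priority` option added to studio_run_parser above.
parser = argparse.ArgumentParser(prog="datachain job run")
parser.add_argument(
    "--priority",
    type=int,
    default=5,
    help="Priority for the job in range 0-5. Lower value is higher priority (default: 5)",
)

assert parser.parse_args([]).priority == 5                    # default
assert parser.parse_args(["--priority", "1"]).priority == 1   # explicit value
```

The value is then threaded through `create_job` and `StudioClient` into the job payload, as shown in the `datachain/studio.py` and `datachain/remote/studio.py` diffs below.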
datachain/data_storage/warehouse.py
CHANGED

@@ -258,7 +258,7 @@ class AbstractWarehouse(ABC, Serializable):
         if Client.is_data_source_uri(dataset_name):
             # for datasets that are created for bucket listing we use different prefix
             prefix = self.DATASET_SOURCE_TABLE_PREFIX
-        return f"{prefix}{dataset_name}_{version}"
+        return f"{prefix}{dataset_name}_{version.replace('.', '_')}"
 
     def temp_table_name(self) -> str:
         return self.TMP_TABLE_NAME_PREFIX + _random_string(6)
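This one-line change matters because dataset versions are now full semver strings, and dots are awkward in SQL table identifiers. A standalone illustration (the prefix value is made up; the real one comes from `DATASET_SOURCE_TABLE_PREFIX`):

```python
prefix = "src_"  # stand-in for DATASET_SOURCE_TABLE_PREFIX; illustrative only
dataset_name, version = "my_dataset", "1.2.3"

# Before the change the name would be "src_my_dataset_1.2.3"; now:
table_name = f"{prefix}{dataset_name}_{version.replace('.', '_')}"
assert table_name == "src_my_dataset_1_2_3"
```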
datachain/dataset.py
CHANGED

@@ -107,24 +107,21 @@ class DatasetDependency:
         dataset_version: Optional[str],
         dataset_version_created_at: Optional[datetime],
     ) -> Optional["DatasetDependency"]:
-        from datachain.client import Client
-        from datachain.lib.listing import is_listing_dataset, listing_uri_from_name
+        from datachain.lib.listing import is_listing_dataset
 
         if not dataset_id:
             return None
 
         assert dataset_name is not None
-        dependency_type = DatasetDependencyType.DATASET
-        dependency_name = dataset_name
-
-        if is_listing_dataset(dataset_name):
-            dependency_type = DatasetDependencyType.STORAGE  # type: ignore[arg-type]
-            dependency_name, _ = Client.parse_url(listing_uri_from_name(dataset_name))
 
         return cls(
             id,
-            dependency_type,
-            dependency_name,
+            (
+                DatasetDependencyType.STORAGE
+                if is_listing_dataset(dataset_name)
+                else DatasetDependencyType.DATASET
+            ),
+            dataset_name,
             (
                 dataset_version  # type: ignore[arg-type]
                 if dataset_version
datachain/delta.py
ADDED

@@ -0,0 +1,119 @@
+from collections.abc import Sequence
+from copy import copy
+from functools import wraps
+from typing import TYPE_CHECKING, Callable, Optional, TypeVar, Union
+
+import datachain
+from datachain.dataset import DatasetDependency
+from datachain.error import DatasetNotFoundError
+
+if TYPE_CHECKING:
+    from typing_extensions import Concatenate, ParamSpec
+
+    from datachain.lib.dc import DataChain
+
+    P = ParamSpec("P")
+
+
+T = TypeVar("T", bound="DataChain")
+
+
+def delta_disabled(
+    method: "Callable[Concatenate[T, P], T]",
+) -> "Callable[Concatenate[T, P], T]":
+    """
+    Decorator for disabling DataChain methods (e.g `.agg()` or `.union()`) to
+    work with delta updates. It throws `NotImplementedError` if chain on which
+    method is called is marked as delta.
+    """
+
+    @wraps(method)
+    def _inner(self: T, *args: "P.args", **kwargs: "P.kwargs") -> T:
+        if self.delta:
+            raise NotImplementedError(
+                f"Delta update cannot be used with {method.__name__}"
+            )
+        return method(self, *args, **kwargs)
+
+    return _inner
+
+
+def _append_steps(dc: "DataChain", other: "DataChain"):
+    """Returns cloned chain with appended steps from other chain.
+    Steps are all those modification methods applied like filters, mappers etc.
+    """
+    dc = dc.clone()
+    dc._query.steps += other._query.steps.copy()
+    dc.signals_schema = other.signals_schema
+    return dc
+
+
+def delta_update(
+    dc: "DataChain",
+    name: str,
+    on: Union[str, Sequence[str]],
+    right_on: Optional[Union[str, Sequence[str]]] = None,
+    compare: Optional[Union[str, Sequence[str]]] = None,
+) -> tuple[Optional["DataChain"], Optional[list[DatasetDependency]], bool]:
+    """
+    Creates new chain that consists of the last version of current delta dataset
+    plus diff from the source with all needed modifications.
+    This way we don't need to re-calculate the whole chain from the source again
+    (apply all the DataChain methods like filters, mappers, generators etc.)
+    but just the diff part, which is very important for performance.
+
+    Note that currently delta update works only if there is only one direct dependency.
+    """
+    catalog = dc.session.catalog
+    dc._query.apply_listing_pre_step()
+
+    try:
+        latest_version = catalog.get_dataset(name).latest_version
+    except DatasetNotFoundError:
+        # first creation of delta update dataset
+        return None, None, True
+
+    dependencies = catalog.get_dataset_dependencies(
+        name, latest_version, indirect=False
+    )
+
+    dep = dependencies[0]
+    if not dep:
+        # starting dataset (e.g listing) was removed so we are backing off to normal
+        # dataset creation, as it was created first time
+        return None, None, True
+
+    source_ds_name = dep.name
+    source_ds_version = dep.version
+    source_ds_latest_version = catalog.get_dataset(source_ds_name).latest_version
+    dependencies = copy(dependencies)
+    dependencies = [d for d in dependencies if d is not None]  # filter out removed dep
+    dependencies[0].version = source_ds_latest_version  # type: ignore[union-attr]
+
+    source_dc = datachain.read_dataset(source_ds_name, source_ds_version)
+    source_dc_latest = datachain.read_dataset(source_ds_name, source_ds_latest_version)
+
+    diff = source_dc_latest.compare(source_dc, on=on, compare=compare, deleted=False)
+    # We append all the steps from the original chain to diff, e.g filters, mappers.
+    diff = _append_steps(diff, dc)
+
+    # to avoid re-calculating diff multiple times
+    diff = diff.persist()
+
+    if diff.empty:
+        return None, None, False
+
+    # merging diff and the latest version of dataset
+    delta_chain = (
+        datachain.read_dataset(name, latest_version)
+        .compare(
+            diff,
+            on=right_on or on,
+            added=True,
+            modified=False,
+            deleted=False,
+        )
+        .union(diff)
+    )
+
+    return delta_chain, dependencies, True  # type: ignore[return-value]
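How the new module is meant to be used end to end, as a hedged sketch (the URI and field names are illustrative; the parameters come from the `read_storage`/`read_dataset` diffs below):

```python
import datachain as dc

# Mark the chain as delta at read time; on each .save() only the diff between
# the current source version and the previously used one is re-processed.
chain = dc.read_storage(
    "s3://my-bucket/images/",  # hypothetical URI
    delta=True,
    delta_on="file.path",      # fields that uniquely identify a source row
).save("images-processed")
```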
datachain/diff/__init__.py
CHANGED

@@ -30,7 +30,7 @@ class CompareStatus(str, Enum):
     SAME = "S"
 
 
-def _compare(  # noqa: C901
+def _compare(  # noqa: C901, PLR0912
     left: "DataChain",
     right: "DataChain",
     on: Union[str, Sequence[str]],

@@ -77,14 +77,16 @@ def _compare(  # noqa: C901
     cols_select = list(left.signals_schema.clone_without_sys_signals().values.keys())
 
     # getting correct on and right_on column names
+    on_ = on
     on = left.signals_schema.resolve(*on).db_signals()  # type: ignore[assignment]
-    right_on = right.signals_schema.resolve(*(right_on or on)).db_signals()  # type: ignore[assignment]
+    right_on = right.signals_schema.resolve(*(right_on or on_)).db_signals()  # type: ignore[assignment]
 
     # getting correct compare and right_compare column names if they are defined
     if compare:
+        compare_ = compare
         compare = left.signals_schema.resolve(*compare).db_signals()  # type: ignore[assignment]
         right_compare = right.signals_schema.resolve(
-            *(right_compare or compare)
+            *(right_compare or compare_)
         ).db_signals()  # type: ignore[assignment]
     elif not compare and len(cols) != len(right_cols):
         # here we will mark all rows that are not added or deleted as modified since

@@ -155,7 +157,11 @@ def _compare(  # noqa: C901
     if status_col:
         cols_select.append(diff_col)
 
-    dc_diff = dc_diff.select(*cols_select)
+    if not dc_diff._sys:
+        # TODO workaround when sys signal is not available in diff
+        dc_diff = dc_diff.settings(sys=True).select(*cols_select).settings(sys=False)
+    else:
+        dc_diff = dc_diff.select(*cols_select)
 
     # final schema is schema from the left chain with status column added if needed
     dc_diff.signals_schema = (
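The `on_`/`compare_` copies fix a shadowing bug: `on` is overwritten with resolved DB column names before being used as the fallback for `right_on`, so the fallback previously resolved already-resolved names. A minimal illustration of the hazard (plain lists standing in for signal names):

```python
on = ["id"]
on = [f"db_{c}" for c in on]  # the resolve step rewrites `on` in place
right_on = None
fallback = right_on or on      # -> ["db_id"], not the user-facing ["id"]
assert fallback == ["db_id"]
```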
datachain/lib/dc/datachain.py
CHANGED

@@ -25,6 +25,7 @@ from tqdm import tqdm
 
 from datachain import semver
 from datachain.dataset import DatasetRecord
+from datachain.delta import delta_disabled, delta_update
 from datachain.func import literal
 from datachain.func.base import Function
 from datachain.func.func import Func

@@ -72,6 +73,9 @@ if TYPE_CHECKING:
     P = ParamSpec("P")
 
 
+T = TypeVar("T", bound="DataChain")
+
+
 class DataChain:
     """DataChain - a data structure for batch data processing and evaluation.
 

@@ -164,6 +168,7 @@ class DataChain:
         self.signals_schema = signal_schema
         self._setup: dict = setup or {}
         self._sys = _sys
+        self._delta = False
 
     def __repr__(self) -> str:
         """Return a string representation of the chain."""

@@ -177,6 +182,32 @@ class DataChain:
         self.print_schema(file=file)
         return file.getvalue()
 
+    def _as_delta(
+        self,
+        on: Optional[Union[str, Sequence[str]]] = None,
+        right_on: Optional[Union[str, Sequence[str]]] = None,
+        compare: Optional[Union[str, Sequence[str]]] = None,
+    ) -> "Self":
+        """Marks this chain as delta, which means special delta process will be
+        called on saving dataset for optimization"""
+        if on is None:
+            raise ValueError("'delta on' fields must be defined")
+        self._delta = True
+        self._delta_on = on
+        self._delta_result_on = right_on
+        self._delta_compare = compare
+        return self
+
+    @property
+    def empty(self) -> bool:
+        """Returns True if chain has zero number of rows"""
+        return not bool(self.count())
+
+    @property
+    def delta(self) -> bool:
+        """Returns True if this chain is ran in "delta" update mode"""
+        return self._delta
+
     @property
     def schema(self) -> dict[str, DataType]:
         """Get schema of the chain."""

@@ -254,9 +285,17 @@ class DataChain:
         signal_schema = copy.deepcopy(self.signals_schema)
         if _sys is None:
             _sys = self._sys
-        return type(self)(
+        chain = type(self)(
             query, settings, signal_schema=signal_schema, setup=self._setup, _sys=_sys
         )
+        if self.delta:
+            chain = chain._as_delta(
+                on=self._delta_on,
+                right_on=self._delta_result_on,
+                compare=self._delta_compare,
+            )
+
+        return chain
 
     def settings(
         self,

@@ -461,8 +500,9 @@ class DataChain:
         version: Optional[str] = None,
         description: Optional[str] = None,
         attrs: Optional[list[str]] = None,
+        update_version: Optional[str] = "patch",
         **kwargs,
-    ) -> "Self":
+    ) -> "DataChain":
         """Save to a Dataset. It returns the chain itself.
 
         Parameters:

@@ -472,11 +512,52 @@ class DataChain:
             description : description of a dataset.
             attrs : attributes of a dataset. They can be without value, e.g "NLP",
                 or with a value, e.g "location=US".
+            update_version: which part of the dataset version to automatically increase.
+                Available values: `major`, `minor` or `patch`. Default is `patch`.
         """
         if version is not None:
             semver.validate(version)
 
+        if update_version is not None and update_version not in [
+            "patch",
+            "major",
+            "minor",
+        ]:
+            raise ValueError(
+                "update_version can have one of the following values: major, minor or"
+                " patch"
+            )
+
         schema = self.signals_schema.clone_without_sys_signals().serialize()
+        if self.delta and name:
+            delta_ds, dependencies, has_changes = delta_update(
+                self,
+                name,
+                on=self._delta_on,
+                right_on=self._delta_result_on,
+                compare=self._delta_compare,
+            )
+
+            if delta_ds:
+                return self._evolve(
+                    query=delta_ds._query.save(
+                        name=name,
+                        version=version,
+                        feature_schema=schema,
+                        dependencies=dependencies,
+                        **kwargs,
+                    )
+                )
+
+            if not has_changes:
+                # sources have not been changed so new version of resulting dataset
+                # would be the same as previous one. To avoid duplicating exact
+                # datasets, we won't create new version of it and we will return
+                # current latest version instead.
+                from .datasets import read_dataset
+
+                return read_dataset(name, **kwargs)
+
         return self._evolve(
             query=self._query.save(
                 name=name,

@@ -484,6 +565,7 @@ class DataChain:
                 description=description,
                 attrs=attrs,
                 feature_schema=schema,
+                update_version=update_version,
                 **kwargs,
             )
         )

@@ -601,6 +683,7 @@ class DataChain:
             signal_schema=udf_obj.output,
         )
 
+    @delta_disabled
     def agg(
         self,
         func: Optional[Callable] = None,

@@ -754,6 +837,7 @@ class DataChain:
 
         return self._evolve(query=self._query.order_by(*args))
 
+    @delta_disabled
     def distinct(self, arg: str, *args: str) -> "Self":  # type: ignore[override]
         """Removes duplicate rows based on uniqueness of some input column(s)
         i.e if rows are found with the same value of input column(s), only one

@@ -788,6 +872,7 @@ class DataChain:
             query=self._query.select(*columns), signal_schema=new_schema
         )
 
+    @delta_disabled  # type: ignore[arg-type]
     def group_by(
         self,
         *,

@@ -1146,6 +1231,7 @@ class DataChain:
         schema = self.signals_schema.clone_without_file_signals()
         return self.select(*schema.values.keys())
 
+    @delta_disabled
     def merge(
         self,
         right_ds: "DataChain",

@@ -1254,6 +1340,7 @@ class DataChain:
 
         return ds
 
+    @delta_disabled
     def union(self, other: "Self") -> "Self":
         """Return the set union of the two datasets.
 
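Taken together with the catalog change, `.save()` now lets callers pick the bump size. A hedged usage sketch (dataset name and data are invented; `read_values` is an existing datachain constructor):

```python
import datachain as dc

chain = dc.read_values(num=[1, 2, 3])
chain.save("numbers")                           # default bump: patch (e.g. 1.0.0 -> 1.0.1)
chain.save("numbers", update_version="minor")   # e.g. 1.0.1 -> 1.1.0
chain.save("numbers", update_version="major")   # e.g. 1.1.0 -> 2.0.0
# chain.save("numbers", update_version="week")  # would raise ValueError per the check above
```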
datachain/lib/dc/datasets.py
CHANGED

@@ -1,3 +1,4 @@
+from collections.abc import Sequence
 from typing import TYPE_CHECKING, Optional, Union, get_origin, get_type_hints
 
 from datachain.error import DatasetVersionNotFoundError

@@ -27,6 +28,10 @@ def read_dataset(
     session: Optional[Session] = None,
     settings: Optional[dict] = None,
     fallback_to_studio: bool = True,
+    delta: Optional[bool] = False,
+    delta_on: Optional[Union[str, Sequence[str]]] = None,
+    delta_result_on: Optional[Union[str, Sequence[str]]] = None,
+    delta_compare: Optional[Union[str, Sequence[str]]] = None,
 ) -> "DataChain":
     """Get data from a saved Dataset. It returns the chain itself.
     If dataset or version is not found locally, it will try to pull it from Studio.

@@ -38,6 +43,36 @@ def read_dataset(
         settings : Settings to use for the chain.
         fallback_to_studio : Try to pull dataset from Studio if not found locally.
             Default is True.
+        delta: If set to True, we optimize the creation of new dataset versions by
+            calculating the diff between the latest version of this storage and the
+            version used to create the most recent version of the resulting chain
+            dataset (the one specified in `.save()`). We then run the "diff" chain
+            using only the diff data, rather than the entire storage data, and merge
+            that diff chain with the latest version of the resulting dataset to create
+            a new version. This approach avoids applying modifications to all records
+            from storage every time, which can be an expensive operation.
+            The diff is calculated using the `DataChain.compare()` method, which
+            compares the `delta_on` fields to find matches and checks the compare
+            fields to determine if a record has changed. Note that this process only
+            considers added and modified records in storage; deleted records are not
+            removed from the new dataset version.
+            This calculation is based on the difference between the current version
+            of the source and the version used to create the dataset.
+        delta_on: A list of fields that uniquely identify rows in the source.
+            If two rows have the same values, they are considered the same (e.g., they
+            could be different versions of the same row in a versioned source).
+            This is used in the delta update to calculate the diff.
+        delta_result_on: A list of fields in the resulting dataset that correspond
+            to the `delta_on` fields from the source.
+            This is needed to identify rows that have changed in the source but are
+            already present in the current version of the resulting dataset, in order
+            to avoid including outdated versions of those rows in the new dataset.
+            We retain only the latest versions of rows to prevent duplication.
+            There is no need to define this if the `delta_on` fields are present in
+            the final dataset and have not been renamed.
+        delta_compare: A list of fields used to check if the same row has been modified
+            in the new version of the source.
+            If not defined, all fields except those defined in delta_on will be used.
 
     Example:
         ```py

@@ -113,7 +148,12 @@ def read_dataset(
         signals_schema |= SignalSchema.deserialize(query.feature_schema)
     else:
         signals_schema |= SignalSchema.from_column_types(query.column_types or {})
-    return DataChain(query, _settings, signals_schema)
+    chain = DataChain(query, _settings, signals_schema)
+    if delta:
+        chain = chain._as_delta(
+            on=delta_on, right_on=delta_result_on, compare=delta_compare
+        )
+    return chain
 
 
 def datasets(
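Delta mode also composes with dataset-to-dataset pipelines, and `delta_result_on` covers the renamed-key case the docstring describes. A sketch under the same caveats (all names are hypothetical):

```python
import datachain as dc

# Hypothetical: the saved chain renames "id" to "source_id", so the resulting
# dataset needs delta_result_on to match rows back to the source keys.
derived = dc.read_dataset(
    "source-dataset",
    delta=True,
    delta_on="id",
    delta_result_on="source_id",
).save("derived-dataset")
```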
datachain/lib/dc/storage.py
CHANGED

@@ -1,11 +1,12 @@
 import os.path
+from collections.abc import Sequence
+from functools import reduce
 from typing import (
     TYPE_CHECKING,
     Optional,
     Union,
 )
 
-from datachain.error import DatasetNotFoundError
 from datachain.lib.file import (
     FileType,
     get_file_type,

@@ -33,6 +34,10 @@ def read_storage(
     column: str = "file",
     update: bool = False,
     anon: bool = False,
+    delta: Optional[bool] = False,
+    delta_on: Optional[Union[str, Sequence[str]]] = None,
+    delta_result_on: Optional[Union[str, Sequence[str]]] = None,
+    delta_compare: Optional[Union[str, Sequence[str]]] = None,
     client_config: Optional[dict] = None,
 ) -> "DataChain":
     """Get data from storage(s) as a list of file with all file attributes.

@@ -48,6 +53,36 @@ def read_storage(
         update : force storage reindexing. Default is False.
         anon : If True, we will treat cloud bucket as public one
         client_config : Optional client configuration for the storage client.
+        delta: If set to True, we optimize the creation of new dataset versions by
+            calculating the diff between the latest version of this storage and the
+            version used to create the most recent version of the resulting chain
+            dataset (the one specified in `.save()`). We then run the "diff" chain
+            using only the diff data, rather than the entire storage data, and merge
+            that diff chain with the latest version of the resulting dataset to create
+            a new version. This approach avoids applying modifications to all records
+            from storage every time, which can be an expensive operation.
+            The diff is calculated using the `DataChain.compare()` method, which
+            compares the `delta_on` fields to find matches and checks the compare
+            fields to determine if a record has changed. Note that this process only
+            considers added and modified records in storage; deleted records are not
+            removed from the new dataset version.
+            This calculation is based on the difference between the current version
+            of the source and the version used to create the dataset.
+        delta_on: A list of fields that uniquely identify rows in the source.
+            If two rows have the same values, they are considered the same (e.g., they
+            could be different versions of the same row in a versioned source).
+            This is used in the delta update to calculate the diff.
+        delta_result_on: A list of fields in the resulting dataset that correspond
+            to the `delta_on` fields from the source.
+            This is needed to identify rows that have changed in the source but are
+            already present in the current version of the resulting dataset, in order
+            to avoid including outdated versions of those rows in the new dataset.
+            We retain only the latest versions of rows to prevent duplication.
+            There is no need to define this if the `delta_on` fields are present in
+            the final dataset and have not been renamed.
+        delta_compare: A list of fields used to check if the same row has been modified
+            in the new version of the source.
+            If not defined, all fields except those defined in `delta_on` will be used.
 
     Returns:
         DataChain: A DataChain object containing the file information.

@@ -107,7 +142,7 @@ def read_storage(
     if not uris:
         raise ValueError("No URIs provided")
 
-    storage_chain = None
+    chains = []
     listed_ds_name = set()
     file_values = []
 

@@ -132,11 +167,6 @@ def read_storage(
 
     def lst_fn(ds_name, lst_uri):
         # disable prefetch for listing, as it pre-downloads all files
-        try:
-            version = catalog.get_dataset(ds_name).next_version_major
-        except DatasetNotFoundError:
-            version = None
-
         (
             read_records(
                 DataChain.DEFAULT_FILE_RECORD,

@@ -150,18 +180,18 @@ def read_storage(
                 output={f"{column}": file_type},
             )
             # for internal listing datasets, we always bump major version
-            .save(ds_name, listing=True, version=version)
+            .save(ds_name, listing=True, update_version="major")
         )
 
         dc._query.set_listing_fn(
             lambda ds_name=list_ds_name, lst_uri=list_uri: lst_fn(ds_name, lst_uri)
         )
 
-        chain = ls(dc, list_path, recursive=recursive, column=column)
-
-        storage_chain = storage_chain.union(chain) if storage_chain else chain
+        chains.append(ls(dc, list_path, recursive=recursive, column=column))
         listed_ds_name.add(list_ds_name)
 
+    storage_chain = None if not chains else reduce(lambda x, y: x.union(y), chains)
+
     if file_values:
         file_chain = read_values(
             session=session,

@@ -176,4 +206,8 @@ def read_storage(
 
     assert storage_chain is not None
 
+    if delta:
+        storage_chain = storage_chain._as_delta(
+            on=delta_on, right_on=delta_result_on, compare=delta_compare
+        )
     return storage_chain
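The `reduce` refactor replaces the incremental `storage_chain = storage_chain.union(chain) if storage_chain else chain` pattern with a single fold over the collected chains; behavior is equivalent. A standalone illustration with lists standing in for chains:

```python
from functools import reduce

chains = [[1], [2, 3], [4]]                  # stand-ins for per-URI chains
merged = reduce(lambda x, y: x + y, chains)  # `+` plays the role of .union()
assert merged == [1, 2, 3, 4]

chains = []
storage_chain = None if not chains else reduce(lambda x, y: x + y, chains)
assert storage_chain is None                 # mirrors the new empty-list guard
```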
datachain/lib/signal_schema.py
CHANGED

@@ -461,14 +461,13 @@ class SignalSchema:
             pos += 1
         return objs
 
-    def contains_file(self) -> bool:
-        for type_ in self.values.values():
-            if (fr := ModelStore.to_pydantic(type_)) is not None and issubclass(
+    def get_file_signal(self) -> Optional[str]:
+        for signal_name, signal_type in self.values.items():
+            if (fr := ModelStore.to_pydantic(signal_type)) is not None and issubclass(
                 fr, File
             ):
-                return True
-
-        return False
+                return signal_name
+        return None
 
     def slice(
         self,

@@ -705,6 +704,13 @@ class SignalSchema:
 
         return SignalSchema(self.values | schema_right)
 
+    def append(self, right: "SignalSchema") -> "SignalSchema":
+        missing_schema = {
+            key: right.values[key]
+            for key in [k for k in right.values if k not in self.values]
+        }
+        return SignalSchema(self.values | missing_schema)
+
     def get_signals(self, target_type: type[DataModel]) -> Iterator[str]:
         for path, type_, has_subtree, _ in self.get_flat_tree():
             if has_subtree and issubclass(type_, target_type):
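The new `append` differs from a plain `|` merge in that keys already present on the left keep their types; only missing keys come from the right. A dict sketch of the semantics (plain dicts standing in for `SignalSchema.values`):

```python
left = {"a": int, "b": str}
right = {"b": float, "c": bytes}

missing = {k: v for k, v in right.items() if k not in left}
merged = left | missing
assert merged == {"a": int, "b": str, "c": bytes}  # left's "b" wins
```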
datachain/query/dataset.py
CHANGED

@@ -41,7 +41,7 @@ from datachain.data_storage.schema import (
     partition_col_names,
     partition_columns,
 )
-from datachain.dataset import DATASET_PREFIX, DatasetStatus, RowDict
+from datachain.dataset import DATASET_PREFIX, DatasetDependency, DatasetStatus, RowDict
 from datachain.error import DatasetNotFoundError, QueryScriptCancelError
 from datachain.func.base import Function
 from datachain.lib.listing import is_listing_dataset, listing_dataset_expired

@@ -166,11 +166,13 @@ class Step(ABC):
 
 @frozen
 class QueryStep:
+    """A query that returns all rows from specific dataset version"""
+
     catalog: "Catalog"
     dataset_name: str
     dataset_version: str
 
-    def apply(self):
+    def apply(self) -> "StepResult":
         def q(*columns):
             return sqlalchemy.select(*columns)
 

@@ -1127,9 +1129,14 @@ class DatasetQuery:
         self.version = version
 
         if is_listing_dataset(name):
-            # not setting query step yet as listing dataset might not exist at
-            # this point
-            self.list_ds_name = name
+            if version:
+                # this listing dataset should already be listed as we specify
+                # exact version
+                self._set_starting_step(self.catalog.get_dataset(name))
+            else:
+                # not setting query step yet as listing dataset might not exist at
+                # this point
+                self.list_ds_name = name
         elif fallback_to_studio and is_token_set():
             self._set_starting_step(
                 self.catalog.get_dataset_with_remote_fallback(name, version)

@@ -1205,11 +1212,8 @@ class DatasetQuery:
         """Setting listing function to be run if needed"""
         self.listing_fn = fn
 
-    def apply_steps(self) -> QueryGenerator:
-        """
-        Apply the steps in the query and return the resulting
-        sqlalchemy.SelectBase.
-        """
+    def apply_listing_pre_step(self) -> None:
+        """Runs listing pre-step if needed"""
         if self.list_ds_name and not self.starting_step:
             listing_ds = None
             try:

@@ -1225,6 +1229,13 @@ class DatasetQuery:
             # at this point we know what is our starting listing dataset name
             self._set_starting_step(listing_ds)  # type: ignore [arg-type]
 
+    def apply_steps(self) -> QueryGenerator:
+        """
+        Apply the steps in the query and return the resulting
+        sqlalchemy.SelectBase.
+        """
+        self.apply_listing_pre_step()
+
         query = self.clone()
 
         index = os.getenv("DATACHAIN_QUERY_CHUNK_INDEX", self._chunk_index)

@@ -1687,8 +1698,10 @@ class DatasetQuery:
         name: Optional[str] = None,
         version: Optional[str] = None,
         feature_schema: Optional[dict] = None,
+        dependencies: Optional[list[DatasetDependency]] = None,
         description: Optional[str] = None,
         attrs: Optional[list[str]] = None,
+        update_version: Optional[str] = "patch",
         **kwargs,
     ) -> "Self":
         """Save the query as a dataset."""

@@ -1723,6 +1736,7 @@ class DatasetQuery:
                 columns=columns,
                 description=description,
                 attrs=attrs,
+                update_version=update_version,
                 **kwargs,
             )
             version = version or dataset.latest_version

@@ -1740,6 +1754,9 @@ class DatasetQuery:
             )
             self.catalog.update_dataset_version_with_warehouse_info(dataset, version)
 
+            if dependencies:
+                # overriding dependencies
+                self.dependencies = {(dep.name, dep.version) for dep in dependencies}
             self._add_dependencies(dataset, version)  # type: ignore [arg-type]
         finally:
             self.cleanup()
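One detail worth calling out: when explicit dependencies are passed to `save()` (the delta path does this), each `DatasetDependency` is reduced to a `(name, version)` pair before being recorded. A minimal sketch with a stand-in class:

```python
from dataclasses import dataclass

@dataclass
class Dep:  # stand-in for DatasetDependency; illustrative only
    name: str
    version: str

dependencies = [Dep("source-ds", "2.0.0")]
recorded = {(d.name, d.version) for d in dependencies}
assert recorded == {("source-ds", "2.0.0")}
```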
datachain/remote/studio.py
CHANGED

@@ -388,6 +388,7 @@ class StudioClient:
         python_version: Optional[str] = None,
         requirements: Optional[str] = None,
         repository: Optional[str] = None,
+        priority: Optional[int] = None,
     ) -> Response[JobData]:
         data = {
             "query": query,

@@ -399,6 +400,7 @@ class StudioClient:
             "python_version": python_version,
             "requirements": requirements,
             "repository": repository,
+            "priority": priority,
         }
         return self._send_request("datachain/job", data)
datachain/studio.py
CHANGED

@@ -40,6 +40,7 @@ def process_jobs_args(args: "Namespace"):
         args.repository,
         args.req,
         args.req_file,
+        args.priority,
     )
 
     if args.cmd == "cancel":

@@ -266,6 +267,7 @@ def create_job(
     repository: Optional[str] = None,
     req: Optional[list[str]] = None,
     req_file: Optional[str] = None,
+    priority: Optional[int] = None,
 ):
     query_type = "PYTHON" if query_file.endswith(".py") else "SHELL"
     with open(query_file) as f:

@@ -294,6 +296,7 @@ def create_job(
         python_version=python_version,
         repository=repository,
         requirements=requirements,
+        priority=priority,
     )
     if not response.ok:
         raise DataChainError(response.message)
{datachain-0.17.1.dist-info → datachain-0.18.0.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.17.1
+Version: 0.18.0
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License-Expression: Apache-2.0

@@ -44,7 +44,7 @@ Requires-Dist: datamodel-code-generator>=0.25
 Requires-Dist: Pillow<12,>=10.0.0
 Requires-Dist: msgpack<2,>=1.0.4
 Requires-Dist: psutil
-Requires-Dist: huggingface_hub
+Requires-Dist: huggingface_hub
 Requires-Dist: iterative-telemetry>=0.0.10
 Requires-Dist: platformdirs
 Requires-Dist: dvc-studio-client<1,>=0.21
{datachain-0.17.1.dist-info → datachain-0.18.0.dist-info}/RECORD
CHANGED

@@ -3,7 +3,8 @@ datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
 datachain/asyn.py,sha256=RH_jFwJcTXxhEFomaI9yL6S3Onau6NZ6FSKfKFGtrJE,9689
 datachain/cache.py,sha256=yQblPhOh_Mq74Ma7xT1CL1idLJ0HgrQxpGVYvRy_9Eg,3623
 datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
-datachain/dataset.py,sha256=
+datachain/dataset.py,sha256=F0Yk9SmyAf0RNSAEWGjOyZ4nxgMNi538FaQaLPe7bJk,20531
+datachain/delta.py,sha256=q-ritPMxgsTh53qJYd2N1TqZ3Inxc7GJ9JED9rE-Z1M,3994
 datachain/error.py,sha256=bxAAL32lSeMgzsQDEHbGTGORj-mPzzpCRvWDPueJNN4,1092
 datachain/job.py,sha256=x5PB6d5sqx00hePNNkirESlOVAvnmkEM5ygUgQmAhsk,1262
 datachain/listing.py,sha256=JtExYIfKMFhEIIcSSWBmaxWpoS3ben7kb692cHHm4Lo,7079

@@ -14,11 +15,11 @@ datachain/progress.py,sha256=lRzxoYP4Qv2XBwD78sOkmYRzHFpZ2ExVNJF8wAeICtY,770
 datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/script_meta.py,sha256=V-LaFOZG84pD0Zc0NvejYdzwDgzITv6yHvAHggDCnuY,4978
 datachain/semver.py,sha256=t_3Y5OGLEthrstBwuwrf5pXVquEuRFu3ZoGe3ajfJB8,1715
-datachain/studio.py,sha256=
+datachain/studio.py,sha256=1J2ANFVVA1ysPxBuLibQSnSXt0U9Vfgz9ZNGikYtWdk,11038
 datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
 datachain/utils.py,sha256=DNqOi-Ydb7InyWvD9m7_yailxz6-YGpZzh00biQaHNo,15305
 datachain/catalog/__init__.py,sha256=cMZzSz3VoUi-6qXSVaHYN-agxQuAcz2XSqnEPZ55crE,353
-datachain/catalog/catalog.py,sha256=
+datachain/catalog/catalog.py,sha256=aB8IGLuvWjZVROOmOKksA0gKiLQyur9Z4GCRdjgfdXo,58725
 datachain/catalog/datasource.py,sha256=IkGMh0Ttg6Q-9DWfU_H05WUnZepbGa28HYleECi6K7I,1353
 datachain/catalog/loader.py,sha256=UXjYD6BNRoupPvkiz3-b04jepXhtLHCA4gzKFnXxOtQ,5987
 datachain/cli/__init__.py,sha256=eKCyqT05OMESHXCC93iQdqzusBdk1ptqZbBeaEghkgo,8344

@@ -32,7 +33,7 @@ datachain/cli/commands/misc.py,sha256=c0DmkOLwcDI2YhA8ArOuLJk6aGzSMZCiKL_E2JGibV
 datachain/cli/commands/query.py,sha256=Xzfgh14nPVH-sclqX1tpZqgfdTugw5s_44v0D33z6FA,1505
 datachain/cli/commands/show.py,sha256=Cf8wBs12h-xtdOzjU5GTDy2C8rF5HJSF0hDJYER1zH8,1606
 datachain/cli/parser/__init__.py,sha256=sjCIcosAtZqa0m50GMQHqmCkZSYxKyZNwQ29XwRQlP0,15913
-datachain/cli/parser/job.py,sha256=
+datachain/cli/parser/job.py,sha256=acdVYuTsqluRDI_FYhZ1ohjQcVtBj-taUm8y9tGb0_0,4580
 datachain/cli/parser/studio.py,sha256=Y-1OlQGecLVi9QofvWUfSlPd2ISyaESf7QFGZqGsrdw,3609
 datachain/cli/parser/utils.py,sha256=rETdD-9Hq9A4OolgfT7jQw4aoawtbfmkdtH6E7nkhpI,2888
 datachain/client/__init__.py,sha256=1kDpCPoibMXi1gExR4lTLc5pi-k6M5TANiwtXkPoLhU,49

@@ -50,8 +51,8 @@ datachain/data_storage/metastore.py,sha256=vo2ab-U_-BKfeFYTmvpbCoMyMZEVxrVqM9Djj
 datachain/data_storage/schema.py,sha256=asZYz1cg_WKfe2Q-k5W51E2z2CzHU5B4QEDZDMFr8yo,9346
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
 datachain/data_storage/sqlite.py,sha256=bwZAB_NUMT2WMv5tPQnnLFA0P-PiQtxzSaQ1q6xDxOU,24590
-datachain/data_storage/warehouse.py,sha256=
-datachain/diff/__init__.py,sha256=
+datachain/data_storage/warehouse.py,sha256=RkdX1cunfmpDkRYRdOGNy0kLw7RekIokVl3Dd0i-hrA,31534
+datachain/diff/__init__.py,sha256=-OFZzgOplqO84iWgGY7kfe60NXaWR9JRIh9T-uJboAM,9668
 datachain/fs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/fs/reference.py,sha256=A8McpXF0CqbXPqanXuvpKu50YLB3a2ZXA3YAPxtBXSM,914
 datachain/fs/utils.py,sha256=s-FkTOCGBk-b6TT3toQH51s9608pofoFjUSTc1yy7oE,825

@@ -80,7 +81,7 @@ datachain/lib/meta_formats.py,sha256=Epydbdch1g4CojK8wd_ePzmwmljC4fVWlJtZ16jsX-A
 datachain/lib/model_store.py,sha256=DNIv8Y6Jtk1_idNLzIpsThOsdW2BMAudyUCbPUcgcxk,2515
 datachain/lib/pytorch.py,sha256=elrmJ4YUDC2LZ9yXM1KwImVBOYIBJf6k0ZR7eSe6Aao,7712
 datachain/lib/settings.py,sha256=ZELRCTLbi5vzRPiDX6cQ9LLg9TefJ_A05gIGni0lll8,2535
-datachain/lib/signal_schema.py,sha256=
+datachain/lib/signal_schema.py,sha256=Zhg8qThFDf9eoNWFH6KGeYB-sIGys7A_ybq2CUBG7Dg,36127
 datachain/lib/tar.py,sha256=3WIzao6yD5fbLqXLTt9GhPGNonbFIs_fDRu-9vgLgsA,1038
 datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
 datachain/lib/udf.py,sha256=FWqA476ygdk4MU-0qehYKxvnt8Tekh21Cyf3RgddD1k,16674

@@ -98,15 +99,15 @@ datachain/lib/convert/values_to_tuples.py,sha256=j5yZMrVUH6W7b-7yUvdCTGI7JCUAYUO
 datachain/lib/dc/__init__.py,sha256=HD0NYrdy44u6kkpvgGjJcvGz-UGTHui2azghcT8ZUg0,838
 datachain/lib/dc/csv.py,sha256=q6a9BpapGwP6nwy6c5cklxQumep2fUp9l2LAjtTJr6s,4411
 datachain/lib/dc/database.py,sha256=g5M6NjYR1T0vKte-abV-3Ejnm-HqxTIMir5cRi_SziE,6051
-datachain/lib/dc/datachain.py,sha256=
-datachain/lib/dc/datasets.py,sha256=
+datachain/lib/dc/datachain.py,sha256=DIlbIyO3aUidDTo3S2pOrSDyrVhr49iLCFGgL_otSig,80558
+datachain/lib/dc/datasets.py,sha256=G65leCuo_3bItmvjoV1wK0pzj7a2IQqe3xRsflpF3xM,10794
 datachain/lib/dc/hf.py,sha256=PJl2wiLjdRsMz0SYbLT-6H8b-D5i2WjeH7li8HHOk_0,2145
 datachain/lib/dc/json.py,sha256=ZUThPDAaP2gBFIL5vsQTwKBcuN_dhvC_O44wdDv0jEc,2683
 datachain/lib/dc/listings.py,sha256=2na9v63xO1vPUNaoBSzA-TSN49V7zQAb-4iS1wOPLFE,1029
 datachain/lib/dc/pandas.py,sha256=ObueUXDUFKJGu380GmazdG02ARpKAHPhSaymfmOH13E,1489
 datachain/lib/dc/parquet.py,sha256=zYcSgrWwyEDW9UxGUSVdIVsCu15IGEf0xL8KfWQqK94,1782
 datachain/lib/dc/records.py,sha256=J1I69J2gFIBjRTGr2LG-5qn_rTVzRLcr2y3tVDrmHdg,3068
-datachain/lib/dc/storage.py,sha256=
+datachain/lib/dc/storage.py,sha256=YUlw3OtdRmYc2k24AmqjnqJK8k1H-onjh-mCxu_3BbE,8195
 datachain/lib/dc/utils.py,sha256=VawOAlJSvAtZbsMg33s5tJe21TRx1Km3QggI1nN6tnw,3984
 datachain/lib/dc/values.py,sha256=7l1n352xWrEdql2NhBcZ3hj8xyPglWiY4qHjFPjn6iw,1428
 datachain/model/__init__.py,sha256=R9faX5OHV1xh2EW-g2MPedwbtEqt3LodJRyluB-QylI,189

@@ -120,7 +121,7 @@ datachain/model/ultralytics/pose.py,sha256=gXAWfAk4OWZl93hKcQPKZvqJa3nIrECB4RM8K
 datachain/model/ultralytics/segment.py,sha256=koq1HASo29isf0in6oSlzmU4IzsmOXe87F1ajQQVfh4,2911
 datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
 datachain/query/batch.py,sha256=-goxLpE0EUvaDHu66rstj53UnfHpYfBUGux8GSpJ93k,4306
-datachain/query/dataset.py,sha256=
+datachain/query/dataset.py,sha256=ac4mameklmZ-mnL3ZSzv5n8teaPnoXT8aWCdGlgkZE8,59904
 datachain/query/dispatch.py,sha256=15M3zlTUFKM6D2ijITX4o5QxCkRe2klkODsIDi3aQOg,15544
 datachain/query/metrics.py,sha256=DOK5HdNVaRugYPjl8qnBONvTkwjMloLqAr7Mi3TjCO0,858
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863

@@ -130,7 +131,7 @@ datachain/query/session.py,sha256=3nyOvPmLiA86IdHc3BL6Dt_himtHVvaDz_I1h3hZ_gI,65
 datachain/query/udf.py,sha256=e753bDJzTNjGFQn1WGTvOAWSwjDbrFI1-_DDWkWN2ls,1343
 datachain/query/utils.py,sha256=HaSDNH_XGvp_NIcXjcB7j4vJRPi4_tbztDWclYelHY4,1208
 datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/remote/studio.py,sha256=
+datachain/remote/studio.py,sha256=pkgrhG7Bc5Z8RykgTg0S_XXiI8CpRnEbyXrOb5osgAM,13598
 datachain/sql/__init__.py,sha256=6SQRdbljO3d2hx3EAVXEZrHQKv5jth0Jh98PogT59No,262
 datachain/sql/selectable.py,sha256=cTc60qVoAwqqss0Vop8Lt5Z-ROnM1XrQmL_GLjRxhXs,1765
 datachain/sql/types.py,sha256=ASSPkmM5EzdRindqj2O7WHLXq8VHAgFYedG8lYfGvVI,14045

@@ -152,9 +153,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
 datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.17.1.dist-info/licenses/LICENSE,sha256=
-datachain-0.17.1.dist-info/METADATA,sha256=
-datachain-0.17.1.dist-info/WHEEL,sha256=
-datachain-0.17.1.dist-info/entry_points.txt,sha256=
-datachain-0.17.1.dist-info/top_level.txt,sha256=
-datachain-0.17.1.dist-info/RECORD,,
+datachain-0.18.0.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.18.0.dist-info/METADATA,sha256=seFHQYDt0EnbQiTRz-SixSCKTMFmF9p94Bd0E4lAyvY,11331
+datachain-0.18.0.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
+datachain-0.18.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.18.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.18.0.dist-info/RECORD,,
{datachain-0.17.1.dist-info → datachain-0.18.0.dist-info}/entry_points.txt
File without changes

{datachain-0.17.1.dist-info → datachain-0.18.0.dist-info}/licenses/LICENSE
File without changes

{datachain-0.17.1.dist-info → datachain-0.18.0.dist-info}/top_level.txt
File without changes