datachain 0.16.4__py3-none-any.whl → 0.17.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registries. It is provided for informational purposes only.
Potentially problematic release.
- datachain/catalog/catalog.py +25 -92
- datachain/cli/__init__.py +11 -9
- datachain/cli/commands/datasets.py +1 -1
- datachain/cli/commands/query.py +1 -0
- datachain/cli/commands/show.py +1 -1
- datachain/cli/parser/__init__.py +11 -3
- datachain/data_storage/job.py +1 -0
- datachain/data_storage/metastore.py +105 -94
- datachain/data_storage/sqlite.py +8 -7
- datachain/data_storage/warehouse.py +58 -46
- datachain/dataset.py +88 -45
- datachain/lib/arrow.py +23 -1
- datachain/lib/dataset_info.py +2 -1
- datachain/lib/dc/csv.py +1 -0
- datachain/lib/dc/datachain.py +38 -16
- datachain/lib/dc/datasets.py +28 -7
- datachain/lib/dc/storage.py +10 -2
- datachain/lib/listing.py +2 -0
- datachain/lib/pytorch.py +2 -2
- datachain/lib/udf.py +17 -5
- datachain/listing.py +1 -1
- datachain/query/batch.py +40 -39
- datachain/query/dataset.py +42 -41
- datachain/query/dispatch.py +137 -75
- datachain/query/metrics.py +1 -2
- datachain/query/queue.py +1 -11
- datachain/query/session.py +2 -2
- datachain/query/udf.py +1 -1
- datachain/query/utils.py +8 -14
- datachain/remote/studio.py +4 -4
- datachain/semver.py +58 -0
- datachain/studio.py +1 -1
- datachain/utils.py +3 -0
- {datachain-0.16.4.dist-info → datachain-0.17.0.dist-info}/METADATA +1 -1
- {datachain-0.16.4.dist-info → datachain-0.17.0.dist-info}/RECORD +39 -38
- {datachain-0.16.4.dist-info → datachain-0.17.0.dist-info}/WHEEL +1 -1
- {datachain-0.16.4.dist-info → datachain-0.17.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.16.4.dist-info → datachain-0.17.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.16.4.dist-info → datachain-0.17.0.dist-info}/top_level.txt +0 -0
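
The headline change in 0.17.0 is that dataset versions move from integers to semver strings, backed by the new `datachain/semver.py` module (+58 lines). A before/after sketch of the user-facing API, based on the docstring updates in the diffs below (`my_cats` is the example dataset name those docstrings use):

```py
import datachain as dc

# 0.16.x accepted integer versions:
# chain = dc.read_dataset("my_cats", version=1)

# 0.17.0 uses semver strings; integers are still accepted and resolve to
# the latest version with that major part (see read_dataset below)
chain = dc.read_dataset("my_cats", version="1.0.0")

# saving without a version bumps the patch part, e.g. 1.0.0 -> 1.0.1
chain.save("my_cats")
```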
datachain/lib/dataset_info.py
CHANGED
@@ -6,6 +6,7 @@ from uuid import uuid4
 from pydantic import Field, field_validator
 
 from datachain.dataset import (
+    DEFAULT_DATASET_VERSION,
     DatasetListRecord,
     DatasetListVersion,
     DatasetStatus,
@@ -22,7 +23,7 @@ if TYPE_CHECKING:
 class DatasetInfo(DataModel):
     name: str
     uuid: str = Field(default=str(uuid4()))
-    version: int = Field(default=1)
+    version: str = Field(default=DEFAULT_DATASET_VERSION)
     status: int = Field(default=DatasetStatus.CREATED)
     created_at: datetime = Field(default=TIME_ZERO)
     finished_at: Optional[datetime] = Field(default=None)
datachain/lib/dc/csv.py
CHANGED
datachain/lib/dc/datachain.py
CHANGED
@@ -23,6 +23,7 @@ import sqlalchemy
 from pydantic import BaseModel
 from tqdm import tqdm
 
+from datachain import semver
 from datachain.dataset import DatasetRecord
 from datachain.func import literal
 from datachain.func.base import Function
@@ -214,7 +215,7 @@ class DataChain:
         return self._query.name
 
     @property
-    def version(self) -> Optional[int]:
+    def version(self) -> Optional[str]:
         """Version of the underlying dataset, if there is one."""
         return self._query.version
 
@@ -457,7 +458,7 @@
     def save(  # type: ignore[override]
         self,
         name: str,
-        version: Optional[int] = None,
+        version: Optional[str] = None,
         description: Optional[str] = None,
         attrs: Optional[list[str]] = None,
         **kwargs,
@@ -466,11 +467,15 @@
 
         Parameters:
             name : dataset name.
-            version : version of a dataset.
+            version : version of a dataset. If version is not specified and dataset
+                already exists, version patch increment will happen e.g 1.2.1 -> 1.2.2.
             description : description of a dataset.
             attrs : attributes of a dataset. They can be without value, e.g "NLP",
                 or with a value, e.g "location=US".
         """
+        if version is not None:
+            semver.validate(version)
+
         schema = self.signals_schema.clone_without_sys_signals().serialize()
         return self._evolve(
             query=self._query.save(
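
The `semver.validate(version)` call above comes from the new `datachain/semver.py` module, whose contents are not shown in this diff. A minimal sketch of what such a validator plausibly looks like (the names and regex are assumptions, not the actual module):

```py
import re

# Assumed MAJOR.MINOR.PATCH format, per the "1.2.1 -> 1.2.2" docstring above
SEMVER_RE = re.compile(r"^(\d+)\.(\d+)\.(\d+)$")


def parse(version: str) -> tuple[int, int, int]:
    match = SEMVER_RE.match(version)
    if not match:
        raise ValueError(
            f"Invalid version: {version}. Expected MAJOR.MINOR.PATCH, e.g. 1.2.3"
        )
    major, minor, patch = (int(part) for part in match.groups())
    return major, minor, patch


def validate(version: str) -> None:
    parse(version)  # raises ValueError on malformed input
```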
@@ -1636,18 +1641,27 @@
         """
         from pyarrow.dataset import CsvFileFormat, JsonFileFormat
 
-        from datachain.lib.arrow import ArrowGenerator, infer_schema, schema_to_output
+        from datachain.lib.arrow import (
+            ArrowGenerator,
+            fix_pyarrow_format,
+            infer_schema,
+            schema_to_output,
+        )
 
-        if nrows:
-            format = kwargs.get("format")
-            if format not in ["csv", "json"] and not isinstance(
-                format, (CsvFileFormat, JsonFileFormat)
-            ):
-                raise DatasetPrepareError(
-                    self.name,
-                    "error in `parse_tabular` - "
-                    "`nrows` only supported for csv and json formats.",
-                )
+        parse_options = kwargs.pop("parse_options", None)
+        if format := kwargs.get("format"):
+            kwargs["format"] = fix_pyarrow_format(format, parse_options)
+
+        if (
+            nrows
+            and format not in ["csv", "json"]
+            and not isinstance(format, (CsvFileFormat, JsonFileFormat))
+        ):
+            raise DatasetPrepareError(
+                self.name,
+                "error in `parse_tabular` - "
+                "`nrows` only supported for csv and json formats.",
+            )
 
         if "file" not in self.schema or not self.count():
             raise DatasetPrepareError(self.name, "no files to parse.")
@@ -1656,7 +1670,7 @@
         col_names = output if isinstance(output, Sequence) else None
         if col_names or not output:
             try:
-                schema = infer_schema(self, **kwargs)
+                schema = infer_schema(self, **kwargs, parse_options=parse_options)
                 output, _ = schema_to_output(schema, col_names)
             except ValueError as e:
                 raise DatasetPrepareError(self.name, e) from e
@@ -1682,7 +1696,15 @@
         # disable prefetch if nrows is set
         settings = {"prefetch": 0} if nrows else {}
         return self.settings(**settings).gen(  # type: ignore[arg-type]
-            ArrowGenerator(schema, model, source, nrows, **kwargs), output=output
+            ArrowGenerator(
+                schema,
+                model,
+                source,
+                nrows,
+                parse_options=parse_options,
+                **kwargs,
+            ),
+            output=output,
         )
 
     @classmethod
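
Taken together, these `parse_tabular` changes thread a `parse_options` argument from the caller through both schema inference and row generation. A hedged usage sketch (the bucket path is hypothetical):

```py
import datachain as dc
from pyarrow import csv as pa_csv

# parse_options is popped from kwargs, used to fix up the pyarrow format,
# and forwarded to both infer_schema and ArrowGenerator
chain = dc.read_storage("s3://my-bucket/tables/").parse_tabular(
    format="csv",
    parse_options=pa_csv.ParseOptions(delimiter=";"),
)
```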
datachain/lib/dc/datasets.py
CHANGED
@@ -1,5 +1,6 @@
-from typing import TYPE_CHECKING, Optional, get_origin, get_type_hints
+from typing import TYPE_CHECKING, Optional, Union, get_origin, get_type_hints
 
+from datachain.error import DatasetVersionNotFoundError
 from datachain.lib.dataset_info import DatasetInfo
 from datachain.lib.file import (
     File,
@@ -22,7 +23,7 @@ if TYPE_CHECKING:
 
 def read_dataset(
     name: str,
-    version: Optional[int] = None,
+    version: Optional[Union[str, int]] = None,
     session: Optional[Session] = None,
     settings: Optional[dict] = None,
     fallback_to_studio: bool = True,
@@ -49,7 +50,7 @@
     ```
 
     ```py
-    chain = dc.read_dataset("my_cats", version=1)
+    chain = dc.read_dataset("my_cats", version="1.0.0")
     ```
 
     ```py
@@ -63,7 +64,7 @@
     }
     chain = dc.read_dataset(
         name="my_cats",
-        version=1,
+        version="1.0.0",
         session=session,
         settings=settings,
         fallback_to_studio=True,
@@ -74,9 +75,29 @@
 
     from .datachain import DataChain
 
+    if version is not None:
+        try:
+            # for backward compatibility we still allow users to put version as integer
+            # in which case we are trying to find latest version where major part is
+            # equal to that input version. For example if user sets version=2, we could
+            # continue with something like 2.4.3 (assuming 2.4.3 is the biggest among
+            # all 2.* dataset versions). If dataset doesn't have any versions where
+            # major part is equal to that input, exception is thrown.
+            major = int(version)
+            dataset = Session.get(session).catalog.get_dataset(name)
+            latest_major = dataset.latest_major_version(major)
+            if not latest_major:
+                raise DatasetVersionNotFoundError(
+                    f"Dataset {name} does not have version {version}"
+                )
+            version = latest_major
+        except ValueError:
+            # version is in new semver string format, continuing as normal
+            pass
+
     query = DatasetQuery(
         name=name,
-        version=version,
+        version=version,  # type: ignore[arg-type]
         session=session,
         indexing_column_types=File._datachain_column_types,
         fallback_to_studio=fallback_to_studio,
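
In practice, the backward-compatibility branch above behaves like this (dataset name and versions are hypothetical):

```py
import datachain as dc

# exact semver string: used as-is
chain = dc.read_dataset("my_cats", version="2.4.3")

# integer: resolved via latest_major_version(2) to the newest 2.* version,
# e.g. 2.4.3; raises DatasetVersionNotFoundError if no 2.* version exists
chain = dc.read_dataset("my_cats", version=2)
```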
@@ -179,7 +200,7 @@ def datasets(
 
 def delete_dataset(
     name: str,
-    version: Optional[int] = None,
+    version: Optional[str] = None,
     force: Optional[bool] = False,
     studio: Optional[bool] = False,
     session: Optional[Session] = None,
@@ -207,7 +228,7 @@
 
     ```py
     import datachain as dc
-    dc.delete_dataset("cats", version=1)
+    dc.delete_dataset("cats", version="1.0.0")
     ```
     """
 
datachain/lib/dc/storage.py
CHANGED
@@ -5,6 +5,7 @@ from typing import (
     Union,
 )
 
+from datachain.error import DatasetNotFoundError
 from datachain.lib.file import (
     FileType,
     get_file_type,
@@ -97,7 +98,8 @@ def read_storage(
     if anon:
         client_config = (client_config or {}) | {"anon": True}
     session = Session.get(session, client_config=client_config, in_memory=in_memory)
-    cache = session.catalog.cache
+    catalog = session.catalog
+    cache = catalog.cache
     client_config = session.catalog.client_config
 
     uris = uri if isinstance(uri, (list, tuple)) else [uri]
@@ -130,6 +132,11 @@
 
     def lst_fn(ds_name, lst_uri):
         # disable prefetch for listing, as it pre-downloads all files
+        try:
+            version = catalog.get_dataset(ds_name).next_version_major
+        except DatasetNotFoundError:
+            version = None
+
         (
             read_records(
                 DataChain.DEFAULT_FILE_RECORD,
@@ -142,7 +149,8 @@
             list_bucket(lst_uri, cache, client_config=client_config),
             output={f"{column}": file_type},
         )
-        .save(ds_name, listing=True)
+        # for internal listing datasets, we always bump major version
+        .save(ds_name, listing=True, version=version)
     )
 
     dc._query.set_listing_fn(
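
`next_version_major` is a property of the dataset record that is not shown in this diff; judging by the comment above, it yields the version that bumps the major part of the latest existing version. A sketch of the assumed semantics:

```py
# Assumed behavior of next_version_major: if the latest version of a
# listing dataset is "2.4.3", the re-listing is saved as "3.0.0".
def next_version_major(latest: str) -> str:
    major, _minor, _patch = (int(part) for part in latest.split("."))
    return f"{major + 1}.0.0"


assert next_version_major("2.4.3") == "3.0.0"
```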
datachain/lib/listing.py
CHANGED
datachain/lib/pytorch.py
CHANGED
@@ -43,7 +43,7 @@ class PytorchDataset(IterableDataset):
     def __init__(
         self,
         name: str,
-        version: Optional[int] = None,
+        version: Optional[str] = None,
         catalog: Optional["Catalog"] = None,
         transform: Optional["Transform"] = None,
         tokenizer: Optional[Callable] = None,
@@ -60,7 +60,7 @@ class PytorchDataset(IterableDataset):
 
         Args:
             name (str): Name of DataChain dataset to stream.
-            version (int): Version of DataChain dataset to stream.
+            version (str): Version of DataChain dataset to stream.
             catalog (Catalog): DataChain catalog to which dataset belongs.
             transform (Transform): Torchvision transforms to apply to the dataset.
             tokenizer (Callable): Tokenizer to use to tokenize text values.
datachain/lib/udf.py
CHANGED
@@ -218,6 +218,18 @@ class UDFBase(AbstractUDF):
     def name(self):
         return self.__class__.__name__
 
+    @property
+    def verbose_name(self):
+        """Returns the name of the function or class that implements the UDF."""
+        if self._func and callable(self._func):
+            if hasattr(self._func, "__name__"):
+                return self._func.__name__
+            if hasattr(self._func, "__class__") and hasattr(
+                self._func.__class__, "__name__"
+            ):
+                return self._func.__class__.__name__
+        return "<unknown>"
+
     @property
     def signal_names(self) -> Iterable[str]:
         return self.output.to_udf_spec().keys()
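
A standalone illustration of the resolution order in `verbose_name`: plain functions carry `__name__`, while callable instances fall back to their class name (the `double`/`Doubler` names here are illustrative only):

```py
def double(x):
    return x * 2


class Doubler:
    def __call__(self, x):
        return x * 2


print(double.__name__)               # "double": functions have __name__
print(Doubler().__class__.__name__)  # "Doubler": instances fall back to the class
```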
@@ -411,13 +423,13 @@ class BatchMapper(UDFBase):
         self.setup()
 
         for batch in udf_inputs:
-            n_rows = len(batch.rows)
+            n_rows = len(batch)
             row_ids, *udf_args = zip(
                 *[
                     self._prepare_row_and_id(
                         row, udf_fields, catalog, cache, download_cb
                     )
-                    for row in batch.rows
+                    for row in batch
                 ]
             )
             result_objs = list(self.process_safe(udf_args))
@@ -489,7 +501,7 @@ class Aggregator(UDFBase):
 
     def run(
         self,
-        udf_fields: "Sequence[str]",
+        udf_fields: Sequence[str],
         udf_inputs: Iterable[RowsOutputBatch],
         catalog: "Catalog",
         cache: bool,
@@ -502,13 +514,13 @@ class Aggregator(UDFBase):
             udf_args = zip(
                 *[
                     self._prepare_row(row, udf_fields, catalog, cache, download_cb)
-                    for row in batch.rows
+                    for row in batch
                 ]
             )
             result_objs = self.process_safe(udf_args)
             udf_outputs = (self._flatten_row(row) for row in result_objs)
             output = (dict(zip(self.signal_names, row)) for row in udf_outputs)
-            processed_cb.relative_update(len(batch.rows))
+            processed_cb.relative_update(len(batch))
             yield output
 
         self.teardown()
datachain/listing.py
CHANGED
datachain/query/batch.py
CHANGED
@@ -2,22 +2,14 @@ import contextlib
 import math
 from abc import ABC, abstractmethod
 from collections.abc import Generator, Sequence
-from dataclasses import dataclass
-from typing import TYPE_CHECKING, Callable, Optional, Union
-
-from datachain.data_storage.schema import PARTITION_COLUMN_ID
-from datachain.data_storage.warehouse import SELECT_BATCH_SIZE
-from datachain.query.utils import get_query_column, get_query_id_column
-
-if TYPE_CHECKING:
-    from sqlalchemy import Select
+from typing import Callable, Optional, Union
 
+import sqlalchemy as sa
 
-@dataclass
-class RowsOutputBatch:
-    rows: Sequence[Sequence]
-
+from datachain.data_storage.schema import PARTITION_COLUMN_ID
+from datachain.query.utils import get_query_column
 
+RowsOutputBatch = Sequence[Sequence]
 RowsOutput = Union[Sequence, RowsOutputBatch]
 
 
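
Note that `RowsOutputBatch` is now a plain type alias rather than a dataclass wrapper, which is why call sites in `udf.py` switch from `len(batch.rows)` to `len(batch)`. A small sketch of the difference:

```py
from collections.abc import Sequence
from typing import Union

# 0.17.0: a batch is the sequence of rows itself
RowsOutputBatch = Sequence[Sequence]
RowsOutput = Union[Sequence, RowsOutputBatch]

batch: RowsOutputBatch = [(1, "a"), (2, "b")]
assert len(batch) == 2  # previously len(batch.rows) on a dataclass instance
```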
@@ -30,8 +22,8 @@ class BatchingStrategy(ABC):
     def __call__(
         self,
         execute: Callable,
-        query: "Select",
-        ids_only: bool = False,
+        query: sa.Select,
+        id_col: Optional[sa.ColumnElement] = None,
     ) -> Generator[RowsOutput, None, None]:
         """Apply the provided parameters to the UDF."""
 
@@ -47,12 +39,16 @@ class NoBatching(BatchingStrategy):
     def __call__(
         self,
         execute: Callable,
-        query: "Select",
-        ids_only: bool = False,
+        query: sa.Select,
+        id_col: Optional[sa.ColumnElement] = None,
     ) -> Generator[Sequence, None, None]:
-        if ids_only:
-            query = query.with_only_columns(get_query_id_column(query))
-        return execute(query)
+        ids_only = False
+        if id_col is not None:
+            query = query.with_only_columns(id_col)
+            ids_only = True
+
+        rows = execute(query)
+        yield from (r[0] for r in rows) if ids_only else rows
 
 
 class Batch(BatchingStrategy):
@@ -69,27 +65,31 @@ class Batch(BatchingStrategy):
     def __call__(
         self,
         execute: Callable,
-        query: "Select",
-        ids_only: bool = False,
-    ) -> Generator[RowsOutputBatch, None, None]:
-        if ids_only:
-            query = query.with_only_columns(get_query_id_column(query))
+        query: sa.Select,
+        id_col: Optional[sa.ColumnElement] = None,
+    ) -> Generator[RowsOutput, None, None]:
+        from datachain.data_storage.warehouse import SELECT_BATCH_SIZE
+
+        ids_only = False
+        if id_col is not None:
+            query = query.with_only_columns(id_col)
+            ids_only = True
 
         # choose page size that is a multiple of the batch size
         page_size = math.ceil(SELECT_BATCH_SIZE / self.count) * self.count
 
         # select rows in batches
-        results: list[Sequence] = []
+        results = []
 
-        with contextlib.closing(execute(query, page_size=page_size)) as rows:
-            for row in rows:
+        with contextlib.closing(execute(query, page_size=page_size)) as batch_rows:
+            for row in batch_rows:
                 results.append(row)
                 if len(results) >= self.count:
                     batch, results = results[: self.count], results[self.count :]
-                    yield RowsOutputBatch(batch)
+                    yield [r[0] for r in batch] if ids_only else batch
 
         if len(results) > 0:
-            yield RowsOutputBatch(results)
+            yield [r[0] for r in results] if ids_only else results
 
 
 class Partition(BatchingStrategy):
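
The regrouping loop in `Batch.__call__` above streams rows in database pages and re-chunks them into batches of exactly `count` rows (the last batch may be smaller). A standalone sketch of just that loop:

```py
from collections.abc import Iterable, Iterator, Sequence


def rebatch(rows: Iterable[Sequence], count: int) -> Iterator[list[Sequence]]:
    """Re-chunk an incoming row stream into batches of `count` rows."""
    results: list[Sequence] = []
    for row in rows:
        results.append(row)
        if len(results) >= count:
            batch, results = results[:count], results[count:]
            yield batch
    if results:
        yield results


assert list(rebatch([(1,), (2,), (3,)], 2)) == [[(1,), (2,)], [(3,)]]
```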
@@ -104,18 +104,19 @@ class Partition(BatchingStrategy):
     def __call__(
         self,
         execute: Callable,
-        query: "Select",
-        ids_only: bool = False,
-    ) -> Generator[RowsOutputBatch, None, None]:
-        id_col = get_query_id_column(query)
+        query: sa.Select,
+        id_col: Optional[sa.ColumnElement] = None,
+    ) -> Generator[RowsOutput, None, None]:
         if (partition_col := get_query_column(query, PARTITION_COLUMN_ID)) is None:
             raise RuntimeError("partition column not found in query")
 
-        if ids_only:
+        ids_only = False
+        if id_col is not None:
             query = query.with_only_columns(id_col, partition_col)
+            ids_only = True
 
         current_partition: Optional[int] = None
-        batch: list[Sequence] = []
+        batch: list = []
 
         query_fields = [str(c.name) for c in query.selected_columns]
         id_column_idx = query_fields.index("sys__id")
@@ -132,9 +133,9 @@
             if current_partition != partition:
                 current_partition = partition
                 if len(batch) > 0:
-                    yield RowsOutputBatch(batch)
+                    yield batch
                     batch = []
-            batch.append(row)
+            batch.append(row[id_column_idx] if ids_only else row)
 
         if len(batch) > 0:
-            yield RowsOutputBatch(batch)
+            yield batch
datachain/query/dataset.py
CHANGED
@@ -42,15 +42,9 @@ from datachain.data_storage.schema import (
     partition_columns,
 )
 from datachain.dataset import DATASET_PREFIX, DatasetStatus, RowDict
-from datachain.error import (
-    DatasetNotFoundError,
-    QueryScriptCancelError,
-)
+from datachain.error import DatasetNotFoundError, QueryScriptCancelError
 from datachain.func.base import Function
-from datachain.lib.listing import (
-    is_listing_dataset,
-    listing_dataset_expired,
-)
+from datachain.lib.listing import is_listing_dataset, listing_dataset_expired
 from datachain.lib.udf import UDFAdapter, _get_cache
 from datachain.progress import CombinedDownloadCallback, TqdmCombinedDownloadCallback
 from datachain.query.schema import C, UDFParamSpec, normalize_param
@@ -89,7 +83,7 @@ PartitionByType = Union[
     Function, ColumnElement, Sequence[Union[Function, ColumnElement]]
 ]
 JoinPredicateType = Union[str, ColumnClause, ColumnElement]
-DatasetDependencyType = tuple[str, int]
+DatasetDependencyType = tuple[str, str]
 
 logger = logging.getLogger("datachain")
 
@@ -174,7 +168,7 @@ class Step(ABC):
 class QueryStep:
     catalog: "Catalog"
     dataset_name: str
-    dataset_version: int
+    dataset_version: str
 
     def apply(self):
         def q(*columns):
@@ -420,41 +414,30 @@ class UDFStep(Step, ABC):
     """
 
     def populate_udf_table(self, udf_table: "Table", query: Select) -> None:
-        from datachain.catalog import QUERY_SCRIPT_CANCELED_EXIT_CODE
-
-        rows_total = self.catalog.warehouse.query_count(query)
-        if rows_total == 0:
+        if (rows_total := self.catalog.warehouse.query_count(query)) == 0:
             return
 
+        from datachain.catalog import QUERY_SCRIPT_CANCELED_EXIT_CODE
+        from datachain.catalog.loader import (
+            DISTRIBUTED_IMPORT_PATH,
+            get_udf_distributor_class,
+        )
+
         workers = determine_workers(self.workers, rows_total=rows_total)
         processes = determine_processes(self.parallel, rows_total=rows_total)
 
         use_partitioning = self.partition_by is not None
         batching = self.udf.get_batching(use_partitioning)
         udf_fields = [str(c.name) for c in query.selected_columns]
+        udf_distributor_class = get_udf_distributor_class()
 
         prefetch = self.udf.prefetch
         with _get_cache(self.catalog.cache, prefetch, use_cache=self.cache) as _cache:
            catalog = clone_catalog_with_cache(self.catalog, _cache)
-            try:
-                if workers:
-                    if catalog.in_memory:
-                        raise RuntimeError(
-                            "In-memory databases cannot be used with "
-                            "distributed processing."
-                        )
-
-                    from datachain.catalog.loader import (
-                        DISTRIBUTED_IMPORT_PATH,
-                        get_udf_distributor_class,
-                    )
-
-                    if not (udf_distributor_class := get_udf_distributor_class()):
-                        raise RuntimeError(
-                            f"{DISTRIBUTED_IMPORT_PATH} import path is required "
-                            "for distributed UDF processing."
-                        )
 
+            try:
+                if udf_distributor_class and not catalog.in_memory:
+                    # Use the UDF distributor if available (running in SaaS)
                     udf_distributor = udf_distributor_class(
                         catalog=catalog,
                         table=udf_table,
@@ -470,7 +453,20 @@
                         min_task_size=self.min_task_size,
                     )
                     udf_distributor()
-                elif processes:
+                    return
+
+                if workers:
+                    if catalog.in_memory:
+                        raise RuntimeError(
+                            "In-memory databases cannot be used with "
+                            "distributed processing."
+                        )
+
+                    raise RuntimeError(
+                        f"{DISTRIBUTED_IMPORT_PATH} import path is required "
+                        "for distributed UDF processing."
+                    )
+                if processes:
                     # Parallel processing (faster for more CPU-heavy UDFs)
                     if catalog.in_memory:
                         raise RuntimeError(
@@ -504,7 +500,12 @@
             with subprocess.Popen(  # noqa: S603
                 cmd, env=envs, stdin=subprocess.PIPE
             ) as process:
-                process.communicate(process_data)
+                try:
+                    process.communicate(process_data)
+                except KeyboardInterrupt:
+                    raise QueryScriptCancelError(
+                        "UDF execution was canceled by the user."
+                    ) from None
                 if retval := process.poll():
                     raise RuntimeError(
                         f"UDF Execution Failed! Exit code: {retval}"
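
The pattern above converts a Ctrl-C during `process.communicate()` into datachain's own `QueryScriptCancelError` instead of letting a bare `KeyboardInterrupt` escape. A self-contained sketch of the same pattern (the error class here is a stand-in for the one in `datachain.error`):

```py
import subprocess


class QueryScriptCancelError(RuntimeError):
    """Stand-in for datachain.error.QueryScriptCancelError."""


def run_udf_subprocess(cmd: list[str], payload: bytes) -> None:
    with subprocess.Popen(cmd, stdin=subprocess.PIPE) as process:
        try:
            process.communicate(payload)
        except KeyboardInterrupt:
            raise QueryScriptCancelError(
                "UDF execution was canceled by the user."
            ) from None
        if retval := process.poll():
            raise RuntimeError(f"UDF Execution Failed! Exit code: {retval}")
```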
@@ -1091,7 +1092,7 @@ class DatasetQuery:
     def __init__(
         self,
         name: str,
-        version: Optional[int] = None,
+        version: Optional[str] = None,
         catalog: Optional["Catalog"] = None,
         session: Optional[Session] = None,
         indexing_column_types: Optional[dict[str, Any]] = None,
@@ -1111,7 +1112,7 @@
         self.table = self.get_table()
         self.starting_step: Optional[QueryStep] = None
         self.name: Optional[str] = None
-        self.version: Optional[int] = None
+        self.version: Optional[str] = None
         self.feature_schema: Optional[dict] = None
         self.column_types: Optional[dict[str, Any]] = None
         self.before_steps: list[Callable] = []
@@ -1154,7 +1155,7 @@
     def __or__(self, other):
         return self.union(other)
 
-    def pull_dataset(self, name: str, version: Optional[int] = None) -> "DatasetRecord":
+    def pull_dataset(self, name: str, version: Optional[str] = None) -> "DatasetRecord":
         print("Dataset not found in local catalog, trying to get from studio")
 
         remote_ds_uri = f"{DATASET_PREFIX}{name}"
@@ -1184,8 +1185,8 @@
         it completely. If this is the case, name and version of underlying dataset
         will be defined.
         DatasetQuery instance can become attached in two scenarios:
-            1. ds = DatasetQuery(name="dogs", version=1) -> ds is attached to dogs
-            2. ds = ds.save("dogs", version=1) -> ds is attached to dogs dataset
+            1. ds = DatasetQuery(name="dogs", version="1.0.0") -> ds is attached to dogs
+            2. ds = ds.save("dogs", version="1.0.0") -> ds is attached to dogs dataset
         It can move to detached state if filter or similar methods are called on it,
         as then it no longer 100% represents underlying datasets.
         """
@@ -1662,7 +1663,7 @@
         )
         return query
 
-    def _add_dependencies(self, dataset: "DatasetRecord", version: int):
+    def _add_dependencies(self, dataset: "DatasetRecord", version: str):
         for dependency in self.dependencies:
            ds_dependency_name, ds_dependency_version = dependency
             self.catalog.metastore.add_dataset_dependency(
@@ -1684,7 +1685,7 @@
     def save(
         self,
         name: Optional[str] = None,
-        version: Optional[int] = None,
+        version: Optional[str] = None,
         feature_schema: Optional[dict] = None,
         description: Optional[str] = None,
         attrs: Optional[list[str]] = None,