datachain 0.20.4__py3-none-any.whl → 0.21.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datachain might be problematic.

Files changed (47)
  1. datachain/__init__.py +0 -2
  2. datachain/cache.py +2 -2
  3. datachain/catalog/catalog.py +65 -180
  4. datachain/cli/__init__.py +11 -2
  5. datachain/cli/commands/datasets.py +28 -43
  6. datachain/cli/commands/ls.py +2 -2
  7. datachain/cli/parser/__init__.py +35 -1
  8. datachain/client/fsspec.py +3 -5
  9. datachain/client/hf.py +0 -10
  10. datachain/client/local.py +4 -4
  11. datachain/data_storage/metastore.py +37 -405
  12. datachain/data_storage/sqlite.py +7 -136
  13. datachain/data_storage/warehouse.py +7 -26
  14. datachain/dataset.py +12 -126
  15. datachain/delta.py +7 -11
  16. datachain/error.py +0 -36
  17. datachain/func/func.py +1 -1
  18. datachain/lib/arrow.py +3 -3
  19. datachain/lib/dataset_info.py +0 -4
  20. datachain/lib/dc/datachain.py +92 -260
  21. datachain/lib/dc/datasets.py +50 -104
  22. datachain/lib/dc/listings.py +3 -3
  23. datachain/lib/dc/records.py +0 -1
  24. datachain/lib/dc/storage.py +40 -38
  25. datachain/lib/file.py +23 -77
  26. datachain/lib/listing.py +1 -3
  27. datachain/lib/meta_formats.py +1 -1
  28. datachain/lib/pytorch.py +1 -1
  29. datachain/lib/settings.py +0 -10
  30. datachain/lib/tar.py +2 -1
  31. datachain/lib/udf_signature.py +1 -1
  32. datachain/lib/webdataset.py +20 -30
  33. datachain/listing.py +1 -3
  34. datachain/query/dataset.py +46 -71
  35. datachain/query/session.py +1 -1
  36. datachain/remote/studio.py +26 -61
  37. datachain/studio.py +20 -27
  38. {datachain-0.20.4.dist-info → datachain-0.21.1.dist-info}/METADATA +2 -2
  39. {datachain-0.20.4.dist-info → datachain-0.21.1.dist-info}/RECORD +43 -47
  40. datachain/lib/namespaces.py +0 -71
  41. datachain/lib/projects.py +0 -86
  42. datachain/namespace.py +0 -65
  43. datachain/project.py +0 -78
  44. {datachain-0.20.4.dist-info → datachain-0.21.1.dist-info}/WHEEL +0 -0
  45. {datachain-0.20.4.dist-info → datachain-0.21.1.dist-info}/entry_points.txt +0 -0
  46. {datachain-0.20.4.dist-info → datachain-0.21.1.dist-info}/licenses/LICENSE +0 -0
  47. {datachain-0.20.4.dist-info → datachain-0.21.1.dist-info}/top_level.txt +0 -0
@@ -1,17 +1,11 @@
  from collections.abc import Sequence
  from typing import TYPE_CHECKING, Optional, Union, get_origin, get_type_hints

- from datachain.dataset import parse_dataset_name
- from datachain.error import (
- DatasetNotFoundError,
- DatasetVersionNotFoundError,
- ProjectNotFoundError,
- )
+ from datachain.error import DatasetVersionNotFoundError
  from datachain.lib.dataset_info import DatasetInfo
  from datachain.lib.file import (
  File,
  )
- from datachain.lib.projects import get as get_project
  from datachain.lib.settings import Settings
  from datachain.lib.signal_schema import SignalSchema
  from datachain.query import Session
@@ -30,18 +24,12 @@ if TYPE_CHECKING:

  def read_dataset(
  name: str,
- namespace: Optional[str] = None,
- project: Optional[str] = None,
  version: Optional[Union[str, int]] = None,
  session: Optional[Session] = None,
  settings: Optional[dict] = None,
  fallback_to_studio: bool = True,
  delta: Optional[bool] = False,
- delta_on: Optional[Union[str, Sequence[str]]] = (
- "file.path",
- "file.etag",
- "file.version",
- ),
+ delta_on: Optional[Union[str, Sequence[str]]] = None,
  delta_result_on: Optional[Union[str, Sequence[str]]] = None,
  delta_compare: Optional[Union[str, Sequence[str]]] = None,
  delta_retry: Optional[Union[bool, str]] = None,
@@ -50,36 +38,47 @@ def read_dataset(
  If dataset or version is not found locally, it will try to pull it from Studio.

  Parameters:
- name: The dataset name, which can be a fully qualified name including the
- namespace and project. Alternatively, it can be a regular name, in which
- case the explicitly defined namespace and project will be used if they are
- set; otherwise, default values will be applied.
- namespace : optional name of namespace in which dataset to read is created
- project : optional name of project in which dataset to read is created
+ name : dataset name
  version : dataset version
  session : Session to use for the chain.
  settings : Settings to use for the chain.
  fallback_to_studio : Try to pull dataset from Studio if not found locally.
  Default is True.
- delta: If True, only process new or changed files instead of reprocessing
- everything. This saves time by skipping files that were already processed in
- previous versions. The optimization is working when a new version of the
- dataset is created.
- Default is False.
- delta_on: Field(s) that uniquely identify each record in the source data.
- Used to detect which records are new or changed.
- Default is ("file.path", "file.etag", "file.version").
- delta_result_on: Field(s) in the result dataset that match `delta_on` fields.
- Only needed if you rename the identifying fields during processing.
- Default is None.
- delta_compare: Field(s) used to detect if a record has changed.
- If not specified, all fields except `delta_on` fields are used.
- Default is None.
- delta_retry: Controls retry behavior for failed records:
- - String (field name): Reprocess records where this field is not empty
- (error mode)
- - True: Reprocess records missing from the result dataset (missing mode)
- - None: No retry processing (default)
+ delta: If set to True, we optimize the creation of new dataset versions by
+ calculating the diff between the latest version of this storage and the
+ version used to create the most recent version of the resulting chain
+ dataset (the one specified in `.save()`). We then run the "diff" chain
+ using only the diff data, rather than the entire storage data, and merge
+ that diff chain with the latest version of the resulting dataset to create
+ a new version. This approach avoids applying modifications to all records
+ from storage every time, which can be an expensive operation.
+ The diff is calculated using the `DataChain.compare()` method, which
+ compares the `delta_on` fields to find matches and checks the compare
+ fields to determine if a record has changed. Note that this process only
+ considers added and modified records in storage; deleted records are not
+ removed from the new dataset version.
+ This calculation is based on the difference between the current version
+ of the source and the version used to create the dataset.
+ delta_on: A list of fields that uniquely identify rows in the source.
+ If two rows have the same values, they are considered the same (e.g., they
+ could be different versions of the same row in a versioned source).
+ This is used in the delta update to calculate the diff.
+ delta_result_on: A list of fields in the resulting dataset that correspond
+ to the `delta_on` fields from the source.
+ This is needed to identify rows that have changed in the source but are
+ already present in the current version of the resulting dataset, in order
+ to avoid including outdated versions of those rows in the new dataset.
+ We retain only the latest versions of rows to prevent duplication.
+ There is no need to define this if the `delta_on` fields are present in
+ the final dataset and have not been renamed.
+ delta_compare: A list of fields used to check if the same row has been modified
+ in the new version of the source.
+ If not defined, all fields except those defined in delta_on will be used.
+ delta_retry: Specifies retry behavior for delta processing. If a string,
+ it's the name of a field in the result dataset that indicates an error
+ when not None - records with errors will be reprocessed. If True,
+ records that exist in the source dataset but not in the result dataset
+ will be reprocessed.

  Example:
  ```py
@@ -87,11 +86,6 @@ def read_dataset(
  chain = dc.read_dataset("my_cats")
  ```

- ```py
- import datachain as dc
- chain = dc.read_dataset("dev.animals.my_cats")
- ```
-
  ```py
  chain = dc.read_dataset("my_cats", fallback_to_studio=False)
  ```
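To illustrate the reworked `delta_*` parameters documented above, here is a minimal usage sketch. The dataset name and field names are placeholders, and note that in 0.21.1 `delta_on` defaults to `None` rather than `("file.path", "file.etag", "file.version")`.

```py
import datachain as dc

# Illustrative sketch only; "my_cats" and the field names below are placeholders.
# delta=True builds the next dataset version from the diff against the previous
# version instead of reprocessing every record.
chain = dc.read_dataset(
    "my_cats",
    delta=True,
    delta_on=["file.path", "file.etag"],  # fields that identify a source row
    delta_compare=["file.size"],          # fields that signal a modification
    delta_retry="error",                  # reprocess rows whose `error` field is set
)
```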
@@ -122,15 +116,6 @@ def read_dataset(

  from .datachain import DataChain

- session = Session.get(session)
- catalog = session.catalog
-
- namespace_name, project_name, name = parse_dataset_name(name)
- namespace_name = (
- namespace_name or namespace or catalog.metastore.default_namespace_name
- )
- project_name = project_name or project or catalog.metastore.default_project_name
-
  if version is not None:
  try:
  # for backward compatibility we still allow users to put version as integer
@@ -140,15 +125,7 @@
  # all 2.* dataset versions). If dataset doesn't have any versions where
  # major part is equal to that input, exception is thrown.
  major = int(version)
- try:
- ds_project = get_project(project_name, namespace_name, session=session)
- except ProjectNotFoundError:
- raise DatasetNotFoundError(
- f"Dataset {name} not found in namespace {namespace_name} and",
- f" project {project_name}",
- ) from None
-
- dataset = session.catalog.get_dataset(name, ds_project)
+ dataset = Session.get(session).catalog.get_dataset(name)
  latest_major = dataset.latest_major_version(major)
  if not latest_major:
  raise DatasetVersionNotFoundError(
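As the comment in this hunk notes, an integer `version` is still accepted for backward compatibility and resolves to the latest version with that major part; a short hedged sketch (the dataset name is a placeholder):

```py
import datachain as dc

# version=2 resolves to the newest 2.x.x version of the dataset;
# DatasetVersionNotFoundError is raised if no 2.* version exists.
chain = dc.read_dataset("my_cats", version=2)

# Semver strings continue to work as before.
chain = dc.read_dataset("my_cats", version="2.1.0")
```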
@@ -159,22 +136,19 @@
  # version is in new semver string format, continuing as normal
  pass

- if settings:
- _settings = Settings(**settings)
- else:
- _settings = Settings()
-
  query = DatasetQuery(
  name=name,
- project_name=project_name,
- namespace_name=namespace_name,
  version=version, # type: ignore[arg-type]
  session=session,
  indexing_column_types=File._datachain_column_types,
  fallback_to_studio=fallback_to_studio,
  )
-
  telemetry.send_event_once("class", "datachain_init", name=name, version=version)
+ if settings:
+ _settings = Settings(**settings)
+ else:
+ _settings = Settings()
+
  signals_schema = SignalSchema({"sys": Sys})
  if query.feature_schema:
  signals_schema |= SignalSchema.deserialize(query.feature_schema)
@@ -226,7 +200,7 @@ def datasets(
  import datachain as dc

  chain = dc.datasets(column="dataset")
- for ds in chain.to_iter("dataset"):
+ for ds in chain.collect("dataset"):
  print(f"{ds.name}@v{ds.version}")
  ```
  """
@@ -277,8 +251,6 @@ def datasets(

  def delete_dataset(
  name: str,
- namespace: Optional[str] = None,
- project: Optional[str] = None,
  version: Optional[str] = None,
  force: Optional[bool] = False,
  studio: Optional[bool] = False,
@@ -289,16 +261,11 @@ def delete_dataset(
  a force flag.

  Args:
- name: The dataset name, which can be a fully qualified name including the
- namespace and project. Alternatively, it can be a regular name, in which
- case the explicitly defined namespace and project will be used if they are
- set; otherwise, default values will be applied.
- namespace : optional name of namespace in which dataset to delete is created
- project : optional name of project in which dataset to delete is created
+ name : Dataset name
  version : Optional dataset version
  force: If true, all datasets versions will be removed. Defaults to False.
- studio: If True, removes dataset from Studio only, otherwise removes local
- dataset. Defaults to False.
+ studio: If True, removes dataset from Studio only,
+ otherwise remove from local. Defaults to False.
  session: Optional session instance. If not provided, uses default session.
  in_memory: If True, creates an in-memory session. Defaults to False.

@@ -315,32 +282,11 @@
  dc.delete_dataset("cats", version="1.0.0")
  ```
  """
- from datachain.studio import remove_studio_dataset

  session = Session.get(session, in_memory=in_memory)
  catalog = session.catalog
-
- namespace_name, project_name, name = parse_dataset_name(name)
- namespace_name = (
- namespace_name or namespace or catalog.metastore.default_namespace_name
- )
- project_name = project_name or project or catalog.metastore.default_project_name
-
- if not catalog.metastore.is_local_dataset(namespace_name) and studio:
- return remove_studio_dataset(
- None, name, namespace_name, project_name, version=version, force=force
- )
-
- try:
- ds_project = get_project(project_name, namespace_name, session=session)
- except ProjectNotFoundError:
- raise DatasetNotFoundError(
- f"Dataset {name} not found in namespace {namespace_name} and project",
- f" {project_name}",
- ) from None
-
  if not force:
- version = version or catalog.get_dataset(name, ds_project).latest_version
+ version = version or catalog.get_dataset(name).latest_version
  else:
  version = None
- catalog.remove_dataset(name, ds_project, version=version, force=force)
+ catalog.remove_dataset(name, version=version, force=force, studio=studio)
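A brief sketch of the simplified `delete_dataset()` call after this change; the dataset name is a placeholder and the flags follow the docstring above:

```py
import datachain as dc

# Remove a single local version, as in the docstring example.
dc.delete_dataset("cats", version="1.0.0")

# force=True removes every version of the dataset.
dc.delete_dataset("cats", force=True)

# studio=True removes the dataset from Studio instead of locally.
dc.delete_dataset("cats", version="1.0.0", studio=True)
```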
@@ -37,7 +37,7 @@ class ReadOnlyQueryStep(QueryStep):
  return sa.select(*columns)

  table_name = self.catalog.warehouse.dataset_table_name(
- self.dataset, self.dataset_version
+ self.dataset_name, self.dataset_version
  )
  dataset_row_cls = self.catalog.warehouse.schema.dataset_row_cls
  table = dataset_row_cls.new_table(
@@ -51,7 +51,7 @@ class ReadOnlyQueryStep(QueryStep):
  )

  return step_result(
- q, table.columns, dependencies=[(self.dataset, self.dataset_version)]
+ q, table.columns, dependencies=[(self.dataset_name, self.dataset_version)]
  )


@@ -142,7 +142,7 @@ def read_listing_dataset(
  _settings = Settings(prefetch=0)
  signal_schema = SignalSchema({"sys": Sys, "file": File})

- query.starting_step = ReadOnlyQueryStep(query.catalog, dataset, version)
+ query.starting_step = ReadOnlyQueryStep(query.catalog, name, version)
  query.version = version
  # We already know that this is a listing dataset,
  # so we can set the listing function to True
@@ -68,7 +68,6 @@ def read_records(

  dsr = catalog.create_dataset(
  name,
- catalog.metastore.default_project,
  columns=columns,
  feature_schema=(
  signal_schema.clone_without_sys_signals().serialize()
@@ -35,11 +35,7 @@ def read_storage(
  update: bool = False,
  anon: bool = False,
  delta: Optional[bool] = False,
- delta_on: Optional[Union[str, Sequence[str]]] = (
- "file.path",
- "file.etag",
- "file.version",
- ),
+ delta_on: Optional[Union[str, Sequence[str]]] = None,
  delta_result_on: Optional[Union[str, Sequence[str]]] = None,
  delta_compare: Optional[Union[str, Sequence[str]]] = None,
  delta_retry: Optional[Union[bool, str]] = None,
@@ -58,25 +54,43 @@ def read_storage(
  update : force storage reindexing. Default is False.
  anon : If True, we will treat cloud bucket as public one
  client_config : Optional client configuration for the storage client.
- delta: If True, only process new or changed files instead of reprocessing
- everything. This saves time by skipping files that were already processed in
- previous versions. The optimization is working when a new version of the
- dataset is created.
- Default is False.
- delta_on: Field(s) that uniquely identify each record in the source data.
- Used to detect which records are new or changed.
- Default is ("file.path", "file.etag", "file.version").
- delta_result_on: Field(s) in the result dataset that match `delta_on` fields.
- Only needed if you rename the identifying fields during processing.
- Default is None.
- delta_compare: Field(s) used to detect if a record has changed.
- If not specified, all fields except `delta_on` fields are used.
- Default is None.
- delta_retry: Controls retry behavior for failed records:
- - String (field name): Reprocess records where this field is not empty
- (error mode)
- - True: Reprocess records missing from the result dataset (missing mode)
- - None: No retry processing (default)
+ delta: If set to True, we optimize the creation of new dataset versions by
+ calculating the diff between the latest version of this storage and the
+ version used to create the most recent version of the resulting chain
+ dataset (the one specified in `.save()`). We then run the "diff" chain
+ using only the diff data, rather than the entire storage data, and merge
+ that diff chain with the latest version of the resulting dataset to create
+ a new version. This approach avoids applying modifications to all records
+ from storage every time, which can be an expensive operation.
+ The diff is calculated using the `DataChain.compare()` method, which
+ compares the `delta_on` fields to find matches and checks the compare
+ fields to determine if a record has changed. Note that this process only
+ considers added and modified records in storage; deleted records are not
+ removed from the new dataset version.
+ This calculation is based on the difference between the current version
+ of the source and the version used to create the dataset.
+ delta_on: A list of fields that uniquely identify rows in the source.
+ If two rows have the same values, they are considered the same (e.g., they
+ could be different versions of the same row in a versioned source).
+ This is used in the delta update to calculate the diff.
+ delta_result_on: A list of fields in the resulting dataset that correspond
+ to the `delta_on` fields from the source.
+ This is needed to identify rows that have changed in the source but are
+ already present in the current version of the resulting dataset, in order
+ to avoid including outdated versions of those rows in the new dataset.
+ We retain only the latest versions of rows to prevent duplication.
+ There is no need to define this if the `delta_on` fields are present in
+ the final dataset and have not been renamed.
+ delta_compare: A list of fields used to check if the same row has been modified
+ in the new version of the source.
+ If not defined, all fields except those defined in `delta_on` will be used.
+ delta_retry: Controls which records to reprocess. Can be:
+ - A string specifying a field name: Records where this field is not None
+ will be reprocessed (error checking mode).
+ - True: Records that exist in the source dataset but not in the result
+ dataset (based on delta_on/delta_result_on fields) will be reprocessed
+ (missing records mode).
+ - False or None: No retry processing.

  Returns:
  DataChain: A DataChain object containing the file information.
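For the same delta machinery on the storage side, a hedged sketch of `read_storage()` under the new signature; the bucket URI, dataset name, and field names are placeholders, and `delta_on` is passed explicitly since its default is now `None`:

```py
import datachain as dc

chain = dc.read_storage(
    "s3://my-bucket/images/",
    delta=True,
    delta_on=["file.path", "file.etag"],  # identify the same object across source versions
    delta_retry=True,                     # also reprocess rows missing from the result
)
# Only the diff against the previous "images_index" version is processed and merged.
chain.save("images_index")
```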
@@ -130,8 +144,6 @@ def read_storage(
  catalog = session.catalog
  cache = catalog.cache
  client_config = session.catalog.client_config
- listing_namespace_name = catalog.metastore.system_namespace_name
- listing_project_name = catalog.metastore.listing_project_name

  uris = uri if isinstance(uri, (list, tuple)) else [uri]

@@ -155,13 +167,7 @@ def read_storage(
  )
  continue

- dc = read_dataset(
- list_ds_name,
- namespace=listing_namespace_name,
- project=listing_project_name,
- session=session,
- settings=settings,
- )
+ dc = read_dataset(list_ds_name, session=session, settings=settings)
  dc._query.update = update
  dc.signals_schema = dc.signals_schema.mutate({f"{column}": file_type})

@@ -176,11 +182,7 @@ def read_storage(
  settings=settings,
  in_memory=in_memory,
  )
- .settings(
- prefetch=0,
- namespace=listing_namespace_name,
- project=listing_project_name,
- )
+ .settings(prefetch=0)
  .gen(
  list_bucket(lst_uri, cache, client_config=client_config),
  output={f"{column}": file_type},
datachain/lib/file.py CHANGED
@@ -5,14 +5,13 @@ import json
  import logging
  import os
  import posixpath
- import warnings
  from abc import ABC, abstractmethod
  from collections.abc import Iterator
  from contextlib import contextmanager
  from datetime import datetime
  from functools import partial
  from io import BytesIO
- from pathlib import Path, PurePath, PurePosixPath
+ from pathlib import Path, PurePosixPath
  from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional, Union
  from urllib.parse import unquote, urlparse
  from urllib.request import url2pathname
@@ -70,7 +69,7 @@ class FileExporter(NodesThreadPool):
  for task in done:
  task.result()

- def do_task(self, file: "File"):
+ def do_task(self, file):
  file.export(
  self.output,
  self.placement,
@@ -275,8 +274,8 @@ class File(DataModel):

  @field_validator("path", mode="before")
  @classmethod
- def validate_path(cls, path: str) -> str:
- return PurePath(path).as_posix() if path else ""
+ def validate_path(cls, path):
+ return Path(path).as_posix() if path else ""

  def model_dump_custom(self):
  res = self.model_dump()
@@ -338,11 +337,11 @@ class File(DataModel):
  return cls(**{key: row[key] for key in cls._datachain_column_types})

  @property
- def name(self) -> str:
+ def name(self):
  return PurePosixPath(self.path).name

  @property
- def parent(self) -> str:
+ def parent(self):
  return str(PurePosixPath(self.path).parent)

  @contextmanager
@@ -392,7 +391,7 @@ class File(DataModel):

  client.upload(self.read(), destination)

- def _symlink_to(self, destination: str) -> None:
+ def _symlink_to(self, destination: str):
  if self.location:
  raise OSError(errno.ENOTSUP, "Symlinking virtual file is not supported")

@@ -401,7 +400,7 @@ class File(DataModel):
  source = self.get_local_path()
  assert source, "File was not cached"
  elif self.source.startswith("file://"):
- source = self.get_fs_path()
+ source = self.get_path()
  else:
  raise OSError(errno.EXDEV, "can't link across filesystems")

@@ -482,62 +481,27 @@ class File(DataModel):

  def get_file_ext(self):
  """Returns last part of file name without `.`."""
- return PurePosixPath(self.path).suffix.lstrip(".")
+ return PurePosixPath(self.path).suffix.strip(".")

  def get_file_stem(self):
  """Returns file name without extension."""
  return PurePosixPath(self.path).stem

  def get_full_name(self):
- """
- [DEPRECATED] Use `file.path` directly instead.
-
- Returns name with parent directories.
- """
- warnings.warn(
- "file.get_full_name() is deprecated and will be removed "
- "in a future version. Use `file.path` directly.",
- DeprecationWarning,
- stacklevel=2,
- )
+ """Returns name with parent directories."""
  return self.path

- def get_path_normalized(self) -> str:
- if not self.path:
- raise FileError("path must not be empty", self.source, self.path)
-
- if self.path.endswith("/"):
- raise FileError("path must not be a directory", self.source, self.path)
-
- normpath = os.path.normpath(self.path)
- normpath = PurePath(normpath).as_posix()
-
- if normpath == ".":
- raise FileError("path must not be a directory", self.source, self.path)
-
- if any(part == ".." for part in PurePath(normpath).parts):
- raise FileError("path must not contain '..'", self.source, self.path)
-
- return normpath
-
- def get_uri(self) -> str:
+ def get_uri(self):
  """Returns file URI."""
- return f"{self.source}/{self.get_path_normalized()}"
+ return f"{self.source}/{self.get_full_name()}"

- def get_fs_path(self) -> str:
- """
- Returns file path with respect to the filescheme.
-
- If `normalize` is True, the path is normalized to remove any redundant
- separators and up-level references.
-
- If the file scheme is "file", the path is converted to a local file path
- using `url2pathname`. Otherwise, the original path with scheme is returned.
- """
+ def get_path(self) -> str:
+ """Returns file path."""
  path = unquote(self.get_uri())
- path_parsed = urlparse(path)
- if path_parsed.scheme == "file":
- path = url2pathname(path_parsed.path)
+ source = urlparse(self.source)
+ if source.scheme == "file":
+ path = urlparse(path).path
+ path = url2pathname(path)
  return path

  def get_destination_path(
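The new `get_path()` body above reduces to URI unquoting plus a `url2pathname()` conversion for `file://` sources. A standalone sketch of that transformation with placeholder paths:

```py
from urllib.parse import unquote, urlparse
from urllib.request import url2pathname

source = "file:///tmp/data"             # stand-in for File.source
uri = unquote(f"{source}/cat%201.jpg")  # unquoted URI, as get_uri() plus unquote()

if urlparse(source).scheme == "file":
    # local sources are converted to an OS path, e.g. /tmp/data/cat 1.jpg on POSIX
    path = url2pathname(urlparse(uri).path)
else:
    path = uri                          # remote sources keep the scheme-qualified URI

print(path)
```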
@@ -552,7 +516,7 @@ class File(DataModel):
  elif placement == "etag":
  path = f"{self.etag}{self.get_file_suffix()}"
  elif placement == "fullpath":
- path = unquote(self.get_path_normalized())
+ path = unquote(self.get_full_name())
  source = urlparse(self.source)
  if source.scheme and source.scheme != "file":
  path = posixpath.join(source.netloc, path)
@@ -590,9 +554,8 @@ class File(DataModel):
  ) from e

  try:
- normalized_path = self.get_path_normalized()
- info = client.fs.info(client.get_full_path(normalized_path))
- converted_info = client.info_to_file(info, normalized_path)
+ info = client.fs.info(client.get_full_path(self.path))
+ converted_info = client.info_to_file(info, self.path)
  return type(self)(
  path=self.path,
  source=self.source,
@@ -603,17 +566,8 @@ class File(DataModel):
  last_modified=converted_info.last_modified,
  location=self.location,
  )
- except FileError as e:
- logger.warning(
- "File error when resolving %s/%s: %s", self.source, self.path, str(e)
- )
  except (FileNotFoundError, PermissionError, OSError) as e:
- logger.warning(
- "File system error when resolving %s/%s: %s",
- self.source,
- self.path,
- str(e),
- )
+ logger.warning("File system error when resolving %s: %s", self.path, str(e))

  return type(self)(
  path=self.path,
@@ -629,8 +583,6 @@

  def resolve(file: File) -> File:
  """
- [DEPRECATED] Use `file.resolve()` directly instead.
-
  Resolve a File object by checking its existence and updating its metadata.

  This function is a wrapper around the File.resolve() method, designed to be
@@ -646,12 +598,6 @@ def resolve(file: File) -> File:
  RuntimeError: If the file's catalog is not set or if
  the file source protocol is unsupported.
  """
- warnings.warn(
- "resolve() is deprecated and will be removed "
- "in a future version. Use file.resolve() directly.",
- DeprecationWarning,
- stacklevel=2,
- )
  return file.resolve()


@@ -999,7 +945,7 @@ class ArrowRow(DataModel):
  ds = dataset(path, **self.kwargs)

  else:
- path = self.file.get_fs_path()
+ path = self.file.get_path()
  ds = dataset(path, filesystem=self.file.get_fs(), **self.kwargs)

  return ds.take([self.index]).to_reader()
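Rounding out the file.py changes, a hedged sketch of how `File.resolve()` (and the module-level `resolve()` wrapper, now without its deprecation warning) is typically called; the source and path are placeholders, and a catalog must already be attached to the `File` for the lookup to succeed:

```py
from datachain.lib.file import File

# Placeholder object: resolve() re-reads size, etag, version, is_latest and
# last_modified from storage and returns a new File (fields fall back to
# defaults when the object cannot be reached). It raises RuntimeError if the
# file's catalog is not set, e.g. outside a DataChain session/UDF context.
f = File(source="s3://my-bucket", path="images/cat.jpg")
fresh = f.resolve()
```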
datachain/lib/listing.py CHANGED
@@ -123,9 +123,6 @@ def parse_listing_uri(uri: str) -> tuple[str, str, str]:
  f"{LISTING_PREFIX}{storage_uri}/{posixpath.join(lst_uri_path, '').lstrip('/')}"
  )

- # we should remove dots from the name
- ds_name = ds_name.replace(".", "_")
-
  return ds_name, lst_uri, path


@@ -198,4 +195,5 @@ def get_listing(
  list_path = f"{ds_name.strip('/').removeprefix(listing.name)}/{list_path}"

  ds_name = listing.name if listing else ds_name
+
  return ds_name, list_uri, list_path, bool(listing)
@@ -106,7 +106,7 @@ def read_meta( # noqa: C901
  from datachain import read_storage

  if schema_from:
- file = read_storage(schema_from, type="text").limit(1).to_values("file")[0]
+ file = next(read_storage(schema_from, type="text").limit(1).collect("file"))
  model_code = gen_datamodel_code(
  file, format=format, jmespath=jmespath, model_name=model_name
  )
datachain/lib/pytorch.py CHANGED
@@ -130,7 +130,7 @@ class PytorchDataset(IterableDataset):
  if self.num_samples > 0:
  ds = ds.sample(self.num_samples)
  ds = ds.chunk(total_rank, total_workers)
- yield from ds.to_iter()
+ yield from ds.collect()

  def _iter_with_prefetch(self) -> Generator[tuple[Any], None, None]:
  from datachain.lib.udf import _prefetch_inputs