datachain 0.30.5__py3-none-any.whl → 0.39.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/__init__.py +4 -0
- datachain/asyn.py +11 -12
- datachain/cache.py +5 -5
- datachain/catalog/__init__.py +0 -2
- datachain/catalog/catalog.py +276 -354
- datachain/catalog/dependency.py +164 -0
- datachain/catalog/loader.py +8 -3
- datachain/checkpoint.py +43 -0
- datachain/cli/__init__.py +10 -17
- datachain/cli/commands/__init__.py +1 -8
- datachain/cli/commands/datasets.py +42 -27
- datachain/cli/commands/ls.py +15 -15
- datachain/cli/commands/show.py +2 -2
- datachain/cli/parser/__init__.py +3 -43
- datachain/cli/parser/job.py +1 -1
- datachain/cli/parser/utils.py +1 -2
- datachain/cli/utils.py +2 -15
- datachain/client/azure.py +2 -2
- datachain/client/fsspec.py +34 -23
- datachain/client/gcs.py +3 -3
- datachain/client/http.py +157 -0
- datachain/client/local.py +11 -7
- datachain/client/s3.py +3 -3
- datachain/config.py +4 -8
- datachain/data_storage/db_engine.py +12 -6
- datachain/data_storage/job.py +2 -0
- datachain/data_storage/metastore.py +716 -137
- datachain/data_storage/schema.py +20 -27
- datachain/data_storage/serializer.py +105 -15
- datachain/data_storage/sqlite.py +114 -114
- datachain/data_storage/warehouse.py +140 -48
- datachain/dataset.py +109 -89
- datachain/delta.py +117 -42
- datachain/diff/__init__.py +25 -33
- datachain/error.py +24 -0
- datachain/func/aggregate.py +9 -11
- datachain/func/array.py +12 -12
- datachain/func/base.py +7 -4
- datachain/func/conditional.py +9 -13
- datachain/func/func.py +63 -45
- datachain/func/numeric.py +5 -7
- datachain/func/string.py +2 -2
- datachain/hash_utils.py +123 -0
- datachain/job.py +11 -7
- datachain/json.py +138 -0
- datachain/lib/arrow.py +18 -15
- datachain/lib/audio.py +60 -59
- datachain/lib/clip.py +14 -13
- datachain/lib/convert/python_to_sql.py +6 -10
- datachain/lib/convert/values_to_tuples.py +151 -53
- datachain/lib/data_model.py +23 -19
- datachain/lib/dataset_info.py +7 -7
- datachain/lib/dc/__init__.py +2 -1
- datachain/lib/dc/csv.py +22 -26
- datachain/lib/dc/database.py +37 -34
- datachain/lib/dc/datachain.py +518 -324
- datachain/lib/dc/datasets.py +38 -30
- datachain/lib/dc/hf.py +16 -20
- datachain/lib/dc/json.py +17 -18
- datachain/lib/dc/listings.py +5 -8
- datachain/lib/dc/pandas.py +3 -6
- datachain/lib/dc/parquet.py +33 -21
- datachain/lib/dc/records.py +9 -13
- datachain/lib/dc/storage.py +103 -65
- datachain/lib/dc/storage_pattern.py +251 -0
- datachain/lib/dc/utils.py +17 -14
- datachain/lib/dc/values.py +3 -6
- datachain/lib/file.py +187 -50
- datachain/lib/hf.py +7 -5
- datachain/lib/image.py +13 -13
- datachain/lib/listing.py +5 -5
- datachain/lib/listing_info.py +1 -2
- datachain/lib/meta_formats.py +2 -3
- datachain/lib/model_store.py +20 -8
- datachain/lib/namespaces.py +59 -7
- datachain/lib/projects.py +51 -9
- datachain/lib/pytorch.py +31 -23
- datachain/lib/settings.py +188 -85
- datachain/lib/signal_schema.py +302 -64
- datachain/lib/text.py +8 -7
- datachain/lib/udf.py +103 -63
- datachain/lib/udf_signature.py +59 -34
- datachain/lib/utils.py +20 -0
- datachain/lib/video.py +3 -4
- datachain/lib/webdataset.py +31 -36
- datachain/lib/webdataset_laion.py +15 -16
- datachain/listing.py +12 -5
- datachain/model/bbox.py +3 -1
- datachain/namespace.py +22 -3
- datachain/node.py +6 -6
- datachain/nodes_thread_pool.py +0 -1
- datachain/plugins.py +24 -0
- datachain/project.py +4 -4
- datachain/query/batch.py +10 -12
- datachain/query/dataset.py +376 -194
- datachain/query/dispatch.py +112 -84
- datachain/query/metrics.py +3 -4
- datachain/query/params.py +2 -3
- datachain/query/queue.py +2 -1
- datachain/query/schema.py +7 -6
- datachain/query/session.py +190 -33
- datachain/query/udf.py +9 -6
- datachain/remote/studio.py +90 -53
- datachain/script_meta.py +12 -12
- datachain/sql/sqlite/base.py +37 -25
- datachain/sql/sqlite/types.py +1 -1
- datachain/sql/types.py +36 -5
- datachain/studio.py +49 -40
- datachain/toolkit/split.py +31 -10
- datachain/utils.py +39 -48
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/METADATA +26 -38
- datachain-0.39.0.dist-info/RECORD +173 -0
- datachain/cli/commands/query.py +0 -54
- datachain/query/utils.py +0 -36
- datachain-0.30.5.dist-info/RECORD +0 -168
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/WHEEL +0 -0
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/lib/file.py
CHANGED
@@ -1,7 +1,6 @@
 import errno
 import hashlib
 import io
-import json
 import logging
 import os
 import posixpath
@@ -13,7 +12,7 @@ from datetime import datetime
 from functools import partial
 from io import BytesIO
 from pathlib import Path, PurePath, PurePosixPath
-from typing import TYPE_CHECKING, Any, ClassVar, Literal
+from typing import TYPE_CHECKING, Any, ClassVar, Literal
 from urllib.parse import unquote, urlparse
 from urllib.request import url2pathname
@@ -21,6 +20,7 @@ from fsspec.callbacks import DEFAULT_CALLBACK, Callback
 from fsspec.utils import stringify_path
 from pydantic import Field, field_validator
 
+from datachain import json
 from datachain.client.fileslice import FileSlice
 from datachain.lib.data_model import DataModel
 from datachain.lib.utils import DataChainError, rebase_path
@@ -35,13 +35,14 @@ if TYPE_CHECKING:
     from datachain.catalog import Catalog
     from datachain.client.fsspec import Client
     from datachain.dataset import RowDict
+    from datachain.query.session import Session
 
 sha256 = partial(hashlib.sha256, usedforsecurity=False)
 
 logger = logging.getLogger("datachain")
 
 # how to create file path when exporting
-ExportPlacement = Literal["filename", "etag", "fullpath", "checksum"]
+ExportPlacement = Literal["filename", "etag", "fullpath", "checksum", "filepath"]
 
 FileType = Literal["binary", "text", "image", "video", "audio"]
 EXPORT_FILES_MAX_THREADS = 5
@@ -52,12 +53,12 @@ class FileExporter(NodesThreadPool):
 
     def __init__(
         self,
-        output:
+        output: str | os.PathLike[str],
         placement: ExportPlacement,
         use_cache: bool,
         link_type: Literal["copy", "symlink"],
         max_threads: int = EXPORT_FILES_MAX_THREADS,
-        client_config:
+        client_config: dict | None = None,
     ):
         super().__init__(max_threads)
         self.output = output
@@ -220,7 +221,7 @@ class File(DataModel):
     etag: str = Field(default="")
     is_latest: bool = Field(default=True)
     last_modified: datetime = Field(default=TIME_ZERO)
-    location:
+    location: dict | list[dict] | None = Field(default=None)
 
     _datachain_column_types: ClassVar[dict[str, Any]] = {
         "source": String,
@@ -252,10 +253,19 @@ class File(DataModel):
         "last_modified",
     ]
 
+    # Allowed kwargs we forward to TextIOWrapper
+    _TEXT_WRAPPER_ALLOWED: ClassVar[tuple[str, ...]] = (
+        "encoding",
+        "errors",
+        "newline",
+        "line_buffering",
+        "write_through",
+    )
+
    @staticmethod
     def _validate_dict(
-        v:
-    ) ->
+        v: str | dict | list[dict] | None,
+    ) -> str | dict | list[dict] | None:
         if v is None or v == "":
             return None
         if isinstance(v, str):
@@ -287,6 +297,16 @@ class File(DataModel):
         super().__init__(**kwargs)
         self._catalog = None
         self._caching_enabled: bool = False
+        self._download_cb: Callback = DEFAULT_CALLBACK
+
+    def __getstate__(self):
+        state = super().__getstate__()
+        # Exclude _catalog from pickling - it contains SQLAlchemy engine and other
+        # non-picklable objects. The catalog will be re-set by _set_stream() on the
+        # worker side when needed.
+        state["__dict__"] = state["__dict__"].copy()
+        state["__dict__"]["_catalog"] = None
+        return state
 
     def as_text_file(self) -> "TextFile":
         """Convert the file to a `TextFile` object."""
@@ -322,17 +342,21 @@ class File(DataModel):
 
     @classmethod
     def upload(
-        cls,
+        cls,
+        data: bytes,
+        path: str | os.PathLike[str],
+        catalog: "Catalog | None" = None,
     ) -> "Self":
         if catalog is None:
-            from datachain.
-
-            catalog = get_catalog()
+            from datachain.query.session import Session
 
+            catalog = Session.get().catalog
         from datachain.client.fsspec import Client
 
-
-
+        path_str = stringify_path(path)
+
+        client_cls = Client.get_implementation(path_str)
+        source, rel_path = client_cls.split_url(path_str)
 
         client = catalog.get_client(client_cls.get_uri(source))
         file = client.upload(data, rel_path)
@@ -341,6 +365,35 @@ class File(DataModel):
         file._set_stream(catalog)
         return file
 
+    @classmethod
+    def at(
+        cls, uri: str | os.PathLike[str], session: "Session | None" = None
+    ) -> "Self":
+        """Construct a File from a full URI in one call.
+
+        Example:
+            file = File.at("s3://bucket/path/to/output.png")
+            with file.open("wb") as f: ...
+        """
+        from datachain.client.fsspec import Client
+        from datachain.query.session import Session
+
+        if session is None:
+            session = Session.get()
+        catalog = session.catalog
+        uri_str = stringify_path(uri)
+        if uri_str.endswith(("/", os.sep)):
+            raise ValueError(
+                f"File.at directory URL/path given (trailing slash), got: {uri_str}"
+            )
+        client_cls = Client.get_implementation(uri_str)
+        uri_str = client_cls.path_to_uri(uri_str)
+        source, rel_path = client_cls.split_url(uri_str)
+        source_uri = client_cls.get_uri(source)
+        file = cls(source=source_uri, path=rel_path)
+        file._set_stream(catalog)
+        return file
+
     @classmethod
     def _from_row(cls, row: "RowDict") -> "Self":
         return cls(**{key: row[key] for key in cls._datachain_column_types})
@@ -354,28 +407,93 @@ class File(DataModel):
         return str(PurePosixPath(self.path).parent)
 
     @contextmanager
-    def open(
-
-
-
-
+    def open(
+        self,
+        mode: str = "rb",
+        *,
+        client_config: dict[str, Any] | None = None,
+        **open_kwargs,
+    ) -> Iterator[Any]:
+        """Open the file and return a file-like object.
+
+        Supports both read ("rb", "r") and write modes (e.g. "wb", "w", "ab").
+        When opened in a write mode, metadata is refreshed after closing.
+        """
+        writing = any(ch in mode for ch in "wax+")
+        if self.location and writing:
+            raise VFileError(
+                "Writing to virtual file is not supported",
+                self.source,
+                self.path,
+            )
 
-
+        if self._catalog is None:
+            raise RuntimeError("Cannot open file: catalog is not set")
+
+        base_cfg = getattr(self._catalog, "client_config", {}) or {}
+        merged_cfg = {**base_cfg, **(client_config or {})}
+        client: Client = self._catalog.get_client(self.source, **merged_cfg)
+
+        if not writing:
+            if self.location:
+                with VFileRegistry.open(self, self.location) as f:  # type: ignore[arg-type]
+                    yield self._wrap_text(f, mode, open_kwargs)
+                    return
             if self._caching_enabled:
                 self.ensure_cached()
-            client: Client = self._catalog.get_client(self.source)
             with client.open_object(
                 self, use_cache=self._caching_enabled, cb=self._download_cb
             ) as f:
-                yield
+                yield self._wrap_text(f, mode, open_kwargs)
+                return
+
+        # write path
+        full_path = client.get_full_path(self.get_path_normalized())
+        with client.fs.open(full_path, mode, **open_kwargs) as f:
+            yield self._wrap_text(f, mode, open_kwargs)
+
+        version_hint = self._extract_write_version(f)
+
+        # refresh metadata pinned to the version that was just written
+        refreshed = client.get_file_info(
+            self.get_path_normalized(), version_id=version_hint
+        )
+        for k, v in refreshed.model_dump().items():
+            setattr(self, k, v)
+
+    def _wrap_text(self, f: Any, mode: str, open_kwargs: dict[str, Any]) -> Any:
+        """Return stream possibly wrapped for text."""
+        if "b" in mode or isinstance(f, io.TextIOBase):
+            return f
+        filtered = {
+            k: open_kwargs[k] for k in self._TEXT_WRAPPER_ALLOWED if k in open_kwargs
+        }
+        return io.TextIOWrapper(f, **filtered)
+
+    def _extract_write_version(self, handle: Any) -> str | None:
+        """Best-effort extraction of object version after a write.
+
+        S3 (s3fs) and Azure (adlfs) populate version_id on the handle.
+        GCS (gcsfs) populates generation. Azure and GCS require upstream
+        fixes to be released.
+        """
+        for attr in ("version_id", "generation"):
+            if value := getattr(handle, attr, None):
+                return value
+        return None
 
     def read_bytes(self, length: int = -1):
         """Returns file contents as bytes."""
         with self.open() as stream:
             return stream.read(length)
 
-    def read_text(self):
-        """
+    def read_text(self, **open_kwargs):
+        """Return file contents decoded as text.
+
+        **open_kwargs : Any
+            Extra keyword arguments forwarded to ``open(mode="r", ...)``
+            (e.g. ``encoding="utf-8"``, ``errors="ignore"``)
+        """
         if self.location:
             raise VFileError(
                 "Reading text from virtual file is not supported",
@@ -383,14 +501,14 @@ class File(DataModel):
                 self.path,
             )
 
-        with self.open(mode="r") as stream:
+        with self.open(mode="r", **open_kwargs) as stream:
             return stream.read()
 
     def read(self, length: int = -1):
         """Returns file contents."""
         return self.read_bytes(length)
 
-    def save(self, destination: str, client_config:
+    def save(self, destination: str, client_config: dict | None = None):
         """Writes it's content to destination"""
         destination = stringify_path(destination)
         client: Client = self._catalog.get_client(destination, **(client_config or {}))
@@ -417,11 +535,11 @@ class File(DataModel):
 
     def export(
         self,
-        output:
+        output: str | os.PathLike[str],
         placement: ExportPlacement = "fullpath",
         use_cache: bool = True,
         link_type: Literal["copy", "symlink"] = "copy",
-        client_config:
+        client_config: dict | None = None,
     ) -> None:
         """Export file to new location."""
         self._caching_enabled = use_cache
@@ -457,7 +575,7 @@ class File(DataModel):
         client = self._catalog.get_client(self.source)
         client.download(self, callback=self._download_cb)
 
-    async def _prefetch(self, download_cb:
+    async def _prefetch(self, download_cb: "Callback | None" = None) -> bool:
         if self._catalog is None:
             raise RuntimeError("cannot prefetch file because catalog is not setup")
 
@@ -472,7 +590,7 @@ class File(DataModel):
         )
         return True
 
-    def get_local_path(self) ->
+    def get_local_path(self) -> str | None:
         """Return path to a file in a local cache.
 
         Returns None if file is not cached.
@@ -549,7 +667,7 @@ class File(DataModel):
         return path
 
     def get_destination_path(
-        self, output:
+        self, output: str | os.PathLike[str], placement: ExportPlacement
     ) -> str:
         """
         Returns full destination path of a file for exporting to some output
@@ -564,6 +682,8 @@ class File(DataModel):
             source = urlparse(self.source)
             if source.scheme and source.scheme != "file":
                 path = posixpath.join(source.netloc, path)
+        elif placement == "filepath":
+            path = unquote(self.get_path_normalized())
         elif placement == "checksum":
             raise NotImplementedError("Checksum placement not implemented yet")
         else:
@@ -601,7 +721,7 @@ class File(DataModel):
             normalized_path = self.get_path_normalized()
             info = client.fs.info(client.get_full_path(normalized_path))
             converted_info = client.info_to_file(info, normalized_path)
-
+            res = type(self)(
                 path=self.path,
                 source=self.source,
                 size=converted_info.size,
@@ -611,6 +731,8 @@ class File(DataModel):
                 last_modified=converted_info.last_modified,
                 location=self.location,
             )
+            res._set_stream(self._catalog)
+            return res
         except FileError as e:
             logger.warning(
                 "File error when resolving %s/%s: %s", self.source, self.path, str(e)
@@ -623,7 +745,7 @@ class File(DataModel):
                 str(e),
             )
 
-
+        res = type(self)(
             path=self.path,
             source=self.source,
             size=0,
@@ -633,6 +755,8 @@ class File(DataModel):
             last_modified=TIME_ZERO,
             location=self.location,
         )
+        res._set_stream(self._catalog)
+        return res
 
     def rebase(
         self,
@@ -701,17 +825,30 @@ class TextFile(File):
     """`DataModel` for reading text files."""
 
     @contextmanager
-    def open(
-
-
+    def open(
+        self,
+        mode: str = "r",
+        *,
+        client_config: dict[str, Any] | None = None,
+        **open_kwargs,
+    ) -> Iterator[Any]:
+        """Open the file and return a file-like object.
+        Default to text mode"""
+        with super().open(
+            mode=mode, client_config=client_config, **open_kwargs
+        ) as stream:
             yield stream
 
-    def read_text(self):
-        """
-
+    def read_text(self, **open_kwargs):
+        """Return file contents as text.
+
+        **open_kwargs : Any
+            Extra keyword arguments forwarded to ``open()`` (e.g. encoding).
+        """
+        with self.open(**open_kwargs) as stream:
            return stream.read()
 
-    def save(self, destination: str, client_config:
+    def save(self, destination: str, client_config: dict | None = None):
         """Writes it's content to destination"""
         destination = stringify_path(destination)
 
@@ -744,8 +881,8 @@ class ImageFile(File):
     def save(  # type: ignore[override]
         self,
         destination: str,
-        format:
-        client_config:
+        format: str | None = None,
+        client_config: dict | None = None,
     ):
         """Writes it's content to destination"""
         destination = stringify_path(destination)
@@ -827,7 +964,7 @@ class VideoFile(File):
     def get_frames(
         self,
         start: int = 0,
-        end:
+        end: int | None = None,
         step: int = 1,
     ) -> "Iterator[VideoFrame]":
         """
@@ -877,7 +1014,7 @@ class VideoFile(File):
         self,
         duration: float,
         start: float = 0,
-        end:
+        end: float | None = None,
    ) -> "Iterator[VideoFragment]":
         """
         Splits the video into multiple fragments of a specified duration.
@@ -963,7 +1100,7 @@ class AudioFile(File):
         self,
         duration: float,
         start: float = 0,
-        end:
+        end: float | None = None,
     ) -> "Iterator[AudioFragment]":
         """
         Splits the audio into multiple fragments of a specified duration.
@@ -1001,10 +1138,10 @@ class AudioFile(File):
     def save(  # type: ignore[override]
         self,
         output: str,
-        format:
+        format: str | None = None,
         start: float = 0,
-        end:
-        client_config:
+        end: float | None = None,
+        client_config: dict | None = None,
     ) -> "AudioFile":
         """Save audio file or extract fragment to specified format.
 
@@ -1075,7 +1212,7 @@ class AudioFragment(DataModel):
         duration = self.end - self.start
         return audio_to_bytes(self.audio, format, self.start, duration)
 
-    def save(self, output: str, format:
+    def save(self, output: str, format: str | None = None) -> "AudioFile":
         """
         Saves the audio fragment as a new audio file.
 
@@ -1178,7 +1315,7 @@ class VideoFragment(DataModel):
     start: float
     end: float
 
-    def save(self, output: str, format:
+    def save(self, output: str, format: str | None = None) -> "VideoFile":
         """
         Saves the video fragment as a new video file.
 
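Taken together, the new File.at constructor, write support in open, and the kwargs-aware read_text shown above compose into a short round trip. A minimal sketch assembled from the docstrings in this diff; the bucket and key names are placeholders:

from datachain.lib.file import File

# Build a File from a full URI in one call (straight from the File.at
# docstring); the session and catalog are resolved via Session.get().
file = File.at("s3://bucket/path/to/output.png")

# Write modes are now supported; size/etag/version metadata is refreshed
# after the stream closes.
with file.open("wb") as f:
    f.write(b"...")  # placeholder bytes

# read_text now forwards TextIOWrapper kwargs such as encoding/errors.
notes = File.at("s3://bucket/path/to/notes.txt")
text = notes.read_text(encoding="utf-8", errors="ignore")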
datachain/lib/hf.py
CHANGED
@@ -26,7 +26,7 @@ except ImportError as exc:
 ) from exc
 
 from io import BytesIO
-from typing import TYPE_CHECKING, Any,
+from typing import TYPE_CHECKING, Any, TypeAlias
 
 import PIL
 from tqdm.auto import tqdm
@@ -41,7 +41,9 @@ if TYPE_CHECKING:
     from pydantic import BaseModel
 
 
-HFDatasetType =
+HFDatasetType: TypeAlias = (
+    str | DatasetDict | Dataset | IterableDatasetDict | IterableDataset
+)
 
 
 class HFClassLabel(DataModel):
@@ -67,7 +69,7 @@ class HFAudio(DataModel):
 class HFGenerator(Generator):
     def __init__(
         self,
-        ds:
+        ds: HFDatasetType,
         output_schema: type["BaseModel"],
         limit: int = 0,
         *args,
@@ -117,7 +119,7 @@ class HFGenerator(Generator):
             pbar.update(1)
 
 
-def stream_splits(ds:
+def stream_splits(ds: HFDatasetType, *args, **kwargs):
     if isinstance(ds, str):
         ds = load_dataset(ds, *args, **kwargs)
     if isinstance(ds, (DatasetDict, IterableDatasetDict)):
@@ -153,7 +155,7 @@ def convert_feature(val: Any, feat: Any, anno: Any) -> Any:
 
 
 def get_output_schema(
-    features: Features, existing_column_names:
+    features: Features, existing_column_names: list[str] | None = None
 ) -> tuple[dict[str, DataType], dict[str, str]]:
     """
     Generate UDF output schema from Hugging Face datasets features. It normalizes the
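The HFDatasetType change above shows the standard pattern for naming a PEP 604 union: without the TypeAlias marker, type checkers may read a module-level union assignment as an ordinary value. A generic sketch of the pattern with stand-in classes (not the real huggingface types):

from typing import TypeAlias

class Dataset: ...            # stand-in for datasets.Dataset
class DatasetDict(dict): ...  # stand-in for datasets.DatasetDict

# The TypeAlias annotation marks this union as a type alias, mirroring
# the HFDatasetType declaration in the hunk above.
DatasetLike: TypeAlias = str | Dataset | DatasetDict

def split_names(ds: DatasetLike) -> list[str]:
    # Mirrors the isinstance dispatch used by stream_splits above.
    if isinstance(ds, str):
        return [ds]        # a dataset name; splits resolved after loading
    if isinstance(ds, DatasetDict):
        return list(ds)    # mapping of split name -> Dataset
    return ["train"]       # a single Dataset: treat as one default split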
datachain/lib/image.py
CHANGED
@@ -1,4 +1,4 @@
-from
+from collections.abc import Callable
 
 import torch
 from PIL import Image as PILImage
@@ -6,7 +6,7 @@ from PIL import Image as PILImage
 from datachain.lib.file import File, FileError, Image, ImageFile
 
 
-def image_info(file:
+def image_info(file: File | ImageFile) -> Image:
     """
     Returns image file information.
 
@@ -31,11 +31,11 @@ def image_info(file: Union[File, ImageFile]) -> Image:
 def convert_image(
     img: PILImage.Image,
     mode: str = "RGB",
-    size:
-    transform:
-    encoder:
-    device:
-) ->
+    size: tuple[int, int] | None = None,
+    transform: Callable | None = None,
+    encoder: Callable | None = None,
+    device: str | torch.device | None = None,
+) -> PILImage.Image | torch.Tensor:
     """
     Resize, transform, and otherwise convert an image.
 
@@ -71,13 +71,13 @@ def convert_image(
 
 
 def convert_images(
-    images:
+    images: PILImage.Image | list[PILImage.Image],
     mode: str = "RGB",
-    size:
-    transform:
-    encoder:
-    device:
-) ->
+    size: tuple[int, int] | None = None,
+    transform: Callable | None = None,
+    encoder: Callable | None = None,
+    device: str | torch.device | None = None,
+) -> list[PILImage.Image] | torch.Tensor:
     """
     Resize, transform, and otherwise convert one or more images.
 
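For reference, a call against the modernized convert_image signature; this sketch assumes, per the return annotation above, that a plain PIL image comes back when no encoder is supplied:

from PIL import Image as PILImage

from datachain.lib.image import convert_image

# In-memory sample image; mode and size are the keyword arguments typed
# in the hunk above.
img = PILImage.new("RGB", (64, 64))
thumb = convert_image(img, mode="L", size=(32, 32))
assert isinstance(thumb, PILImage.Image)
print(thumb.size)  # (32, 32)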
datachain/lib/listing.py
CHANGED
@@ -2,10 +2,10 @@ import glob
 import logging
 import os
 import posixpath
-from collections.abc import Iterator
+from collections.abc import Callable, Iterator
 from contextlib import contextmanager
 from datetime import datetime, timedelta, timezone
-from typing import TYPE_CHECKING,
+from typing import TYPE_CHECKING, TypeVar
 
 from fsspec.asyn import get_loop
 from sqlalchemy.sql.expression import true
@@ -73,7 +73,7 @@ def get_file_info(uri: str, cache, client_config=None) -> File:
 def ls(
     dc: D,
     path: str,
-    recursive:
+    recursive: bool | None = True,
     column="file",
 ) -> D:
     """
@@ -150,8 +150,8 @@ def _reraise_as_client_error() -> Iterator[None]:
 
 
 def get_listing(
-    uri:
-) -> tuple[
+    uri: str | os.PathLike[str], session: "Session", update: bool = False
+) -> tuple[str | None, str, str, bool]:
     """Returns correct listing dataset name that must be used for saving listing
     operation. It takes into account existing listings and reusability of those.
     It also returns boolean saying if returned dataset name is reused / already
datachain/lib/listing_info.py
CHANGED
@@ -1,5 +1,4 @@
 from datetime import datetime, timedelta, timezone
-from typing import Optional
 
 from datachain.client import Client
 from datachain.lib.dataset_info import DatasetInfo
@@ -17,7 +16,7 @@ class ListingInfo(DatasetInfo):
         return uri
 
     @property
-    def expires(self) ->
+    def expires(self) -> datetime | None:
         if not self.finished_at:
             return None
         return self.finished_at + timedelta(seconds=LISTING_TTL)
datachain/lib/meta_formats.py
CHANGED
@@ -1,14 +1,13 @@
 import csv
-import json
 import tempfile
 import uuid
-from collections.abc import Iterator
+from collections.abc import Callable, Iterator
 from pathlib import Path
-from typing import Callable
 
 import jmespath as jsp
 from pydantic import BaseModel, ConfigDict, Field, ValidationError  # noqa: F401
 
+from datachain import json
 from datachain.lib.data_model import DataModel  # noqa: F401
 from datachain.lib.file import TextFile
 
|
|