PyPI - datachain - Versions diffs - 0.14.2__py3-none-any.whl → 0.39.0__py3-none-any.whl - Mend

datachain: 0.14.2-py3-none-any.whl → 0.39.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (137)

datachain/__init__.py +20 -0
datachain/asyn.py +11 -12
datachain/cache.py +7 -7
datachain/catalog/__init__.py +2 -2
datachain/catalog/catalog.py +621 -507
datachain/catalog/dependency.py +164 -0
datachain/catalog/loader.py +28 -18
datachain/checkpoint.py +43 -0
datachain/cli/__init__.py +24 -33
datachain/cli/commands/__init__.py +1 -8
datachain/cli/commands/datasets.py +83 -52
datachain/cli/commands/ls.py +17 -17
datachain/cli/commands/show.py +4 -4
datachain/cli/parser/__init__.py +8 -74
datachain/cli/parser/job.py +95 -3
datachain/cli/parser/studio.py +11 -4
datachain/cli/parser/utils.py +1 -2
datachain/cli/utils.py +2 -15
datachain/client/azure.py +4 -4
datachain/client/fsspec.py +45 -28
datachain/client/gcs.py +6 -6
datachain/client/hf.py +29 -2
datachain/client/http.py +157 -0
datachain/client/local.py +15 -11
datachain/client/s3.py +17 -9
datachain/config.py +4 -8
datachain/data_storage/db_engine.py +12 -6
datachain/data_storage/job.py +5 -1
datachain/data_storage/metastore.py +1252 -186
datachain/data_storage/schema.py +58 -45
datachain/data_storage/serializer.py +105 -15
datachain/data_storage/sqlite.py +286 -127
datachain/data_storage/warehouse.py +250 -113
datachain/dataset.py +353 -148
datachain/delta.py +391 -0
datachain/diff/__init__.py +27 -29
datachain/error.py +60 -0
datachain/func/__init__.py +2 -1
datachain/func/aggregate.py +66 -42
datachain/func/array.py +242 -38
datachain/func/base.py +7 -4
datachain/func/conditional.py +110 -60
datachain/func/func.py +96 -45
datachain/func/numeric.py +55 -38
datachain/func/path.py +32 -20
datachain/func/random.py +2 -2
datachain/func/string.py +67 -37
datachain/func/window.py +7 -8
datachain/hash_utils.py +123 -0
datachain/job.py +11 -7
datachain/json.py +138 -0
datachain/lib/arrow.py +58 -22
datachain/lib/audio.py +245 -0
datachain/lib/clip.py +14 -13
datachain/lib/convert/flatten.py +5 -3
datachain/lib/convert/python_to_sql.py +6 -10
datachain/lib/convert/sql_to_python.py +8 -0
datachain/lib/convert/values_to_tuples.py +156 -51
datachain/lib/data_model.py +42 -20
datachain/lib/dataset_info.py +36 -8
datachain/lib/dc/__init__.py +8 -2
datachain/lib/dc/csv.py +25 -28
datachain/lib/dc/database.py +398 -0
datachain/lib/dc/datachain.py +1289 -425
datachain/lib/dc/datasets.py +320 -38
datachain/lib/dc/hf.py +38 -24
datachain/lib/dc/json.py +29 -32
datachain/lib/dc/listings.py +112 -8
datachain/lib/dc/pandas.py +16 -12
datachain/lib/dc/parquet.py +35 -23
datachain/lib/dc/records.py +31 -23
datachain/lib/dc/storage.py +154 -64
datachain/lib/dc/storage_pattern.py +251 -0
datachain/lib/dc/utils.py +24 -16
datachain/lib/dc/values.py +8 -9
datachain/lib/file.py +622 -89
datachain/lib/hf.py +69 -39
datachain/lib/image.py +14 -14
datachain/lib/listing.py +14 -11
datachain/lib/listing_info.py +1 -2
datachain/lib/meta_formats.py +3 -4
datachain/lib/model_store.py +39 -7
datachain/lib/namespaces.py +125 -0
datachain/lib/projects.py +130 -0
datachain/lib/pytorch.py +32 -21
datachain/lib/settings.py +192 -56
datachain/lib/signal_schema.py +427 -104
datachain/lib/tar.py +1 -2
datachain/lib/text.py +8 -7
datachain/lib/udf.py +164 -76
datachain/lib/udf_signature.py +60 -35
datachain/lib/utils.py +118 -4
datachain/lib/video.py +17 -9
datachain/lib/webdataset.py +61 -56
datachain/lib/webdataset_laion.py +15 -16
datachain/listing.py +22 -10
datachain/model/bbox.py +3 -1
datachain/model/ultralytics/bbox.py +16 -12
datachain/model/ultralytics/pose.py +16 -12
datachain/model/ultralytics/segment.py +16 -12
datachain/namespace.py +84 -0
datachain/node.py +6 -6
datachain/nodes_thread_pool.py +0 -1
datachain/plugins.py +24 -0
datachain/project.py +78 -0
datachain/query/batch.py +40 -41
datachain/query/dataset.py +604 -322
datachain/query/dispatch.py +261 -154
datachain/query/metrics.py +4 -6
datachain/query/params.py +2 -3
datachain/query/queue.py +3 -12
datachain/query/schema.py +11 -6
datachain/query/session.py +200 -33
datachain/query/udf.py +34 -2
datachain/remote/studio.py +171 -69
datachain/script_meta.py +12 -12
datachain/semver.py +68 -0
datachain/sql/__init__.py +2 -0
datachain/sql/functions/array.py +33 -1
datachain/sql/postgresql_dialect.py +9 -0
datachain/sql/postgresql_types.py +21 -0
datachain/sql/sqlite/__init__.py +5 -1
datachain/sql/sqlite/base.py +102 -29
datachain/sql/sqlite/types.py +8 -13
datachain/sql/types.py +70 -15
datachain/studio.py +223 -46
datachain/toolkit/split.py +31 -10
datachain/utils.py +101 -59
{datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/METADATA +77 -22
datachain-0.39.0.dist-info/RECORD +173 -0
{datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/WHEEL +1 -1
datachain/cli/commands/query.py +0 -53
datachain/query/utils.py +0 -42
datachain-0.14.2.dist-info/RECORD +0 -158
{datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
{datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
{datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0

datachain/lib/file.py CHANGED

@@ -1,18 +1,18 @@
 import errno
 import hashlib
 import io
-import json
 import logging
 import os
 import posixpath
+import warnings
 from abc import ABC, abstractmethod
 from collections.abc import Iterator
 from contextlib import contextmanager
 from datetime import datetime
 from functools import partial
 from io import BytesIO
-from pathlib import Path, PurePosixPath
-from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional, Union
+from pathlib import Path, PurePath, PurePosixPath
+from typing import TYPE_CHECKING, Any, ClassVar, Literal
 from urllib.parse import unquote, urlparse
 from urllib.request import url2pathname
@@ -20,9 +20,10 @@ from fsspec.callbacks import DEFAULT_CALLBACK, Callback
 from fsspec.utils import stringify_path
 from pydantic import Field, field_validator
+from datachain import json
 from datachain.client.fileslice import FileSlice
 from datachain.lib.data_model import DataModel
-from datachain.lib.utils import DataChainError
+from datachain.lib.utils import DataChainError, rebase_path
 from datachain.nodes_thread_pool import NodesThreadPool
 from datachain.sql.types import JSON, Boolean, DateTime, Int, String
 from datachain.utils import TIME_ZERO
@@ -34,15 +35,16 @@ if TYPE_CHECKING:
     from datachain.catalog import Catalog
     from datachain.client.fsspec import Client
     from datachain.dataset import RowDict
+    from datachain.query.session import Session
 sha256 = partial(hashlib.sha256, usedforsecurity=False)
 logger = logging.getLogger("datachain")
 # how to create file path when exporting
-ExportPlacement = Literal["filename", "etag", "fullpath", "checksum"]
+ExportPlacement = Literal["filename", "etag", "fullpath", "checksum", "filepath"]
-FileType = Literal["binary", "text", "image", "video"]
+FileType = Literal["binary", "text", "image", "video", "audio"]
 EXPORT_FILES_MAX_THREADS = 5
@@ -51,12 +53,12 @@ class FileExporter(NodesThreadPool):
     def __init__(
         self,
-        output: Union[str, os.PathLike[str]],
+        output: str | os.PathLike[str],
         placement: ExportPlacement,
         use_cache: bool,
         link_type: Literal["copy", "symlink"],
         max_threads: int = EXPORT_FILES_MAX_THREADS,
-        client_config: Optional[dict] = None,
+        client_config: dict | None = None,
     ):
         super().__init__(max_threads)
         self.output = output
@@ -69,7 +71,7 @@ class FileExporter(NodesThreadPool):
         for task in done:
             task.result()
-    def do_task(self, file):
+    def do_task(self, file: "File"):
         file.export(
             self.output,
             self.placement,
@@ -81,14 +83,28 @@ class FileExporter(NodesThreadPool):
 class VFileError(DataChainError):
-    def __init__(self, file: "File", message: str, vtype: str = ""):
+    def __init__(self, message: str, source: str, path: str, vtype: str = ""):
+        self.message = message
+        self.source = source
+        self.path = path
+        self.vtype = vtype
         type_ = f" of vtype '{vtype}'" if vtype else ""
-        super().__init__(f"Error in v-file '{file.path}'{type_}: {message}")
+        super().__init__(f"Error in v-file '{source}/{path}'{type_}: {message}")
+    def __reduce__(self):
+        return self.__class__, (self.message, self.source, self.path, self.vtype)
 class FileError(DataChainError):
-    def __init__(self, file: "File", message: str):
-        super().__init__(f"Error in file {file.get_uri()}: {message}")
+    def __init__(self, message: str, source: str, path: str):
+        self.message = message
+        self.source = source
+        self.path = path
+        super().__init__(f"Error in file '{source}/{path}': {message}")
+    def __reduce__(self):
+        return self.__class__, (self.message, self.source, self.path)
 class VFile(ABC):
@@ -113,26 +129,36 @@ class TarVFile(VFile):
     @classmethod
     def open(cls, file: "File", location: list[dict]):
         """Stream file from tar archive based on location in archive."""
-        if len(location) > 1:
-            raise VFileError(file, "multiple 'location's are not supported yet")
+        tar_file = cls.parent(file, location)
         loc = location[0]
         if (offset := loc.get("offset", None)) is None:
-            raise VFileError(file, "'offset' is not specified")
+            raise VFileError("'offset' is not specified", file.source, file.path)
         if (size := loc.get("size", None)) is None:
-            raise VFileError(file, "'size' is not specified")
+            raise VFileError("'size' is not specified", file.source, file.path)
+        client = file._catalog.get_client(tar_file.source)
+        fd = client.open_object(tar_file, use_cache=file._caching_enabled)
+        return FileSlice(fd, offset, size, file.name)
+    @classmethod
+    def parent(cls, file: "File", location: list[dict]) -> "File":
+        if len(location) > 1:
+            raise VFileError(
+                "multiple 'location's are not supported yet", file.source, file.path
+            )
+        loc = location[0]
         if (parent := loc.get("parent", None)) is None:
-            raise VFileError(file, "'parent' is not specified")
+            raise VFileError("'parent' is not specified", file.source, file.path)
         tar_file = File(**parent)
         tar_file._set_stream(file._catalog)
-        client = file._catalog.get_client(tar_file.source)
-        fd = client.open_object(tar_file, use_cache=file._caching_enabled)
-        return FileSlice(fd, offset, size, file.name)
+        return tar_file
 class VFileRegistry:
@@ -143,19 +169,33 @@ class VFileRegistry:
         cls._vtype_readers[reader.get_vtype()] = reader
     @classmethod
-    def resolve(cls, file: "File", location: list[dict]):
+    def _get_reader(cls, file: "File", location: list[dict]):
         if len(location) == 0:
-            raise VFileError(file, "'location' must not be list of JSONs")
+            raise VFileError(
+                "'location' must not be list of JSONs", file.source, file.path
+            )
         if not (vtype := location[0].get("vtype", "")):
-            raise VFileError(file, "vtype is not specified")
+            raise VFileError("vtype is not specified", file.source, file.path)
         reader = cls._vtype_readers.get(vtype, None)
         if not reader:
-            raise VFileError(file, "reader not registered", vtype)
+            raise VFileError(
+                "reader not registered", file.source, file.path, vtype=vtype
+            )
+        return reader
+    @classmethod
+    def open(cls, file: "File", location: list[dict]):
+        reader = cls._get_reader(file, location)
         return reader.open(file, location)
+    @classmethod
+    def parent(cls, file: "File", location: list[dict]) -> "File":
+        reader = cls._get_reader(file, location)
+        return reader.parent(file, location)
 class File(DataModel):
     """
@@ -181,7 +221,7 @@ class File(DataModel):
     etag: str = Field(default="")
     is_latest: bool = Field(default=True)
     last_modified: datetime = Field(default=TIME_ZERO)
-    location: Optional[Union[dict, list[dict]]] = Field(default=None)
+    location: dict | list[dict] | None = Field(default=None)
     _datachain_column_types: ClassVar[dict[str, Any]] = {
         "source": String,
@@ -213,10 +253,19 @@ class File(DataModel):
         "last_modified",
     ]
+    # Allowed kwargs we forward to TextIOWrapper
+    _TEXT_WRAPPER_ALLOWED: ClassVar[tuple[str, ...]] = (
+        "encoding",
+        "errors",
+        "newline",
+        "line_buffering",
+        "write_through",
+    )
     @staticmethod
     def _validate_dict(
-        v: Optional[Union[str, dict, list[dict]]],
-    ) -> Optional[Union[str, dict, list[dict]]]:
+        v: str | dict | list[dict] | None,
+    ) -> str | dict | list[dict] | None:
         if v is None or v == "":
             return None
         if isinstance(v, str):
@@ -236,8 +285,8 @@ class File(DataModel):
     @field_validator("path", mode="before")
     @classmethod
-    def validate_path(cls, path):
-        return Path(path).as_posix()
+    def validate_path(cls, path: str) -> str:
+        return PurePath(path).as_posix() if path else ""
     def model_dump_custom(self):
         res = self.model_dump()
@@ -248,6 +297,16 @@ class File(DataModel):
         super().__init__(**kwargs)
         self._catalog = None
         self._caching_enabled: bool = False
+        self._download_cb: Callback = DEFAULT_CALLBACK
+    def __getstate__(self):
+        state = super().__getstate__()
+        # Exclude _catalog from pickling - it contains SQLAlchemy engine and other
+        # non-picklable objects. The catalog will be re-set by _set_stream() on the
+        # worker side when needed.
+        state["__dict__"] = state["__dict__"].copy()
+        state["__dict__"]["_catalog"] = None
+        return state
     def as_text_file(self) -> "TextFile":
         """Convert the file to a `TextFile` object."""
@@ -273,19 +332,31 @@ class File(DataModel):
         file._set_stream(self._catalog, caching_enabled=self._caching_enabled)
         return file
+    def as_audio_file(self) -> "AudioFile":
+        """Convert the file to a `AudioFile` object."""
+        if isinstance(self, AudioFile):
+            return self
+        file = AudioFile(**self.model_dump())
+        file._set_stream(self._catalog, caching_enabled=self._caching_enabled)
+        return file
     @classmethod
     def upload(
-        cls, data: bytes, path: str, catalog: Optional["Catalog"] = None
+        cls,
+        data: bytes,
+        path: str | os.PathLike[str],
+        catalog: "Catalog | None" = None,
     ) -> "Self":
         if catalog is None:
-            from datachain.catalog.loader import get_catalog
-            catalog = get_catalog()
+            from datachain.query.session import Session
+            catalog = Session.get().catalog
         from datachain.client.fsspec import Client
-        client_cls = Client.get_implementation(path)
-        source, rel_path = client_cls.split_url(path)
+        path_str = stringify_path(path)
+        client_cls = Client.get_implementation(path_str)
+        source, rel_path = client_cls.split_url(path_str)
         client = catalog.get_client(client_cls.get_uri(source))
         file = client.upload(data, rel_path)
@@ -294,49 +365,150 @@ class File(DataModel):
         file._set_stream(catalog)
         return file
+    @classmethod
+    def at(
+        cls, uri: str | os.PathLike[str], session: "Session | None" = None
+    ) -> "Self":
+        """Construct a File from a full URI in one call.
+        Example:
+            file = File.at("s3://bucket/path/to/output.png")
+            with file.open("wb") as f: ...
+        """
+        from datachain.client.fsspec import Client
+        from datachain.query.session import Session
+        if session is None:
+            session = Session.get()
+        catalog = session.catalog
+        uri_str = stringify_path(uri)
+        if uri_str.endswith(("/", os.sep)):
+            raise ValueError(
+                f"File.at directory URL/path given (trailing slash), got: {uri_str}"
+            )
+        client_cls = Client.get_implementation(uri_str)
+        uri_str = client_cls.path_to_uri(uri_str)
+        source, rel_path = client_cls.split_url(uri_str)
+        source_uri = client_cls.get_uri(source)
+        file = cls(source=source_uri, path=rel_path)
+        file._set_stream(catalog)
+        return file
     @classmethod
     def _from_row(cls, row: "RowDict") -> "Self":
         return cls(**{key: row[key] for key in cls._datachain_column_types})
     @property
-    def name(self):
+    def name(self) -> str:
         return PurePosixPath(self.path).name
     @property
-    def parent(self):
+    def parent(self) -> str:
         return str(PurePosixPath(self.path).parent)
     @contextmanager
-    def open(self, mode: Literal["rb", "r"] = "rb") -> Iterator[Any]:
-        """Open the file and return a file object."""
-        if self.location:
-            with VFileRegistry.resolve(self, self.location) as f:  # type: ignore[arg-type]
-                yield f
+    def open(
+        self,
+        mode: str = "rb",
+        *,
+        client_config: dict[str, Any] | None = None,
+        **open_kwargs,
+    ) -> Iterator[Any]:
+        """Open the file and return a file-like object.
+        Supports both read ("rb", "r") and write modes (e.g. "wb", "w", "ab").
+        When opened in a write mode, metadata is refreshed after closing.
+        """
+        writing = any(ch in mode for ch in "wax+")
+        if self.location and writing:
+            raise VFileError(
+                "Writing to virtual file is not supported",
+                self.source,
+                self.path,
+            )
-        else:
+        if self._catalog is None:
+            raise RuntimeError("Cannot open file: catalog is not set")
+        base_cfg = getattr(self._catalog, "client_config", {}) or {}
+        merged_cfg = {**base_cfg, **(client_config or {})}
+        client: Client = self._catalog.get_client(self.source, **merged_cfg)
+        if not writing:
+            if self.location:
+                with VFileRegistry.open(self, self.location) as f:  # type: ignore[arg-type]
+                    yield self._wrap_text(f, mode, open_kwargs)
+                return
             if self._caching_enabled:
                 self.ensure_cached()
-            client: Client = self._catalog.get_client(self.source)
             with client.open_object(
                 self, use_cache=self._caching_enabled, cb=self._download_cb
             ) as f:
-                yield io.TextIOWrapper(f) if mode == "r" else f
+                yield self._wrap_text(f, mode, open_kwargs)
+            return
+        # write path
+        full_path = client.get_full_path(self.get_path_normalized())
+        with client.fs.open(full_path, mode, **open_kwargs) as f:
+            yield self._wrap_text(f, mode, open_kwargs)
+        version_hint = self._extract_write_version(f)
+        # refresh metadata pinned to the version that was just written
+        refreshed = client.get_file_info(
+            self.get_path_normalized(), version_id=version_hint
+        )
+        for k, v in refreshed.model_dump().items():
+            setattr(self, k, v)
+    def _wrap_text(self, f: Any, mode: str, open_kwargs: dict[str, Any]) -> Any:
+        """Return stream possibly wrapped for text."""
+        if "b" in mode or isinstance(f, io.TextIOBase):
+            return f
+        filtered = {
+            k: open_kwargs[k] for k in self._TEXT_WRAPPER_ALLOWED if k in open_kwargs
+        }
+        return io.TextIOWrapper(f, **filtered)
+    def _extract_write_version(self, handle: Any) -> str | None:
+        """Best-effort extraction of object version after a write.
+        S3 (s3fs) and Azure (adlfs) populate version_id on the handle.
+        GCS (gcsfs) populates generation. Azure and GCS require upstream
+        fixes to be released.
+        """
+        for attr in ("version_id", "generation"):
+            if value := getattr(handle, attr, None):
+                return value
+        return None
     def read_bytes(self, length: int = -1):
         """Returns file contents as bytes."""
         with self.open() as stream:
             return stream.read(length)
-    def read_text(self):
-        """Returns file contents as text."""
-        with self.open(mode="r") as stream:
+    def read_text(self, **open_kwargs):
+        """Return file contents decoded as text.
+        **open_kwargs : Any
+            Extra keyword arguments forwarded to ``open(mode="r", ...)``
+            (e.g. ``encoding="utf-8"``, ``errors="ignore"``)
+        """
+        if self.location:
+            raise VFileError(
+                "Reading text from virtual file is not supported",
+                self.source,
+                self.path,
+            )
+        with self.open(mode="r", **open_kwargs) as stream:
             return stream.read()
     def read(self, length: int = -1):
         """Returns file contents."""
         return self.read_bytes(length)
-    def save(self, destination: str, client_config: Optional[dict] = None):
+    def save(self, destination: str, client_config: dict | None = None):
         """Writes it's content to destination"""
         destination = stringify_path(destination)
         client: Client = self._catalog.get_client(destination, **(client_config or {}))
@@ -346,7 +518,7 @@ class File(DataModel):
         client.upload(self.read(), destination)
-    def _symlink_to(self, destination: str):
+    def _symlink_to(self, destination: str) -> None:
         if self.location:
             raise OSError(errno.ENOTSUP, "Symlinking virtual file is not supported")
@@ -355,7 +527,7 @@ class File(DataModel):
             source = self.get_local_path()
             assert source, "File was not cached"
         elif self.source.startswith("file://"):
-            source = self.get_path()
+            source = self.get_fs_path()
         else:
             raise OSError(errno.EXDEV, "can't link across filesystems")
@@ -363,11 +535,11 @@ class File(DataModel):
     def export(
         self,
-        output: Union[str, os.PathLike[str]],
+        output: str | os.PathLike[str],
         placement: ExportPlacement = "fullpath",
         use_cache: bool = True,
         link_type: Literal["copy", "symlink"] = "copy",
-        client_config: Optional[dict] = None,
+        client_config: dict | None = None,
     ) -> None:
         """Export file to new location."""
         self._caching_enabled = use_cache
@@ -403,18 +575,22 @@ class File(DataModel):
         client = self._catalog.get_client(self.source)
         client.download(self, callback=self._download_cb)
-    async def _prefetch(self, download_cb: Optional["Callback"] = None) -> bool:
+    async def _prefetch(self, download_cb: "Callback | None" = None) -> bool:
         if self._catalog is None:
             raise RuntimeError("cannot prefetch file because catalog is not setup")
+        file = self
+        if self.location:
+            file = VFileRegistry.parent(self, self.location)  # type: ignore[arg-type]
         client = self._catalog.get_client(self.source)
-        await client._download(self, callback=download_cb or self._download_cb)
-        self._set_stream(
+        await client._download(file, callback=download_cb or self._download_cb)
+        file._set_stream(
             self._catalog, caching_enabled=True, download_cb=DEFAULT_CALLBACK
         )
         return True
-    def get_local_path(self) -> Optional[str]:
+    def get_local_path(self) -> str | None:
         """Return path to a file in a local cache.
         Returns None if file is not cached.
@@ -432,31 +608,66 @@ class File(DataModel):
     def get_file_ext(self):
         """Returns last part of file name without `.`."""
-        return PurePosixPath(self.path).suffix.strip(".")
+        return PurePosixPath(self.path).suffix.lstrip(".")
     def get_file_stem(self):
         """Returns file name without extension."""
         return PurePosixPath(self.path).stem
     def get_full_name(self):
-        """Returns name with parent directories."""
+        """
+        [DEPRECATED] Use `file.path` directly instead.
+        Returns name with parent directories.
+        """
+        warnings.warn(
+            "file.get_full_name() is deprecated and will be removed "
+            "in a future version. Use `file.path` directly.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
         return self.path
-    def get_uri(self):
+    def get_path_normalized(self) -> str:
+        if not self.path:
+            raise FileError("path must not be empty", self.source, self.path)
+        if self.path.endswith("/"):
+            raise FileError("path must not be a directory", self.source, self.path)
+        normpath = os.path.normpath(self.path)
+        normpath = PurePath(normpath).as_posix()
+        if normpath == ".":
+            raise FileError("path must not be a directory", self.source, self.path)
+        if any(part == ".." for part in PurePath(normpath).parts):
+            raise FileError("path must not contain '..'", self.source, self.path)
+        return normpath
+    def get_uri(self) -> str:
         """Returns file URI."""
-        return f"{self.source}/{self.get_full_name()}"
+        return f"{self.source}/{self.get_path_normalized()}"
+    def get_fs_path(self) -> str:
+        """
+        Returns file path with respect to the filescheme.
+        If `normalize` is True, the path is normalized to remove any redundant
+        separators and up-level references.
-    def get_path(self) -> str:
-        """Returns file path."""
+        If the file scheme is "file", the path is converted to a local file path
+        using `url2pathname`. Otherwise, the original path with scheme is returned.
+        """
         path = unquote(self.get_uri())
-        source = urlparse(self.source)
-        if source.scheme == "file":
-            path = urlparse(path).path
-            path = url2pathname(path)
+        path_parsed = urlparse(path)
+        if path_parsed.scheme == "file":
+            path = url2pathname(path_parsed.path)
         return path
     def get_destination_path(
-        self, output: Union[str, os.PathLike[str]], placement: ExportPlacement
+        self, output: str | os.PathLike[str], placement: ExportPlacement
     ) -> str:
         """
         Returns full destination path of a file for exporting to some output
@@ -467,10 +678,12 @@ class File(DataModel):
         elif placement == "etag":
             path = f"{self.etag}{self.get_file_suffix()}"
         elif placement == "fullpath":
-            path = unquote(self.get_full_name())
+            path = unquote(self.get_path_normalized())
             source = urlparse(self.source)
             if source.scheme and source.scheme != "file":
                 path = posixpath.join(source.netloc, path)
+        elif placement == "filepath":
+            path = unquote(self.get_path_normalized())
         elif placement == "checksum":
             raise NotImplementedError("Checksum placement not implemented yet")
         else:
@@ -505,9 +718,10 @@ class File(DataModel):
             ) from e
         try:
-            info = client.fs.info(client.get_full_path(self.path))
-            converted_info = client.info_to_file(info, self.path)
-            return type(self)(
+            normalized_path = self.get_path_normalized()
+            info = client.fs.info(client.get_full_path(normalized_path))
+            converted_info = client.info_to_file(info, normalized_path)
+            res = type(self)(
                 path=self.path,
                 source=self.source,
                 size=converted_info.size,
@@ -517,10 +731,21 @@ class File(DataModel):
                 last_modified=converted_info.last_modified,
                 location=self.location,
             )
+            res._set_stream(self._catalog)
+            return res
+        except FileError as e:
+            logger.warning(
+                "File error when resolving %s/%s: %s", self.source, self.path, str(e)
+            )
         except (FileNotFoundError, PermissionError, OSError) as e:
-            logger.warning("File system error when resolving %s: %s", self.path, str(e))
+            logger.warning(
+                "File system error when resolving %s/%s: %s",
+                self.source,
+                self.path,
+                str(e),
+            )
-        return type(self)(
+        res = type(self)(
             path=self.path,
             source=self.source,
             size=0,
@@ -530,10 +755,48 @@ class File(DataModel):
             last_modified=TIME_ZERO,
             location=self.location,
         )
+        res._set_stream(self._catalog)
+        return res
+    def rebase(
+        self,
+        old_base: str,
+        new_base: str,
+        suffix: str = "",
+        extension: str = "",
+    ) -> str:
+        """
+        Rebase the file's URI from one base directory to another.
+        Args:
+            old_base: Base directory to remove from the file's URI
+            new_base: New base directory to prepend
+            suffix: Optional suffix to add before file extension
+            extension: Optional new file extension (without dot)
+        Returns:
+            str: Rebased URI with new base directory
+        Raises:
+            ValueError: If old_base is not found in the file's URI
+        Examples:
+            >>> file = File(source="s3://bucket", path="data/2025-05-27/file.wav")
+            >>> file.rebase("s3://bucket/data", "s3://output-bucket/processed", \
+                    extension="mp3")
+            's3://output-bucket/processed/2025-05-27/file.mp3'
+            >>> file.rebase("data/audio", "/local/output", suffix="_ch1",
+                    extension="npy")
+            '/local/output/file_ch1.npy'
+        """
+        return rebase_path(self.get_uri(), old_base, new_base, suffix, extension)
 def resolve(file: File) -> File:
     """
+    [DEPRECATED] Use `file.resolve()` directly instead.
     Resolve a File object by checking its existence and updating its metadata.
     This function is a wrapper around the File.resolve() method, designed to be
@@ -549,6 +812,12 @@ def resolve(file: File) -> File:
         RuntimeError: If the file's catalog is not set or if
         the file source protocol is unsupported.
     """
+    warnings.warn(
+        "resolve() is deprecated and will be removed "
+        "in a future version. Use file.resolve() directly.",
+        DeprecationWarning,
+        stacklevel=2,
+    )
     return file.resolve()
@@ -556,17 +825,30 @@ class TextFile(File):
     """`DataModel` for reading text files."""
     @contextmanager
-    def open(self, mode: Literal["rb", "r"] = "r"):
-        """Open the file and return a file object (default to text mode)."""
-        with super().open(mode=mode) as stream:
+    def open(
+        self,
+        mode: str = "r",
+        *,
+        client_config: dict[str, Any] | None = None,
+        **open_kwargs,
+    ) -> Iterator[Any]:
+        """Open the file and yield a file-like object (text mode by default).
+        Args:
+            mode: Mode passed to the parent ``open()``; defaults to ``"r"``.
+            client_config: Optional client configuration, forwarded unchanged
+                to the parent ``open()``.
+            **open_kwargs: Extra keyword arguments forwarded to the parent
+                ``open()``.
+        Yields:
+            The stream produced by the parent ``open()``.
+        """
+        with super().open(
+            mode=mode, client_config=client_config, **open_kwargs
+        ) as stream:
             yield stream
-    def read_text(self):
-        """Returns file contents as text."""
-        with self.open() as stream:
+    def read_text(self, **open_kwargs):
+        """Return file contents as text.
+        Args:
+            **open_kwargs: Extra keyword arguments forwarded to ``open()``
+                (e.g. encoding).
+        Returns:
+            The result of ``read()`` on the stream yielded by ``open()``.
+        """
+        with self.open(**open_kwargs) as stream:
             return stream.read()
-    def save(self, destination: str, client_config: Optional[dict] = None):
+    def save(self, destination: str, client_config: dict | None = None):
         """Writes it's content to destination"""
         destination = stringify_path(destination)
@@ -599,13 +881,30 @@ class ImageFile(File):
     def save(  # type: ignore[override]
         self,
         destination: str,
-        format: Optional[str] = None,
-        client_config: Optional[dict] = None,
+        format: str | None = None,
+        client_config: dict | None = None,
     ):
         """Writes it's content to destination"""
         destination = stringify_path(destination)
         client: Client = self._catalog.get_client(destination, **(client_config or {}))
+        # If format is not provided, determine it from the file extension:
+        # the lower-cased suffix of `destination` is looked up in Pillow's
+        # registered extensions.
+        if format is None:
+            # Imported lazily, only when the format actually has to be inferred.
+            from pathlib import PurePosixPath
+            from PIL import Image as PilImage
+            ext = PurePosixPath(destination).suffix.lower()
+            format = PilImage.registered_extensions().get(ext)
+        # Falsy here means either the extension lookup found nothing or the
+        # caller passed an empty `format`; in both cases there is nothing to
+        # save with, so fail before opening the destination for writing.
+        if not format:
+            raise FileError(
+                f"Can't determine format for destination '{destination}'",
+                self.source,
+                self.path,
+            )
         with client.fs.open(destination, mode="wb") as f:
             self.read().save(f, format=format)
@@ -665,7 +964,7 @@ class VideoFile(File):
     def get_frames(
         self,
         start: int = 0,
-        end: Optional[int] = None,
+        end: int | None = None,
         step: int = 1,
     ) -> "Iterator[VideoFrame]":
         """
@@ -704,7 +1003,10 @@ class VideoFile(File):
             VideoFragment: A Model representing the video fragment.
         """
         if start < 0 or end < 0 or start >= end:
-            raise ValueError(f"Invalid time range: ({start:.3f}, {end:.3f})")
+            raise ValueError(
+                f"Can't get video fragment for '{self.path}', "
+                f"invalid time range: ({start:.3f}, {end:.3f})"
+            )
         return VideoFragment(video=self, start=start, end=end)
@@ -712,7 +1014,7 @@ class VideoFile(File):
         self,
         duration: float,
         start: float = 0,
-        end: Optional[float] = None,
+        end: float | None = None,
     ) -> "Iterator[VideoFragment]":
         """
         Splits the video into multiple fragments of a specified duration.
@@ -748,6 +1050,189 @@ class VideoFile(File):
             start += duration
+class AudioFile(File):
+    """
+    A data model for handling audio files.
+    This model inherits from the `File` model and provides additional functionality
+    for reading audio files, extracting audio fragments, and splitting audio into
+    fragments.
+    """
+    def get_info(self) -> "Audio":
+        """
+        Retrieves metadata and information about the audio file. Where possible
+        the file is not downloaded and only its header is read, so it might be
+        a good idea to disable caching and prefetching for the UDF if you only
+        need audio metadata.
+        Returns:
+            Audio: A Model containing audio metadata such as duration,
+                   sample rate, channels, and codec details.
+        """
+        from .audio import audio_info
+        return audio_info(self)
+    def get_fragment(self, start: float, end: float) -> "AudioFragment":
+        """
+        Returns an audio fragment for the specified time range. Nothing is
+        downloaded or extracted here: the result is only a Model describing the
+        fragment, which can be used to read or save it later.
+        Args:
+            start (float): The start time of the fragment in seconds.
+            end (float): The end time of the fragment in seconds.
+        Returns:
+            AudioFragment: A Model representing the audio fragment.
+        Raises:
+            ValueError: If `start` or `end` is negative, or if `start >= end`.
+        """
+        if start < 0 or end < 0 or start >= end:
+            raise ValueError(
+                f"Can't get audio fragment for '{self.path}', "
+                f"invalid time range: ({start:.3f}, {end:.3f})"
+            )
+        return AudioFragment(audio=self, start=start, end=end)
+    def get_fragments(
+        self,
+        duration: float,
+        start: float = 0,
+        end: float | None = None,
+    ) -> "Iterator[AudioFragment]":
+        """
+        Splits the audio into multiple fragments of a specified duration.
+        Consecutive fragments are contiguous: each one starts exactly where the
+        previous one ends, and the last one is truncated at `end`.
+        Args:
+            duration (float): The duration of each audio fragment in seconds.
+            start (float): The starting time in seconds (default: 0).
+            end (float, optional): The ending time in seconds. If None, the entire
+                                   remaining audio is processed (default: None).
+        Returns:
+            Iterator[AudioFragment]: An iterator yielding audio fragments.
+        Raises:
+            ValueError: If `duration` is not positive, `start` or `end` is
+                negative, or `start >= end`. Because this is a generator, the
+                error is raised when iteration begins, not at call time.
+        Note:
+            If end is not specified, the duration is read from the audio file
+            via `get_info()`, which may require accessing (and possibly
+            downloading) the file.
+        """
+        if duration <= 0:
+            raise ValueError("duration must be a positive float")
+        if start < 0:
+            raise ValueError("start must be a non-negative float")
+        if end is None:
+            # `Audio.duration` defaults to -1.0 when unknown; that value is
+            # rejected by the non-negative check just below.
+            end = self.get_info().duration
+        if end < 0:
+            raise ValueError("end must be a non-negative float")
+        if start >= end:
+            raise ValueError("start must be less than end")
+        # Derive every boundary from the original `start` (start + i * duration)
+        # instead of repeatedly adding `duration`. Accumulated float rounding
+        # (ten additions of 0.1 give 0.9999999999999999, not 1.0) would
+        # otherwise emit a spurious, near-zero-length final fragment.
+        index = 0
+        fragment_start = start
+        while fragment_start < end:
+            index += 1
+            next_start = start + index * duration
+            yield self.get_fragment(fragment_start, min(next_start, end))
+            fragment_start = next_start
+    def save(  # type: ignore[override]
+        self,
+        output: str,
+        format: str | None = None,
+        start: float = 0,
+        end: float | None = None,
+        client_config: dict | None = None,
+    ) -> "AudioFile":
+        """Save audio file or extract fragment to specified format.
+        Args:
+            output: Output directory path
+            format: Output format ('wav', 'mp3', etc). Defaults to source format
+            start: Start time in seconds (>= 0). Defaults to 0
+            end: End time in seconds. If None, extracts to end of file
+            client_config: Optional client configuration.
+                NOTE(review): accepted but not forwarded to `save_audio` here;
+                confirm whether that is intended.
+        Returns:
+            AudioFile: New audio file with format conversion/extraction applied
+        Examples:
+            audio.save("/path", "mp3")                        # Entire file to MP3
+            audio.save("s3://bucket/path", "wav", start=2.5)  # From 2.5s to end as WAV
+            audio.save("/path", "flac", start=1, end=3)       # 1-3s fragment as FLAC
+        """
+        from .audio import save_audio
+        return save_audio(self, output, format, start, end)
+class AudioFragment(DataModel):
+    """
+    A data model describing a time slice of an audio file.
+    The fragment is defined by its parent audio file plus start and end times.
+    It can be decoded to a NumPy array, encoded to bytes, or saved as a
+    separate audio file.
+    Attributes:
+        audio (AudioFile): The audio file containing the audio fragment.
+        start (float): The starting time of the audio fragment in seconds.
+        end (float): The ending time of the audio fragment in seconds.
+    """
+    audio: AudioFile
+    start: float
+    end: float
+    def get_np(self) -> tuple["ndarray", int]:
+        """
+        Decode the fragment into a NumPy array.
+        Returns:
+            tuple[ndarray, int]: The audio data as a NumPy array together with
+                               the sample rate.
+        """
+        from .audio import audio_to_np
+        # The helper takes a start offset and a length, not an end time.
+        return audio_to_np(self.audio, self.start, self.end - self.start)
+    def read_bytes(self, format: str = "wav") -> bytes:
+        """
+        Encode the fragment as audio bytes.
+        Args:
+            format (str): The desired audio format (e.g., 'wav', 'mp3').
+                         Defaults to 'wav'.
+        Returns:
+            bytes: The encoded audio fragment as bytes.
+        """
+        from .audio import audio_to_bytes
+        # The helper takes a start offset and a length, not an end time.
+        return audio_to_bytes(self.audio, format, self.start, self.end - self.start)
+    def save(self, output: str, format: str | None = None) -> "AudioFile":
+        """
+        Write the fragment out as a new audio file.
+        If `output` is a remote path, the audio file will be uploaded to remote storage.
+        Args:
+            output (str): The destination path, which can be a local file path
+                          or a remote URL.
+            format (str, optional): The output audio format (e.g., 'wav', 'mp3').
+                                    If None, the format is inferred from the
+                                    file extension.
+        Returns:
+            AudioFile: A Model representing the saved audio file.
+        """
+        from .audio import save_audio
+        fragment_start, fragment_end = self.start, self.end
+        return save_audio(self.audio, output, format, fragment_start, fragment_end)
 class VideoFrame(DataModel):
     """
     A data model for representing a video frame.
@@ -830,7 +1315,7 @@ class VideoFragment(DataModel):
     start: float
     end: float
-    def save(self, output: str, format: Optional[str] = None) -> "VideoFile":
+    def save(self, output: str, format: str | None = None) -> "VideoFile":
         """
         Saves the video fragment as a new video file.
@@ -878,6 +1363,52 @@ class Video(DataModel):
     codec: str = Field(default="")
+class Audio(DataModel):
+    """
+    Metadata describing an audio file.
+    Numeric fields use -1 (or -1.0) and string fields use an empty string when
+    the value is unknown.
+    Attributes:
+        sample_rate (int): The sample rate of the audio (samples per second).
+                          Defaults to -1 if unknown.
+        channels (int): The number of audio channels. Defaults to -1 if unknown.
+        duration (float): The total duration of the audio in seconds.
+                         Defaults to -1.0 if unknown.
+        samples (int): The total number of samples in the audio.
+                      Defaults to -1 if unknown.
+        format (str): The format of the audio file (e.g., 'wav', 'mp3').
+                     Defaults to an empty string.
+        codec (str): The codec used for encoding the audio. Defaults to an empty string.
+        bit_rate (int): The bit rate of the audio in bits per second.
+                       Defaults to -1 if unknown.
+    """
+    sample_rate: int = Field(default=-1)
+    channels: int = Field(default=-1)
+    duration: float = Field(default=-1.0)
+    samples: int = Field(default=-1)
+    format: str = Field(default="")
+    codec: str = Field(default="")
+    bit_rate: int = Field(default=-1)
+    @staticmethod
+    def get_channel_name(num_channels: int, channel_idx: int) -> str:
+        """Return a readable name for a channel of a common audio layout.
+        Known layouts (1, 2, 4, 6 and 8 channels) map an in-range index to a
+        conventional name; anything else falls back to ``Ch<index + 1>``.
+        """
+        layouts = {
+            1: ["Mono"],
+            2: ["Left", "Right"],
+            4: ["W", "X", "Y", "Z"],  # First-order Ambisonics
+            6: ["FL", "FR", "FC", "LFE", "BL", "BR"],  # 5.1 surround
+            8: ["FL", "FR", "FC", "LFE", "BL", "BR", "SL", "SR"],  # 7.1 surround
+        }
+        # An unknown layout yields an empty list, so the range test below
+        # fails and the generic fallback name is used.
+        names = layouts.get(num_channels, [])
+        if 0 <= channel_idx < len(names):
+            return names[channel_idx]
+        return f"Ch{channel_idx + 1}"
 class ArrowRow(DataModel):
     """`DataModel` for reading row from Arrow-supported file."""
@@ -896,7 +1427,7 @@ class ArrowRow(DataModel):
             ds = dataset(path, **self.kwargs)
         else:
-            path = self.file.get_path()
+            path = self.file.get_fs_path()
             ds = dataset(path, filesystem=self.file.get_fs(), **self.kwargs)
         return ds.take([self.index]).to_reader()
@@ -915,5 +1446,7 @@ def get_file_type(type_: FileType = "binary") -> type[File]:
         file = ImageFile  # type: ignore[assignment]
     elif type_ == "video":
         file = VideoFile
+    elif type_ == "audio":
+        file = AudioFile
     return file

datachain 0.14.2__py3-none-any.whl → 0.39.0__py3-none-any.whl

datachain 0.14.2py3-none-any.whl → 0.39.0py3-none-any.whl