datachain 0.11.11__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic.
- datachain/catalog/catalog.py +33 -5
- datachain/catalog/loader.py +19 -13
- datachain/cli/__init__.py +2 -1
- datachain/cli/parser/studio.py +13 -1
- datachain/client/fsspec.py +12 -16
- datachain/client/hf.py +36 -14
- datachain/client/local.py +1 -4
- datachain/data_storage/warehouse.py +3 -8
- datachain/dataset.py +8 -0
- datachain/error.py +0 -12
- datachain/fs/utils.py +30 -0
- datachain/func/__init__.py +5 -0
- datachain/func/func.py +2 -1
- datachain/lib/dc.py +23 -8
- datachain/lib/file.py +55 -17
- datachain/lib/image.py +30 -6
- datachain/lib/listing.py +21 -39
- datachain/lib/video.py +7 -5
- datachain/model/bbox.py +209 -58
- datachain/model/pose.py +49 -37
- datachain/model/segment.py +22 -18
- datachain/model/ultralytics/bbox.py +9 -9
- datachain/model/ultralytics/pose.py +7 -7
- datachain/model/ultralytics/segment.py +7 -7
- datachain/model/utils.py +191 -0
- datachain/query/dataset.py +4 -2
- datachain/studio.py +8 -6
- datachain/utils.py +0 -16
- {datachain-0.11.11.dist-info → datachain-0.12.0.dist-info}/METADATA +4 -2
- {datachain-0.11.11.dist-info → datachain-0.12.0.dist-info}/RECORD +34 -32
- {datachain-0.11.11.dist-info → datachain-0.12.0.dist-info}/WHEEL +1 -1
- {datachain-0.11.11.dist-info → datachain-0.12.0.dist-info}/LICENSE +0 -0
- {datachain-0.11.11.dist-info → datachain-0.12.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.11.11.dist-info → datachain-0.12.0.dist-info}/top_level.txt +0 -0
datachain/lib/file.py
CHANGED
@@ -18,7 +18,6 @@ from urllib.request import url2pathname
 
 from fsspec.callbacks import DEFAULT_CALLBACK, Callback
 from fsspec.utils import stringify_path
-from PIL import Image as PilImage
 from pydantic import Field, field_validator
 
 from datachain.client.fileslice import FileSlice
@@ -52,7 +51,7 @@ class FileExporter(NodesThreadPool):
 
     def __init__(
         self,
-        output: str,
+        output: Union[str, os.PathLike[str]],
         placement: ExportPlacement,
         use_cache: bool,
         link_type: Literal["copy", "symlink"],
@@ -243,6 +242,30 @@ class File(DataModel):
         self._catalog = None
         self._caching_enabled: bool = False
 
+    def as_text_file(self) -> "TextFile":
+        """Convert the file to a `TextFile` object."""
+        if isinstance(self, TextFile):
+            return self
+        file = TextFile(**self.model_dump())
+        file._set_stream(self._catalog, caching_enabled=self._caching_enabled)
+        return file
+
+    def as_image_file(self) -> "ImageFile":
+        """Convert the file to a `ImageFile` object."""
+        if isinstance(self, ImageFile):
+            return self
+        file = ImageFile(**self.model_dump())
+        file._set_stream(self._catalog, caching_enabled=self._caching_enabled)
+        return file
+
+    def as_video_file(self) -> "VideoFile":
+        """Convert the file to a `VideoFile` object."""
+        if isinstance(self, VideoFile):
+            return self
+        file = VideoFile(**self.model_dump())
+        file._set_stream(self._catalog, caching_enabled=self._caching_enabled)
+        return file
+
     @classmethod
     def upload(
         cls, data: bytes, path: str, catalog: Optional["Catalog"] = None
@@ -292,20 +315,20 @@ class File(DataModel):
         ) as f:
             yield io.TextIOWrapper(f) if mode == "r" else f
 
-    def read(self, length: int = -1):
-        """Returns file contents."""
+    def read_bytes(self, length: int = -1):
+        """Returns file contents as bytes."""
         with self.open() as stream:
             return stream.read(length)
 
-    def read_bytes(self):
-        """Returns file contents as bytes."""
-        return self.read()
-
     def read_text(self):
         """Returns file contents as text."""
         with self.open(mode="r") as stream:
             return stream.read()
 
+    def read(self, length: int = -1):
+        """Returns file contents."""
+        return self.read_bytes(length)
+
     def save(self, destination: str, client_config: Optional[dict] = None):
         """Writes it's content to destination"""
         destination = stringify_path(destination)
@@ -333,7 +356,7 @@ class File(DataModel):
 
     def export(
         self,
-        output: str,
+        output: Union[str, os.PathLike[str]],
         placement: ExportPlacement = "fullpath",
         use_cache: bool = True,
         link_type: Literal["copy", "symlink"] = "copy",
@@ -374,15 +397,10 @@ class File(DataModel):
         client.download(self, callback=self._download_cb)
 
     async def _prefetch(self, download_cb: Optional["Callback"] = None) -> bool:
-        from datachain.client.hf import HfClient
-
         if self._catalog is None:
             raise RuntimeError("cannot prefetch file because catalog is not setup")
 
         client = self._catalog.get_client(self.source)
-        if client.protocol == HfClient.protocol:
-            return False
-
         await client._download(self, callback=download_cb or self._download_cb)
         self._set_stream(
             self._catalog, caching_enabled=True, download_cb=DEFAULT_CALLBACK
@@ -430,7 +448,9 @@ class File(DataModel):
         path = url2pathname(path)
         return path
 
-    def get_destination_path(self, output: str, placement: ExportPlacement) -> str:
+    def get_destination_path(
+        self, output: Union[str, os.PathLike[str]], placement: ExportPlacement
+    ) -> str:
         """
         Returns full destination path of a file for exporting to some output
         based on export placement
@@ -551,18 +571,36 @@ class TextFile(File):
 class ImageFile(File):
     """`DataModel` for reading image files."""
 
+    def get_info(self) -> "Image":
+        """
+        Retrieves metadata and information about the image file.
+
+        Returns:
+            Image: A Model containing image metadata such as width, height and format.
+        """
+        from .image import image_info
+
+        return image_info(self)
+
     def read(self):
         """Returns `PIL.Image.Image` object."""
+        from PIL import Image as PilImage
+
         fobj = super().read()
         return PilImage.open(BytesIO(fobj))
 
-    def save(self, destination: str, client_config: Optional[dict] = None):
+    def save(  # type: ignore[override]
+        self,
+        destination: str,
+        format: Optional[str] = None,
+        client_config: Optional[dict] = None,
+    ):
         """Writes it's content to destination"""
         destination = stringify_path(destination)
 
         client: Client = self._catalog.get_client(destination, **(client_config or {}))
         with client.fs.open(destination, mode="wb") as f:
-            self.read().save(f)
+            self.read().save(f, format=format)
 
 
 class Image(DataModel):
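Taken together, these File changes mean any generic File row can be re-typed as a text, image, or video file, and byte reads accept an optional length. A minimal sketch of how the new helpers could be combined in a per-row function (the function name here is ours, not part of the library):

from datachain.lib.file import File

def image_meta(file: File):
    # New read_bytes(length) signature: peek at the first bytes only.
    header = file.read_bytes(16)
    # New as_image_file() helper: re-type the File, then read its metadata.
    info = file.as_image_file().get_info()
    return header, info.width, info.height, info.format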
datachain/lib/image.py
CHANGED
@@ -1,17 +1,41 @@
 from typing import Callable, Optional, Union
 
 import torch
-from PIL import Image
+from PIL import Image as PILImage
+
+from datachain.lib.file import File, FileError, Image, ImageFile
+
+
+def image_info(file: Union[File, ImageFile]) -> Image:
+    """
+    Returns image file information.
+
+    Args:
+        file (ImageFile): Image file object.
+
+    Returns:
+        Image: Image file information.
+    """
+    try:
+        img = file.as_image_file().read()
+    except Exception as exc:
+        raise FileError(file, "unable to open image file") from exc
+
+    return Image(
+        width=img.width,
+        height=img.height,
+        format=img.format or "",
+    )
 
 
 def convert_image(
-    img: Image.Image,
+    img: PILImage.Image,
     mode: str = "RGB",
     size: Optional[tuple[int, int]] = None,
     transform: Optional[Callable] = None,
     encoder: Optional[Callable] = None,
     device: Optional[Union[str, torch.device]] = None,
-) -> Union[Image.Image, torch.Tensor]:
+) -> Union[PILImage.Image, torch.Tensor]:
     """
     Resize, transform, and otherwise convert an image.
 
@@ -47,13 +71,13 @@ def convert_image(
 
 
 def convert_images(
-    images: Union[Image.Image, list[Image.Image]],
+    images: Union[PILImage.Image, list[PILImage.Image]],
     mode: str = "RGB",
     size: Optional[tuple[int, int]] = None,
     transform: Optional[Callable] = None,
     encoder: Optional[Callable] = None,
     device: Optional[Union[str, torch.device]] = None,
-) -> Union[list[Image.Image], torch.Tensor]:
+) -> Union[list[PILImage.Image], torch.Tensor]:
     """
     Resize, transform, and otherwise convert one or more images.
 
@@ -65,7 +89,7 @@ def convert_images(
         encoder (Callable): Encode image using model.
         device (str or torch.device): Device to use.
     """
-    if isinstance(images, Image.Image):
+    if isinstance(images, PILImage.Image):
         images = [images]
 
     converted = [
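The new image_info() accepts either a plain File or an ImageFile and returns the Image model, raising FileError when the object cannot be opened as an image. A small sketch of defensive use (the helper name is ours):

from datachain.lib.file import File, FileError
from datachain.lib.image import image_info

def safe_dimensions(file: File) -> tuple[int, int]:
    # image_info() converts via File.as_image_file() internally;
    # FileError signals an unreadable or non-image object.
    try:
        info = image_info(file)
    except FileError:
        return (0, 0)
    return (info.width, info.height)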
datachain/lib/listing.py
CHANGED
@@ -1,19 +1,21 @@
+import glob
 import logging
 import os
 import posixpath
 from collections.abc import Iterator
-from
+from contextlib import contextmanager
+from typing import TYPE_CHECKING, Callable, Optional, TypeVar, Union
 
 from fsspec.asyn import get_loop
 from sqlalchemy.sql.expression import true
 
+import datachain.fs.utils as fsutils
 from datachain.asyn import iter_over_async
 from datachain.client import Client
-from datachain.error import
+from datachain.error import ClientError
 from datachain.lib.file import File
 from datachain.query.schema import Column
 from datachain.sql.functions import path as pathfunc
-from datachain.telemetry import telemetry
 from datachain.utils import uses_glob
 
 if TYPE_CHECKING:
@@ -92,38 +94,6 @@ def ls(
     return dc.filter(pathfunc.parent(_file_c("path")) == path.lstrip("/").rstrip("/*"))
 
 
-def _isfile(client: "Client", path: str) -> bool:
-    """
-    Returns True if uri points to a file
-    """
-    try:
-        if "://" in path:
-            # This makes sure that the uppercase scheme is converted to lowercase
-            scheme, path = path.split("://", 1)
-            path = f"{scheme.lower()}://{path}"
-
-        if os.name == "nt" and "*" in path:
-            # On Windows, the glob pattern "*" is not supported
-            return False
-
-        info = client.fs.info(path)
-        name = info.get("name")
-        # case for special simulated directories on some clouds
-        # e.g. Google creates a zero byte file with the same name as the
-        # directory with a trailing slash at the end
-        if not name or name.endswith("/"):
-            return False
-
-        return info["type"] == "file"
-    except FileNotFoundError:
-        return False
-    except REMOTE_ERRORS as e:
-        raise ClientError(
-            message=str(e),
-            error_code=getattr(e, "code", None),
-        ) from e
-
-
 def parse_listing_uri(uri: str, client_config) -> tuple[str, str, str]:
     """
     Parsing uri and returns listing dataset name, listing uri and listing path
@@ -156,8 +126,16 @@ def listing_uri_from_name(dataset_name: str) -> str:
     return dataset_name.removeprefix(LISTING_PREFIX)
 
 
+@contextmanager
+def _reraise_as_client_error() -> Iterator[None]:
+    try:
+        yield
+    except Exception as e:
+        raise ClientError(message=str(e), error_code=getattr(e, "code", None)) from e
+
+
 def get_listing(
-    uri: str, session: "Session", update: bool = False
+    uri: Union[str, os.PathLike[str]], session: "Session", update: bool = False
 ) -> tuple[Optional[str], str, str, bool]:
     """Returns correct listing dataset name that must be used for saving listing
     operation. It takes into account existing listings and reusability of those.
@@ -167,6 +145,7 @@ def get_listing(
     be used to find rows based on uri.
     """
     from datachain.client.local import FileClient
+    from datachain.telemetry import telemetry
 
     catalog = session.catalog
     cache = catalog.cache
@@ -174,11 +153,14 @@ def get_listing(
 
     client = Client.get_client(uri, cache, **client_config)
     telemetry.log_param("client", client.PREFIX)
+    if not isinstance(uri, str):
+        uri = os.fspath(uri)
 
     # we don't want to use cached dataset (e.g. for a single file listing)
-
-
-
+    isfile = _reraise_as_client_error()(fsutils.isfile)
+    if not glob.has_magic(uri) and not uri.endswith("/") and isfile(client.fs, uri):
+        _, path = Client.parse_url(uri)
+        return None, uri, path, False
 
     ds_name, list_uri, list_path = parse_listing_uri(uri, client_config)
     listing = None
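The replacement for the removed _isfile() leans on the fact that objects produced by contextlib.contextmanager can also wrap a callable, so _reraise_as_client_error()(fsutils.isfile) yields an isfile that converts any failure into ClientError. A self-contained sketch of the same pattern, with an illustrative error type and function rather than datachain's:

from collections.abc import Iterator
from contextlib import contextmanager

class WrappedError(Exception):
    """Stand-in for an error type like ClientError."""

@contextmanager
def reraise() -> Iterator[None]:
    try:
        yield
    except Exception as e:
        raise WrappedError(str(e)) from e

def flaky(x: int) -> int:
    if x < 0:
        raise ValueError("negative input")
    return x * 2

safe_flaky = reraise()(flaky)  # context manager instance used as a function wrapper
assert safe_flaky(3) == 6
# safe_flaky(-1) raises WrappedError instead of ValueError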
datachain/lib/video.py
CHANGED
@@ -1,11 +1,11 @@
 import posixpath
 import shutil
 import tempfile
-from typing import Optional
+from typing import Optional, Union
 
 from numpy import ndarray
 
-from datachain.lib.file import FileError, ImageFile, Video, VideoFile
+from datachain.lib.file import File, FileError, ImageFile, Video, VideoFile
 
 try:
     import ffmpeg
@@ -18,7 +18,7 @@ except ImportError as exc:
     ) from exc
 
 
-def video_info(file: VideoFile) -> Video:
+def video_info(file: Union[File, VideoFile]) -> Video:
     """
     Returns video file information.
 
@@ -28,6 +28,8 @@ def video_info(file: VideoFile) -> Video:
     Returns:
         Video: Video file information.
     """
+    file = file.as_video_file()
+
     if not (file_path := file.get_local_path()):
         file.ensure_cached()
         file_path = file.get_local_path()
@@ -170,7 +172,7 @@ def save_video_frame(
     output_file = posixpath.join(
         output, f"{video.get_file_stem()}_{frame:04d}.{format}"
     )
-    return ImageFile.upload(img, output_file)
+    return ImageFile.upload(img, output_file, catalog=video._catalog)
 
 
 def save_video_fragment(
@@ -218,6 +220,6 @@ def save_video_fragment(
         ).output(output_file_tmp).run(quiet=True)
 
         with open(output_file_tmp, "rb") as f:
-            return VideoFile.upload(f.read(), output_file)
+            return VideoFile.upload(f.read(), output_file, catalog=video._catalog)
     finally:
         shutil.rmtree(temp_dir)
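video_info() now coerces its argument with as_video_file(), so a generic File row can be probed directly, and the frame/fragment helpers upload through the source video's catalog. A minimal sketch (the wrapper name is ours):

from datachain.lib.file import File
from datachain.lib.video import video_info

def probe(file: File):
    # Previously a VideoFile was required; a plain File is now converted internally.
    return video_info(file)  # -> Video model describing the stream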
datachain/model/bbox.py
CHANGED
@@ -1,47 +1,216 @@
+from collections.abc import Sequence
+from typing import TYPE_CHECKING, Union
+
 from pydantic import Field
 
 from datachain.lib.data_model import DataModel
 
+from .utils import convert_bbox, validate_bbox
+
+if TYPE_CHECKING:
+    from .pose import Pose, Pose3D
+
 
 class BBox(DataModel):
     """
-    A data model
+    A data model representing a bounding box.
 
     Attributes:
-        title (str): The title
-        coords (list[int]):
+        title (str): The title or label associated with the bounding box.
+        coords (list[int]): A list of four bounding box coordinates.
 
-    The bounding box
-    - (x1, y1)
-    - (x2, y2)
+    The bounding box follows the PASCAL VOC format, where:
+    - (x1, y1) represents the pixel coordinates of the top-left corner.
+    - (x2, y2) represents the pixel coordinates of the bottom-right corner.
     """
 
     title: str = Field(default="")
     coords: list[int] = Field(default=[])
 
     @staticmethod
-    def
-
-
-
-
-
-
-
+    def from_albumentations(
+        coords: Sequence[float],
+        img_size: Sequence[int],
+        title: str = "",
+    ) -> "BBox":
+        """
+        Create a bounding box from Albumentations format.
+
+        Albumentations represents bounding boxes as `[x_min, y_min, x_max, y_max]`
+        with normalized coordinates (values between 0 and 1) relative to the image size.
+
+        Args:
+            coords (Sequence[float]): The bounding box coordinates in
+                Albumentations format.
+            img_size (Sequence[int]): The reference image size as `[width, height]`.
+            title (str, optional): The title or label of the bounding box.
+                Defaults to an empty string.
+
+        Returns:
+            BBox: The bounding box data model.
+        """
+        validate_bbox(coords, float)
+        bbox_coords = convert_bbox(coords, img_size, "albumentations", "voc")
+        return BBox(title=title, coords=list(map(round, bbox_coords)))
+
+    def to_albumentations(self, img_size: Sequence[int]) -> list[float]:
+        """
+        Convert the bounding box coordinates to Albumentations format.
+
+        Albumentations represents bounding boxes as `[x_min, y_min, x_max, y_max]`
+        with normalized coordinates (values between 0 and 1) relative to the image size.
+
+        Args:
+            img_size (Sequence[int]): The reference image size as `[width, height]`.
+
+        Returns:
+            list[float]: The bounding box coordinates in Albumentations format.
+        """
+        return convert_bbox(self.coords, img_size, "voc", "albumentations")
+
+    @staticmethod
+    def from_coco(
+        coords: Sequence[float],
+        title: str = "",
+    ) -> "BBox":
+        """
+        Create a bounding box from COCO format.
+
+        COCO format represents bounding boxes as [x_min, y_min, width, height], where:
+        - (x_min, y_min) are the pixel coordinates of the top-left corner.
+        - width and height define the size of the bounding box in pixels.
+
+        Args:
+            coords (Sequence[float]): The bounding box coordinates in COCO format.
+            title (str): The title of the bounding box.
+
+        Returns:
+            BBox: The bounding box data model.
+        """
+        validate_bbox(coords, float, int)
+        bbox_coords = convert_bbox(coords, [], "coco", "voc")
+        return BBox(title=title, coords=list(map(round, bbox_coords)))
+
+    def to_coco(self) -> list[int]:
+        """
+        Return the bounding box coordinates in COCO format.
+
+        COCO format represents bounding boxes as [x_min, y_min, width, height], where:
+        - (x_min, y_min) are the pixel coordinates of the top-left corner.
+        - width and height define the size of the bounding box in pixels.
+
+        Returns:
+            list[int]: The bounding box coordinates in COCO format.
+        """
+        res = convert_bbox(self.coords, [], "voc", "coco")
+        return list(map(round, res))
+
+    @staticmethod
+    def from_voc(
+        coords: Sequence[float],
+        title: str = "",
+    ) -> "BBox":
+        """
+        Create a bounding box from PASCAL VOC format.
+
+        PASCAL VOC format represents bounding boxes as [x_min, y_min, x_max, y_max],
+        where:
+        - (x_min, y_min) are the pixel coordinates of the top-left corner.
+        - (x_max, y_max) are the pixel coordinates of the bottom-right corner.
+
+        Args:
+            coords (Sequence[float]): The bounding box coordinates in VOC format.
+            title (str): The title of the bounding box.
+
+        Returns:
+            BBox: The bounding box data model.
+        """
+        validate_bbox(coords, float, int)
+        return BBox(title=title, coords=list(map(round, coords)))
+
+    def to_voc(self) -> list[int]:
+        """
+        Return the bounding box coordinates in PASCAL VOC format.
+
+        PASCAL VOC format represents bounding boxes as [x_min, y_min, x_max, y_max],
+        where:
+        - (x_min, y_min) are the pixel coordinates of the top-left corner.
+        - (x_max, y_max) are the pixel coordinates of the bottom-right corner.
+
+        Returns:
+            list[int]: The bounding box coordinates in VOC format.
+        """
+        return self.coords
+
+    @staticmethod
+    def from_yolo(
+        coords: Sequence[float],
+        img_size: Sequence[int],
+        title: str = "",
+    ) -> "BBox":
+        """
+        Create a bounding box from YOLO format.
+
+        YOLO format represents bounding boxes as [x_center, y_center, width, height],
+        where:
+        - (x_center, y_center) are the normalized coordinates of the box center.
+        - width and height normalized values define the size of the bounding box.
+
+        Args:
+            coords (Sequence[float]): The bounding box coordinates in YOLO format.
+            img_size (Sequence[int]): The reference image size as `[width, height]`.
+            title (str): The title of the bounding box.
+
+        Returns:
+            BBox: The bounding box data model.
+        """
+        validate_bbox(coords, float)
+        bbox_coords = convert_bbox(coords, img_size, "yolo", "voc")
+        return BBox(title=title, coords=list(map(round, bbox_coords)))
+
+    def to_yolo(self, img_size: Sequence[int]) -> list[float]:
+        """
+        Return the bounding box coordinates in YOLO format.
+
+        YOLO format represents bounding boxes as [x_center, y_center, width, height],
+        where:
+        - (x_center, y_center) are the normalized coordinates of the box center.
+        - width and height normalized values define the size of the bounding box.
+
+        Args:
+            img_size (Sequence[int]): The reference image size as `[width, height]`.
+
+        Returns:
+            list[float]: The bounding box coordinates in YOLO format.
+        """
+        return convert_bbox(self.coords, img_size, "voc", "yolo")
+
+    def point_inside(self, x: int, y: int) -> bool:
+        """
+        Return True if the point is inside the bounding box.
+
+        Assumes that if the point is on the edge of the bounding box,
+        it is considered inside.
+        """
+        x1, y1, x2, y2 = self.coords
+        return x1 <= x <= x2 and y1 <= y <= y2
+
+    def pose_inside(self, pose: Union["Pose", "Pose3D"]) -> bool:
+        """Return True if the pose is inside the bounding box."""
+        return all(
+            self.point_inside(x, y) for x, y in zip(pose.x, pose.y) if x > 0 or y > 0
         )
 
+    @staticmethod
+    def from_list(coords: Sequence[float], title: str = "") -> "BBox":
+        return BBox.from_voc(coords, title=title)
+
     @staticmethod
     def from_dict(coords: dict[str, float], title: str = "") -> "BBox":
-        assert set(coords) == {
-            "x1",
-            "y1",
-            "x2",
-            "y2",
-        }, "Bounding box must be a dictionary with keys 'x1', 'y1', 'x2' and 'y2'."
-        return BBox.from_list(
-            [coords["x1"], coords["y1"], coords["x2"], coords["y2"]],
-            title=title,
-        )
+        keys = ("x1", "y1", "x2", "y2")
+        if not isinstance(coords, dict) or set(coords) != set(keys):
+            raise ValueError("Bounding box must be a dictionary with coordinates.")
+        return BBox.from_voc([coords[k] for k in keys], title=title)
 
 
 class OBBox(DataModel):
@@ -63,40 +232,22 @@ class OBBox(DataModel):
     coords: list[int] = Field(default=[])
 
     @staticmethod
-    def from_list(coords:
-
-        "Oriented bounding box must be a list of
-        )
-
-
-
-
-
-
-        )
+    def from_list(coords: Sequence[float], title: str = "") -> "OBBox":
+        if not isinstance(coords, (list, tuple)):
+            raise TypeError("Oriented bounding box must be a list of coordinates.")
+        if len(coords) != 8:
+            raise ValueError("Oriented bounding box must have 8 coordinates.")
+        if not all(isinstance(value, (int, float)) for value in coords):
+            raise ValueError(
+                "Oriented bounding box coordinates must be floats or integers."
+            )
+        return OBBox(title=title, coords=list(map(round, coords)))
 
     @staticmethod
     def from_dict(coords: dict[str, float], title: str = "") -> "OBBox":
-        assert set(coords) == {
-            "x1",
-            "y1",
-            "x2",
-            "y2",
-            "x3",
-            "y3",
-            "x4",
-            "y4",
-        }, "Oriented bounding box must be a dictionary with coordinates."
-        return OBBox.from_list(
-            [
-                coords["x1"],
-                coords["y1"],
-                coords["x2"],
-                coords["y2"],
-                coords["x3"],
-                coords["y3"],
-                coords["x4"],
-                coords["y4"],
-            ],
-            title=title,
-        )
+        keys = ("x1", "y1", "x2", "y2", "x3", "y3", "x4", "y4")
+        if not isinstance(coords, dict) or set(coords) != set(keys):
+            raise ValueError(
+                "Oriented bounding box must be a dictionary with coordinates."
+            )
+        return OBBox.from_list([coords[k] for k in keys], title=title)