PyPI - datachain - Versions diffs - 0.8.12__py3-none-any.whl → 0.9.0__py3-none-any.whl - Mend

datachain 0.8.12py3-none-any.whl → 0.9.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of datachain might be problematic. Click here for more details.

Files changed (34) hide show

datachain/__init__.py +10 -0
datachain/catalog/catalog.py +32 -9
datachain/cli/__init__.py +2 -0
datachain/cli/commands/datasets.py +78 -12
datachain/cli/parser/__init__.py +62 -12
datachain/cli/parser/job.py +14 -4
datachain/cli/parser/studio.py +8 -0
datachain/cli/parser/utils.py +20 -1
datachain/dataset.py +7 -4
datachain/diff/__init__.py +78 -128
datachain/fs/reference.py +21 -0
datachain/func/__init__.py +5 -2
datachain/func/array.py +39 -1
datachain/func/conditional.py +66 -2
datachain/job.py +1 -1
datachain/lib/arrow.py +1 -11
datachain/lib/dc.py +2 -0
datachain/lib/file.py +292 -5
datachain/lib/hf.py +1 -1
datachain/lib/video.py +223 -0
datachain/query/dataset.py +28 -3
datachain/remote/studio.py +13 -6
datachain/sql/functions/array.py +13 -1
datachain/sql/sqlite/base.py +17 -1
datachain/sql/sqlite/types.py +5 -0
datachain/studio.py +34 -12
datachain/utils.py +12 -2
{datachain-0.8.12.dist-info → datachain-0.9.0.dist-info}/METADATA +13 -5
{datachain-0.8.12.dist-info → datachain-0.9.0.dist-info}/RECORD +34 -32
/datachain/{lib/vfile.py → fs/__init__.py} +0 -0
{datachain-0.8.12.dist-info → datachain-0.9.0.dist-info}/LICENSE +0 -0
{datachain-0.8.12.dist-info → datachain-0.9.0.dist-info}/WHEEL +0 -0
{datachain-0.8.12.dist-info → datachain-0.9.0.dist-info}/entry_points.txt +0 -0
{datachain-0.8.12.dist-info → datachain-0.9.0.dist-info}/top_level.txt +0 -0

datachain/lib/file.py CHANGED Viewed

@@ -17,7 +17,7 @@ from urllib.parse import unquote, urlparse
 from urllib.request import url2pathname
 from fsspec.callbacks import DEFAULT_CALLBACK, Callback
-from PIL import Image
+from PIL import Image as PilImage
 from pydantic import Field, field_validator
 from datachain.client.fileslice import FileSlice
@@ -27,6 +27,7 @@ from datachain.sql.types import JSON, Boolean, DateTime, Int, String
 from datachain.utils import TIME_ZERO
 if TYPE_CHECKING:
+    from numpy import ndarray
     from typing_extensions import Self
     from datachain.catalog import Catalog
@@ -40,7 +41,7 @@ logger = logging.getLogger("datachain")
 # how to create file path when exporting
 ExportPlacement = Literal["filename", "etag", "fullpath", "checksum"]
-FileType = Literal["binary", "text", "image"]
+FileType = Literal["binary", "text", "image", "video"]
 class VFileError(DataChainError):
@@ -121,7 +122,21 @@ class VFileRegistry:
 class File(DataModel):
-    """`DataModel` for reading binary files."""
+    """
+    `DataModel` for reading binary files.
+    Attributes:
+        source (str): The source of the file (e.g., 's3://bucket-name/').
+        path (str): The path to the file (e.g., 'path/to/file.txt').
+        size (int): The size of the file in bytes. Defaults to 0.
+        version (str): The version of the file. Defaults to an empty string.
+        etag (str): The ETag of the file. Defaults to an empty string.
+        is_latest (bool): Whether the file is the latest version. Defaults to `True`.
+        last_modified (datetime): The last modified timestamp of the file.
+            Defaults to Unix epoch (`1970-01-01T00:00:00`).
+        location (dict | list[dict], optional): The location of the file.
+            Defaults to `None`.
+    """
     source: str = Field(default="")
     path: str
@@ -193,7 +208,7 @@ class File(DataModel):
     @classmethod
     def upload(
         cls, data: bytes, path: str, catalog: Optional["Catalog"] = None
-    ) -> "File":
+    ) -> "Self":
         if catalog is None:
             from datachain.catalog.loader import get_catalog
@@ -203,6 +218,8 @@ class File(DataModel):
         client = catalog.get_client(parent)
         file = client.upload(data, name)
+        if not isinstance(file, cls):
+            file = cls(**file.model_dump())
         file._set_stream(catalog)
         return file
@@ -486,13 +503,281 @@ class ImageFile(File):
     def read(self):
         """Returns `PIL.Image.Image` object."""
         fobj = super().read()
-        return Image.open(BytesIO(fobj))
+        return PilImage.open(BytesIO(fobj))
     def save(self, destination: str):
         """Writes its content to destination, via the object returned by `read()`."""
         self.read().save(destination)
+class Image(DataModel):
+    """
+    A data model representing metadata for an image file.
+    Only descriptive metadata is stored here; the model has no field that
+    references the image file itself or its pixel data.
+    Attributes:
+        width (int): The width of the image in pixels. Defaults to -1 if unknown.
+        height (int): The height of the image in pixels. Defaults to -1 if unknown.
+        format (str): The format of the image file (e.g., 'jpg', 'png').
+                      Defaults to an empty string.
+    """
+    # -1 and "" are the defaults; per the docstring above they mean "unknown".
+    width: int = Field(default=-1)
+    height: int = Field(default=-1)
+    format: str = Field(default="")
+class VideoFile(File):
+    """
+    A data model for handling video files.
+    This model inherits from the `File` model and provides additional functionality
+    for reading video files, extracting video frames, and splitting videos into
+    fragments.
+    The helpers from `.video` are imported inside each method: that module imports
+    from this one, and it raises `ImportError` when the optional video
+    dependencies are missing.
+    """
+    def get_info(self) -> "Video":
+        """
+        Retrieves metadata and information about the video file.
+        Returns:
+            Video: A Model containing video metadata such as duration,
+                   resolution, frame rate, and codec details.
+        """
+        from .video import video_info
+        return video_info(self)
+    def get_frame(self, frame: int) -> "VideoFrame":
+        """
+        Returns a specific video frame by its frame number.
+        Only a lightweight reference is created; the frame is not decoded here.
+        Args:
+            frame (int): The frame number to read.
+        Returns:
+            VideoFrame: Video frame model.
+        Raises:
+            ValueError: If `frame` is negative.
+        """
+        if frame < 0:
+            raise ValueError("frame must be a non-negative integer")
+        return VideoFrame(video=self, frame=frame)
+    def get_frames(
+        self,
+        start: int = 0,
+        end: Optional[int] = None,
+        step: int = 1,
+    ) -> "Iterator[VideoFrame]":
+        """
+        Returns video frames from the specified range in the video.
+        Args:
+            start (int): The starting frame number (default: 0).
+            end (int, optional): The ending frame number (exclusive). If None,
+                                 frames are read until the end of the video
+                                 (default: None).
+            step (int): The interval between frames to read (default: 1).
+        Returns:
+            Iterator[VideoFrame]: An iterator yielding video frames.
+        Raises:
+            ValueError: If the range is invalid (see `validate_frame_range`).
+        Note:
+            If end is not specified, number of frames will be taken from the video file,
+            this means video file needs to be downloaded.
+            This is a generator: the arguments are validated when iteration
+            starts, not when the method is called.
+        """
+        from .video import validate_frame_range
+        start, end, step = validate_frame_range(self, start, end, step)
+        for frame in range(start, end, step):
+            yield self.get_frame(frame)
+    def get_fragment(self, start: float, end: float) -> "VideoFragment":
+        """
+        Returns a video fragment from the specified time range.
+        Args:
+            start (float): The start time of the fragment in seconds.
+            end (float): The end time of the fragment in seconds.
+        Returns:
+            VideoFragment: A Model representing the video fragment.
+        Raises:
+            ValueError: If either bound is negative or `start >= end`.
+        """
+        if start < 0 or end < 0 or start >= end:
+            raise ValueError(f"Invalid time range: ({start:.3f}, {end:.3f})")
+        return VideoFragment(video=self, start=start, end=end)
+    def get_fragments(
+        self,
+        duration: float,
+        start: float = 0,
+        end: Optional[float] = None,
+    ) -> "Iterator[VideoFragment]":
+        """
+        Splits the video into multiple fragments of a specified duration.
+        The last fragment is shortened so that it never extends past `end`.
+        Args:
+            duration (float): The duration of each video fragment in seconds.
+            start (float): The starting time in seconds (default: 0).
+            end (float, optional): The ending time in seconds. If None, the entire
+                                   remaining video is processed (default: None).
+        Returns:
+            Iterator[VideoFragment]: An iterator yielding video fragments.
+        Raises:
+            ValueError: If `duration` is not positive, `start` or `end` is
+                        negative, or `start >= end`.
+        Note:
+            If end is not specified, the duration is taken from the video file
+            metadata, this means video file needs to be downloaded.
+            This is a generator: the arguments are validated when iteration
+            starts, not when the method is called.
+        """
+        if duration <= 0:
+            raise ValueError("duration must be a positive float")
+        if start < 0:
+            raise ValueError("start must be a non-negative float")
+        if end is None:
+            end = self.get_info().duration
+        if end < 0:
+            raise ValueError("end must be a non-negative float")
+        if start >= end:
+            raise ValueError("start must be less than end")
+        # Compute each boundary as `start + index * duration` rather than adding
+        # `duration` repeatedly: accumulated floating-point error could leave a
+        # tiny extra fragment at the end (ten additions of 0.1 give
+        # 0.9999999999999999, which is still below 1.0).
+        index = 0
+        fragment_start = start
+        while fragment_start < end:
+            next_start = start + (index + 1) * duration
+            yield self.get_fragment(fragment_start, min(next_start, end))
+            fragment_start = next_start
+            index += 1
+class VideoFrame(DataModel):
+    """
+    A data model for representing a video frame.
+    This model does not inherit from `VideoFile`; it references one through the
+    `video` attribute and adds a `frame` attribute, which identifies a specific
+    frame within that video file. It allows access to individual frames and
+    provides functionality for reading and saving video frames as image files.
+    Attributes:
+        video (VideoFile): The video file containing the video frame.
+        frame (int): The frame number referencing a specific frame in the video file.
+    """
+    video: VideoFile
+    frame: int
+    def get_np(self) -> "ndarray":
+        """
+        Returns a video frame from the video file as a NumPy array.
+        Returns:
+            ndarray: A NumPy array representing the video frame,
+                     in the shape (height, width, channels).
+        """
+        from .video import video_frame_np
+        return video_frame_np(self.video, self.frame)
+    def read_bytes(self, format: str = "jpg") -> bytes:
+        """
+        Returns a video frame from the video file as image bytes.
+        Args:
+            format (str): The desired image format (e.g., 'jpg', 'png').
+                          Defaults to 'jpg'.
+        Returns:
+            bytes: The encoded video frame as image bytes.
+        """
+        from .video import video_frame_bytes
+        return video_frame_bytes(self.video, self.frame, format=format)
+    def save(self, output: str, format: str = "jpg") -> "ImageFile":
+        """
+        Saves the current video frame as an image file.
+        If `output` is a remote path, the image file will be uploaded to remote storage.
+        Args:
+            output (str): The destination location, which can be a local path
+                          or a remote URL. The image file name is generated by
+                          `save_video_frame` and appended to it.
+            format (str): The image format (e.g., 'jpg', 'png'). Defaults to 'jpg'.
+        Returns:
+            ImageFile: A Model representing the saved image file.
+        """
+        from .video import save_video_frame
+        return save_video_frame(self.video, self.frame, output, format=format)
+class VideoFragment(DataModel):
+    """
+    A data model for representing a video fragment.
+    This model does not inherit from `VideoFile`; it references one through the
+    `video` attribute and adds `start` and `end` attributes, which delimit a
+    specific fragment within that video file. It provides functionality for
+    saving the fragment as a separate video file.
+    Attributes:
+        video (VideoFile): The video file containing the video fragment.
+        start (float): The starting time of the video fragment in seconds.
+        end (float): The ending time of the video fragment in seconds.
+    """
+    video: VideoFile
+    start: float
+    end: float
+    def save(self, output: str, format: Optional[str] = None) -> "VideoFile":
+        """
+        Saves the video fragment as a new video file.
+        If `output` is a remote path, the video file will be uploaded to remote storage.
+        Args:
+            output (str): The destination location, which can be a local path
+                          or a remote URL. The video file name is generated by
+                          `save_video_fragment` and appended to it.
+            format (str, optional): The output video format (e.g., 'mp4', 'avi').
+                                    If None, the extension of the source video
+                                    file is used.
+        Returns:
+            VideoFile: A Model representing the saved video file.
+        """
+        from .video import save_video_fragment
+        return save_video_fragment(
+            self.video, self.start, self.end, output, format=format
+        )
+class Video(DataModel):
+    """
+    A data model representing metadata for a video file.
+    Only descriptive metadata is stored here; the model has no field that
+    references the video file itself.
+    Attributes:
+        width (int): The width of the video in pixels. Defaults to -1 if unknown.
+        height (int): The height of the video in pixels. Defaults to -1 if unknown.
+        fps (float): The frame rate of the video (frames per second).
+                     Defaults to -1.0 if unknown.
+        duration (float): The total duration of the video in seconds.
+                          Defaults to -1.0 if unknown.
+        frames (int): The total number of frames in the video.
+                      Defaults to -1 if unknown.
+        format (str): The format of the video file (e.g., 'mp4', 'avi').
+                      Defaults to an empty string.
+        codec (str): The codec used for encoding the video. Defaults to an empty string.
+    """
+    # Negative numbers and empty strings are the defaults; per the docstring
+    # above they mean "unknown".
+    width: int = Field(default=-1)
+    height: int = Field(default=-1)
+    fps: float = Field(default=-1.0)
+    duration: float = Field(default=-1.0)
+    frames: int = Field(default=-1)
+    format: str = Field(default="")
+    codec: str = Field(default="")
 class ArrowRow(DataModel):
     """`DataModel` for reading row from Arrow-supported file."""
@@ -528,5 +813,7 @@ def get_file_type(type_: FileType = "binary") -> type[File]:
         file = TextFile
     elif type_ == "image":
         file = ImageFile  # type: ignore[assignment]
+    elif type_ == "video":
+        file = VideoFile
     return file

datachain/lib/hf.py CHANGED Viewed

@@ -20,7 +20,7 @@ try:
 except ImportError as exc:
     raise ImportError(
-        "Missing dependencies for huggingface datasets:\n"
+        "Missing dependencies for huggingface datasets.\n"
         "To install run:\n\n"
         "  pip install 'datachain[hf]'\n"
     ) from exc

datachain/lib/video.py ADDED Viewed

@@ -0,0 +1,223 @@
+import posixpath
+import shutil
+import tempfile
+from typing import Optional
+from numpy import ndarray
+from datachain.lib.file import FileError, ImageFile, Video, VideoFile
+try:
+    import ffmpeg
+    import imageio.v3 as iio
+except ImportError as exc:
+    raise ImportError(
+        "Missing dependencies for processing video.\n"
+        "To install run:\n\n"
+        "  pip install 'datachain[video]'\n"
+    ) from exc
+def video_info(file: VideoFile) -> Video:
+    """
+    Returns video file information.
+    The file is probed with `ffmpeg.probe` on its local path, so it is cached
+    (downloaded) first when no local copy is available yet.
+    Args:
+        file (VideoFile): Video file object.
+    Returns:
+        Video: Video file information, taken from the first video stream and
+               the container format section of the probe result.
+    Raises:
+        FileError: If the file cannot be downloaded, cannot be probed, or the
+                   probe result has no streams, no format section, or no video
+                   stream.
+    """
+    if not (file_path := file.get_local_path()):
+        file.ensure_cached()
+        file_path = file.get_local_path()
+        if not file_path:
+            raise FileError(file, "unable to download video file")
+    try:
+        probe = ffmpeg.probe(file_path)
+    except Exception as exc:
+        raise FileError(file, "unable to extract metadata from video file") from exc
+    all_streams = probe.get("streams")
+    video_format = probe.get("format")
+    if not all_streams or not video_format:
+        raise FileError(file, "unable to extract metadata from video file")
+    # `.get` so that a stream entry without `codec_type` is skipped instead of
+    # aborting the whole lookup with a KeyError.
+    video_stream = next(
+        (s for s in all_streams if s.get("codec_type") == "video"), None
+    )
+    if video_stream is None:
+        raise FileError(file, "unable to extract metadata from video file")
+    r_frame_rate = video_stream.get("r_frame_rate", "0")
+    if "/" in r_frame_rate:
+        num, denom = r_frame_rate.split("/")
+        # A zero denominator (e.g. "0/0") means the rate is unknown; report 0.0,
+        # the same value used when the field is missing, rather than letting a
+        # ZeroDivisionError escape.
+        fps = float(num) / float(denom) if float(denom) else 0.0
+    else:
+        fps = float(r_frame_rate)
+    width = int(video_stream.get("width", 0))
+    height = int(video_stream.get("height", 0))
+    duration = float(video_format.get("duration", 0))
+    if "nb_frames" in video_stream:
+        frames = int(video_stream["nb_frames"])
+    else:
+        # No explicit frame count: estimate it from the playable duration.
+        start_time = float(video_format.get("start_time", 0))
+        frames = int((duration - start_time) * fps)
+    format_name = video_format.get("format_name", "")
+    codec_name = video_stream.get("codec_name", "")
+    return Video(
+        width=width,
+        height=height,
+        fps=fps,
+        duration=duration,
+        frames=frames,
+        format=format_name,
+        codec=codec_name,
+    )
+def video_frame_np(video: VideoFile, frame: int) -> ndarray:
+    """
+    Decodes a single frame of a video file into a numpy array.
+    Args:
+        video (VideoFile): Video file object.
+        frame (int): Frame index.
+    Returns:
+        ndarray: Video frame.
+    Raises:
+        ValueError: If `frame` is negative.
+    """
+    if frame < 0:
+        raise ValueError("frame must be a non-negative integer")
+    # The frame is read from the opened file object with imageio's "pyav" plugin.
+    with video.open() as stream:
+        image = iio.imread(stream, index=frame, plugin="pyav")  # type: ignore[arg-type]
+    return image
+def validate_frame_range(
+    video: VideoFile,
+    start: int = 0,
+    end: Optional[int] = None,
+    step: int = 1,
+) -> tuple[int, int, int]:
+    """
+    Checks a frame range and fills in the end frame when it is not given.
+    The video is only inspected (through `video_info`) when `end` is None, and
+    only after `start` and `step` have passed their own checks.
+    Args:
+        video (VideoFile): Video file object.
+        start (int): Start frame index (default: 0).
+        end (int, optional): End frame index (default: None). If None, the
+                             frame count reported by `video_info` is used.
+        step (int): Step between frames (default: 1).
+    Returns:
+        tuple[int, int, int]: Start frame index, end frame index, and step.
+    Raises:
+        ValueError: If `start` or the resolved end is negative, `step` is
+                    less than 1, or `start` is greater than the end.
+    """
+    if start < 0:
+        raise ValueError("start_frame must be a non-negative integer.")
+    if step < 1:
+        raise ValueError("step must be a positive integer.")
+    last_frame = video_info(video).frames if end is None else end
+    if last_frame < 0:
+        raise ValueError("end_frame must be a non-negative integer.")
+    # start == last_frame is accepted; it describes an empty range.
+    if start > last_frame:
+        raise ValueError("start_frame must be less than or equal to end_frame.")
+    return start, last_frame, step
+def video_frame_bytes(video: VideoFile, frame: int, format: str = "jpg") -> bytes:
+    """
+    Encodes a single video frame as an image and returns the encoded bytes.
+    Args:
+        video (VideoFile): Video file object.
+        frame (int): Frame index.
+        format (str): Image format (default: 'jpg'). It is passed to imageio
+                      as the file extension `.<format>`.
+    Returns:
+        bytes: Video frame image as bytes.
+    """
+    # "<bytes>" asks imageio to return the encoded image instead of writing a file.
+    return iio.imwrite(
+        "<bytes>", video_frame_np(video, frame), extension=f".{format}"
+    )
+def save_video_frame(
+    video: VideoFile,
+    frame: int,
+    output: str,
+    format: str = "jpg",
+) -> ImageFile:
+    """
+    Encodes one video frame as an image and uploads it as a new image file.
+    If output is a remote path, the image file will be uploaded to the remote
+    storage.
+    Args:
+        video (VideoFile): Video file object.
+        frame (int): Frame index.
+        output (str): Output location, can be a local path or a remote path.
+                      The file name `<video stem>_<frame, 4 digits>.<format>`
+                      is appended to it.
+        format (str): Image format (default: 'jpg').
+    Returns:
+        ImageFile: Image file model.
+    """
+    image_bytes = video_frame_bytes(video, frame, format=format)
+    file_name = f"{video.get_file_stem()}_{frame:04d}.{format}"
+    return ImageFile.upload(image_bytes, posixpath.join(output, file_name))
+def save_video_fragment(
+    video: VideoFile,
+    start: float,
+    end: float,
+    output: str,
+    format: Optional[str] = None,
+) -> VideoFile:
+    """
+    Saves video interval as a new video file. If output is a remote path,
+    the video file will be uploaded to the remote storage.
+    The fragment is first written by ffmpeg to a temporary local directory,
+    then read back and uploaded; the temporary directory is always removed.
+    Args:
+        video (VideoFile): Video file object.
+        start (float): Start time in seconds.
+        end (float): End time in seconds.
+        output (str): Output location, can be a local path or a remote path.
+                      The file name `<video stem>_<start ms>_<end ms>.<format>`
+                      is appended to it.
+        format (str, optional): Output format (default: None). If not provided,
+                                the extension of the source video file is used.
+    Returns:
+        VideoFile: Video fragment model.
+    Raises:
+        ValueError: If either bound is negative or `start >= end`.
+        FileError: If the source video cannot be made available locally.
+    """
+    if start < 0 or end < 0 or start >= end:
+        raise ValueError(f"Invalid time range: ({start:.3f}, {end:.3f})")
+    if format is None:
+        format = video.get_file_ext()
+    # ffmpeg reads its input from a local path, so make sure the video is
+    # cached first (same procedure as `video_info`) instead of handing ffmpeg
+    # an empty path when the file has not been downloaded yet.
+    if not (input_path := video.get_local_path()):
+        video.ensure_cached()
+        input_path = video.get_local_path()
+        if not input_path:
+            raise FileError(video, "unable to download video file")
+    start_ms = int(start * 1000)
+    end_ms = int(end * 1000)
+    output_file = posixpath.join(
+        output, f"{video.get_file_stem()}_{start_ms:06d}_{end_ms:06d}.{format}"
+    )
+    temp_dir = tempfile.mkdtemp()
+    try:
+        output_file_tmp = posixpath.join(temp_dir, posixpath.basename(output_file))
+        ffmpeg.input(
+            input_path,
+            ss=start,
+            to=end,
+        ).output(output_file_tmp).run(quiet=True)
+        with open(output_file_tmp, "rb") as f:
+            return VideoFile.upload(f.read(), output_file)
+    finally:
+        shutil.rmtree(temp_dir)

datachain/query/dataset.py CHANGED Viewed

@@ -42,13 +42,17 @@ from datachain.data_storage.schema import (
     partition_col_names,
     partition_columns,
 )
-from datachain.dataset import DatasetStatus, RowDict
-from datachain.error import DatasetNotFoundError, QueryScriptCancelError
+from datachain.dataset import DATASET_PREFIX, DatasetStatus, RowDict
+from datachain.error import (
+    DatasetNotFoundError,
+    QueryScriptCancelError,
+)
 from datachain.func.base import Function
 from datachain.lib.udf import UDFAdapter, _get_cache
 from datachain.progress import CombinedDownloadCallback, TqdmCombinedDownloadCallback
 from datachain.query.schema import C, UDFParamSpec, normalize_param
 from datachain.query.session import Session
+from datachain.remote.studio import is_token_set
 from datachain.sql.functions.random import rand
 from datachain.utils import (
     batched,
@@ -1081,6 +1085,7 @@ class DatasetQuery:
         session: Optional[Session] = None,
         indexing_column_types: Optional[dict[str, Any]] = None,
         in_memory: bool = False,
+        fallback_to_remote: bool = True,
     ) -> None:
         self.session = Session.get(session, catalog=catalog, in_memory=in_memory)
         self.catalog = catalog or self.session.catalog
@@ -1097,7 +1102,12 @@ class DatasetQuery:
         self.column_types: Optional[dict[str, Any]] = None
         self.name = name
-        ds = self.catalog.get_dataset(name)
+        if fallback_to_remote and is_token_set():
+            ds = self.catalog.get_dataset_with_remote_fallback(name, version)
+        else:
+            ds = self.catalog.get_dataset(name)
         self.version = version or ds.latest_version
         self.feature_schema = ds.get_version(self.version).feature_schema
         self.column_types = copy(ds.schema)
@@ -1112,6 +1122,21 @@ class DatasetQuery:
     def __or__(self, other):
         return self.union(other)
+    def pull_dataset(self, name: str, version: Optional[int] = None) -> "DatasetRecord":
+        """
+        Pulls a dataset into the local catalog and returns the local record.
+        A notice is printed, the dataset is pulled through
+        `self.catalog.pull_dataset` under the same name (and version, if given),
+        and the record is then looked up in the catalog by name.
+        Args:
+            name (str): Dataset name, used both remotely and locally.
+            version (int, optional): Dataset version. When falsy, no `@v<version>`
+                                     suffix is added to the remote URI.
+        Returns:
+            DatasetRecord: The dataset record returned by `self.catalog.get_dataset`.
+        """
+        print("Dataset not found in local catalog, trying to get from studio")
+        version_suffix = f"@v{version}" if version else ""
+        self.catalog.pull_dataset(
+            remote_ds_uri=f"{DATASET_PREFIX}{name}{version_suffix}",
+            local_ds_name=name,
+            local_ds_version=version,
+        )
+        return self.catalog.get_dataset(name)
     @staticmethod
     def get_table() -> "TableClause":
         table_name = "".join(

datachain 0.8.12__py3-none-any.whl → 0.9.0__py3-none-any.whl

Potentially problematic release.

datachain 0.8.12py3-none-any.whl → 0.9.0py3-none-any.whl