PyPI - datachain - Versions diffs - 0.25.1__py3-none-any.whl → 0.26.0__py3-none-any.whl - Mend

datachain-0.25.1-py3-none-any.whl → datachain-0.26.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of datachain might be problematic. Click here for more details.

Files changed (16)

datachain/__init__.py +6 -0
datachain/lib/audio.py +151 -0
datachain/lib/convert/sql_to_python.py +8 -0
datachain/lib/dc/datachain.py +125 -23
datachain/lib/dc/datasets.py +1 -1
datachain/lib/file.py +190 -1
datachain/lib/model_store.py +8 -0
datachain/lib/signal_schema.py +47 -7
datachain/lib/udf.py +17 -5
datachain/query/dataset.py +15 -9
{datachain-0.25.1.dist-info → datachain-0.26.0.dist-info}/METADATA +6 -2
{datachain-0.25.1.dist-info → datachain-0.26.0.dist-info}/RECORD +16 -15
{datachain-0.25.1.dist-info → datachain-0.26.0.dist-info}/WHEEL +0 -0
{datachain-0.25.1.dist-info → datachain-0.26.0.dist-info}/entry_points.txt +0 -0
{datachain-0.25.1.dist-info → datachain-0.26.0.dist-info}/licenses/LICENSE +0 -0
{datachain-0.25.1.dist-info → datachain-0.26.0.dist-info}/top_level.txt +0 -0

datachain/__init__.py CHANGED Viewed

@@ -21,6 +21,9 @@ from datachain.lib.dc import (
 )
 from datachain.lib.file import (
     ArrowRow,
+    Audio,
+    AudioFile,
+    AudioFragment,
     File,
     FileError,
     Image,
@@ -43,6 +46,9 @@ __all__ = [
     "AbstractUDF",
     "Aggregator",
     "ArrowRow",
+    "Audio",
+    "AudioFile",
+    "AudioFragment",
     "C",
     "Column",
     "DataChain",

datachain/lib/audio.py ADDED Viewed

@@ -0,0 +1,151 @@
+import posixpath
+from typing import TYPE_CHECKING, Optional, Union
+from datachain.lib.file import FileError
+if TYPE_CHECKING:
+    from numpy import ndarray
+    from datachain.lib.file import Audio, AudioFile, File
+try:
+    import torchaudio
+except ImportError as exc:
+    raise ImportError(
+        "Missing dependencies for processing audio.\n"
+        "To install run:\n\n"
+        "  pip install 'datachain[audio]'\n"
+    ) from exc
+def audio_info(file: "Union[File, AudioFile]") -> "Audio":
+    """Extract metadata like sample rate, channels, duration, and format."""
+    from datachain.lib.file import Audio
+    file = file.as_audio_file()
+    try:
+        with file.open() as f:
+            info = torchaudio.info(f)
+            sample_rate = int(info.sample_rate)
+            channels = int(info.num_channels)
+            frames = int(info.num_frames)
+            duration = float(frames / sample_rate) if sample_rate > 0 else 0.0
+            # Get format information
+            format_name = getattr(info, "format", "")
+            codec_name = getattr(info, "encoding", "")
+            bit_rate = getattr(info, "bits_per_sample", 0) * sample_rate * channels
+    except Exception as exc:
+        raise FileError(
+            "unable to extract metadata from audio file", file.source, file.path
+        ) from exc
+    return Audio(
+        sample_rate=sample_rate,
+        channels=channels,
+        duration=duration,
+        samples=frames,
+        format=format_name,
+        codec=codec_name,
+        bit_rate=bit_rate,
+    )
+def audio_fragment_np(
+    audio: "AudioFile", start: float = 0, duration: Optional[float] = None
+) -> "tuple[ndarray, int]":
+    """Load audio fragment as numpy array.
+    Multi-channel audio is transposed to (samples, channels)."""
+    if start < 0:
+        raise ValueError("start must be a non-negative float")
+    if duration is not None and duration <= 0:
+        raise ValueError("duration must be a positive float")
+    if hasattr(audio, "as_audio_file"):
+        audio = audio.as_audio_file()
+    try:
+        with audio.open() as f:
+            info = torchaudio.info(f)
+            sample_rate = info.sample_rate
+            frame_offset = int(start * sample_rate)
+            num_frames = int(duration * sample_rate) if duration is not None else -1
+            # Reset file pointer to the beginning
+            # This is important to ensure we read from the correct position later
+            f.seek(0)
+            waveform, sr = torchaudio.load(
+                f, frame_offset=frame_offset, num_frames=num_frames
+            )
+            audio_np = waveform.numpy()
+            if audio_np.shape[0] > 1:
+                audio_np = audio_np.T
+            else:
+                audio_np = audio_np.squeeze()
+            return audio_np, int(sr)
+    except Exception as exc:
+        raise FileError(
+            "unable to read audio fragment", audio.source, audio.path
+        ) from exc
+def audio_fragment_bytes(
+    audio: "AudioFile",
+    start: float = 0,
+    duration: Optional[float] = None,
+    format: str = "wav",
+) -> bytes:
+    """Convert audio fragment to bytes using soundfile."""
+    y, sr = audio_fragment_np(audio, start, duration)
+    import io
+    import soundfile as sf
+    buffer = io.BytesIO()
+    sf.write(buffer, y, sr, format=format)
+    return buffer.getvalue()
+def save_audio_fragment(
+    audio: "AudioFile",
+    start: float,
+    end: float,
+    output: str,
+    format: Optional[str] = None,
+) -> "AudioFile":
+    """Save audio fragment with timestamped filename.
+    Supports local and remote storage upload."""
+    if start < 0 or end < 0 or start >= end:
+        raise ValueError(f"Invalid time range: ({start:.3f}, {end:.3f})")
+    if format is None:
+        format = audio.get_file_ext()
+    duration = end - start
+    start_ms = int(start * 1000)
+    end_ms = int(end * 1000)
+    output_file = posixpath.join(
+        output, f"{audio.get_file_stem()}_{start_ms:06d}_{end_ms:06d}.{format}"
+    )
+    try:
+        audio_bytes = audio_fragment_bytes(audio, start, duration, format)
+        from datachain.lib.file import AudioFile
+        return AudioFile.upload(audio_bytes, output_file, catalog=audio._catalog)
+    except Exception as exc:
+        raise FileError(
+            "unable to save audio fragment", audio.source, audio.path
+        ) from exc

datachain/lib/convert/sql_to_python.py CHANGED Viewed

@@ -9,6 +9,14 @@ def sql_to_python(sql_exp: ColumnElement) -> Any:
         type_ = sql_exp.type.python_type
         if type_ == Decimal:
             type_ = float
+        elif type_ is list:
+            if hasattr(sql_exp.type, "item_type") and hasattr(
+                sql_exp.type.item_type, "python_type"
+            ):
+                item_type = getattr(sql_exp.type.item_type, "python_type", Any)
+                type_ = list[item_type]  # type: ignore[valid-type]
+            else:
+                type_ = list
     except NotImplementedError:
         type_ = str
     return type_

datachain/lib/dc/datachain.py CHANGED Viewed

@@ -15,6 +15,7 @@ from typing import (
     Optional,
     TypeVar,
     Union,
+    cast,
     overload,
 )
@@ -39,14 +40,15 @@ from datachain.lib.file import (
     FileExporter,
 )
 from datachain.lib.file import ExportPlacement as FileExportPlacement
+from datachain.lib.model_store import ModelStore
 from datachain.lib.settings import Settings
-from datachain.lib.signal_schema import SignalSchema
+from datachain.lib.signal_schema import SignalResolvingError, SignalSchema
 from datachain.lib.udf import Aggregator, BatchMapper, Generator, Mapper, UDFBase
 from datachain.lib.udf_signature import UdfSignature
 from datachain.lib.utils import DataChainColumnError, DataChainParamsError
 from datachain.query import Session
 from datachain.query.dataset import DatasetQuery, PartitionByType
-from datachain.query.schema import DEFAULT_DELIMITER, Column, ColumnMeta
+from datachain.query.schema import DEFAULT_DELIMITER, Column
 from datachain.sql.functions import path as pathfunc
 from datachain.utils import batched_it, inside_notebook, row_to_nested_dict
@@ -758,11 +760,12 @@ class DataChain:
     @delta_disabled
     def agg(
         self,
+        /,
         func: Optional[Callable] = None,
         partition_by: Optional[PartitionByType] = None,
         params: Union[None, str, Sequence[str]] = None,
         output: OutputType = None,
-        **signal_map,
+        **signal_map: Callable,
     ) -> "Self":
         """Aggregate rows using `partition_by` statement and apply a function to the
         groups of aggregated rows. The function needs to return new objects for each
@@ -772,12 +775,28 @@ class DataChain:
         This method bears similarity to `gen()` and `map()`, employing a comparable set
         of parameters, yet differs in two crucial aspects:
         1. The `partition_by` parameter: This specifies the column name or a list of
            column names that determine the grouping criteria for aggregation.
         2. Group-based UDF function input: Instead of individual rows, the function
-           receives a list all rows within each group defined by `partition_by`.
+           receives a list of all rows within each group defined by `partition_by`.
+        If `partition_by` is not set or is an empty list, all rows will be placed
+        into a single group.
+        Parameters:
+            func: Function applied to each group of rows.
+            partition_by: Column name(s) to group by. If None, all rows go
+                into one group.
+            params: List of column names used as input for the function. Default is
+                taken from function signature.
+            output: Dictionary defining new signals and their corresponding types.
+                Default type is taken from function signature.
+            **signal_map: kwargs can be used to define `func` together with its return
+                signal name in format of `agg(result_column=my_func)`.
         Examples:
+            Basic aggregation with lambda function:
             ```py
             chain = chain.agg(
                 total=lambda category, amount: [sum(amount)],
@@ -788,7 +807,6 @@ class DataChain:
             ```
             An alternative syntax, when you need to specify a more complex function:
             ```py
             # It automatically resolves which columns to pass to the function
             # by looking at the function signature.
@@ -806,10 +824,43 @@ class DataChain:
             )
             chain.save("new_dataset")
             ```
+            Using complex signals for partitioning (`File` or any Pydantic `BaseModel`):
+            ```py
+            def my_agg(files: list[File]) -> Iterator[tuple[File, int]]:
+                yield files[0], sum(f.size for f in files)
+            chain = chain.agg(
+                my_agg,
+                params=("file",),
+                output={"file": File, "total": int},
+                partition_by="file",  # Column referring to all sub-columns of File
+            )
+            chain.save("new_dataset")
+            ```
+            Aggregating all rows into a single group (when `partition_by` is not set):
+            ```py
+            chain = chain.agg(
+                total_size=lambda file, size: [sum(size)],
+                output=int,
+                # No partition_by specified - all rows go into one group
+            )
+            chain.save("new_dataset")
+            ```
+            Multiple partition columns:
+            ```py
+            chain = chain.agg(
+                total=lambda category, subcategory, amount: [sum(amount)],
+                output=float,
+                partition_by=["category", "subcategory"],
+            )
+            chain.save("new_dataset")
+            ```
         """
-        # Convert string partition_by parameters to Column objects
-        processed_partition_by = partition_by
         if partition_by is not None:
+            # Convert string partition_by parameters to Column objects
             if isinstance(partition_by, (str, Function, ColumnElement)):
                 list_partition_by = [partition_by]
             else:
@@ -818,10 +869,10 @@ class DataChain:
             processed_partition_columns: list[ColumnElement] = []
             for col in list_partition_by:
                 if isinstance(col, str):
-                    col_db_name = ColumnMeta.to_db_name(col)
-                    col_type = self.signals_schema.get_column_type(col_db_name)
-                    column = Column(col_db_name, python_to_sql(col_type))
-                    processed_partition_columns.append(column)
+                    columns = self.signals_schema.db_signals(name=col, as_columns=True)
+                    if not columns:
+                        raise SignalResolvingError([col], "is not found")
+                    processed_partition_columns.extend(cast("list[Column]", columns))
                 elif isinstance(col, Function):
                     column = col.get_column(self.signals_schema)
                     processed_partition_columns.append(column)
@@ -830,6 +881,8 @@ class DataChain:
                     processed_partition_columns.append(col)
             processed_partition_by = processed_partition_columns
+        else:
+            processed_partition_by = []
         udf_obj = self._udf_to_obj(Aggregator, func, params, output, signal_map)
         return self._evolve(
@@ -969,7 +1022,7 @@ class DataChain:
         )
     @delta_disabled  # type: ignore[arg-type]
-    def group_by(
+    def group_by(  # noqa: C901, PLR0912
         self,
         *,
         partition_by: Optional[Union[str, Func, Sequence[Union[str, Func]]]] = None,
@@ -988,6 +1041,15 @@ class DataChain:
                 partition_by=("file_source", "file_ext"),
             )
             ```
+            Using complex signals:
+            ```py
+            chain = chain.group_by(
+                total_size=func.sum("file.size"),
+                count=func.count(),
+                partition_by="file",  # Uses column name, expands to File's unique keys
+            )
+            ```
         """
         if partition_by is None:
             partition_by = []
@@ -998,20 +1060,61 @@ class DataChain:
         signal_columns: list[Column] = []
         schema_fields: dict[str, DataType] = {}
         keep_columns: list[str] = []
+        partial_fields: list[str] = []  # Track specific fields for partial creation
+        schema_partition_by: list[str] = []
-        # validate partition_by columns and add them to the schema
         for col in partition_by:
             if isinstance(col, str):
-                col_db_name = ColumnMeta.to_db_name(col)
-                col_type = self.signals_schema.get_column_type(col_db_name)
-                column = Column(col_db_name, python_to_sql(col_type))
-                if col not in keep_columns:
-                    keep_columns.append(col)
+                columns = self.signals_schema.db_signals(name=col, as_columns=True)
+                if not columns:
+                    raise SignalResolvingError([col], "is not found")
+                partition_by_columns.extend(cast("list[Column]", columns))
+                # For nested field references (e.g., "nested.level1.name"),
+                # we need to distinguish between:
+                # 1. References to fields within a complex signal (create partials)
+                # 2. Deep nested references that should be flattened
+                if "." in col:
+                    # Split the column reference to analyze it
+                    parts = col.split(".")
+                    parent_signal = parts[0]
+                    parent_type = self.signals_schema.values.get(parent_signal)
+                    if ModelStore.is_partial(parent_type):
+                        if parent_signal not in keep_columns:
+                            keep_columns.append(parent_signal)
+                        partial_fields.append(col)
+                        schema_partition_by.append(col)
+                    else:
+                        # BaseModel or other - add flattened columns directly
+                        for column in cast("list[Column]", columns):
+                            col_type = self.signals_schema.get_column_type(column.name)
+                            schema_fields[column.name] = col_type
+                        schema_partition_by.append(col)
+                else:
+                    # simple signal - but we need to check if it's a complex signal
+                    # complex signal - only include the columns used for partitioning
+                    col_type = self.signals_schema.get_column_type(
+                        col, with_subtree=True
+                    )
+                    if isinstance(col_type, type) and issubclass(col_type, BaseModel):
+                        # Complex signal - add only the partitioning columns
+                        for column in cast("list[Column]", columns):
+                            col_type = self.signals_schema.get_column_type(column.name)
+                            schema_fields[column.name] = col_type
+                        schema_partition_by.append(col)
+                    # Simple signal - keep the entire signal
+                    else:
+                        if col not in keep_columns:
+                            keep_columns.append(col)
+                        schema_partition_by.append(col)
             elif isinstance(col, Function):
                 column = col.get_column(self.signals_schema)
                 col_db_name = column.name
                 col_type = column.type.python_type
                 schema_fields[col_db_name] = col_type
+                partition_by_columns.append(column)
+                signal_columns.append(column)
             else:
                 raise DataChainColumnError(
                     col,
@@ -1020,9 +1123,7 @@ class DataChain:
                         " but expected str or Function"
                     ),
                 )
-            partition_by_columns.append(column)
-        # validate signal columns and add them to the schema
         if not kwargs:
             raise ValueError("At least one column should be provided for group_by")
         for col_name, func in kwargs.items():
@@ -1035,9 +1136,9 @@ class DataChain:
             signal_columns.append(column)
             schema_fields[col_name] = func.get_result_type(self.signals_schema)
-        signal_schema = SignalSchema(schema_fields)
-        if keep_columns:
-            signal_schema |= self.signals_schema.to_partial(*keep_columns)
+        signal_schema = self.signals_schema.group_by(
+            schema_partition_by, signal_columns
+        )
         return self._evolve(
             query=self._query.group_by(signal_columns, partition_by_columns),
@@ -1166,6 +1267,7 @@ class DataChain:
         db_signals = self._effective_signals_schema.db_signals(
             include_hidden=include_hidden
         )
         with self._query.ordered_select(*db_signals).as_iterable() as rows:
             if row_factory:
                 rows = (row_factory(db_signals, r) for r in rows)  # type: ignore[assignment]

datachain/lib/dc/datasets.py CHANGED Viewed

@@ -376,7 +376,7 @@ def move_dataset(
             the namespace and project, or a regular name. If a regular name is used,
             default values will be applied. The source dataset will no longer exist
             after the move.
-        dst: The destination dataset name. This can also be a fully qualified
+        dest: The destination dataset name. This can also be a fully qualified
             name with a namespace and project, or just a regular name (default values
             will be used in that case). The original dataset will be moved here.
         session: An optional session instance. If not provided, the default session

datachain/lib/file.py CHANGED Viewed

@@ -43,7 +43,7 @@ logger = logging.getLogger("datachain")
 # how to create file path when exporting
 ExportPlacement = Literal["filename", "etag", "fullpath", "checksum"]
-FileType = Literal["binary", "text", "image", "video"]
+FileType = Literal["binary", "text", "image", "video", "audio"]
 EXPORT_FILES_MAX_THREADS = 5
@@ -312,6 +312,14 @@ class File(DataModel):
         file._set_stream(self._catalog, caching_enabled=self._caching_enabled)
         return file
+    def as_audio_file(self) -> "AudioFile":
+        """Convert the file to a `AudioFile` object."""
+        if isinstance(self, AudioFile):
+            return self
+        file = AudioFile(**self.model_dump())
+        file._set_stream(self._catalog, caching_enabled=self._caching_enabled)
+        return file
     @classmethod
     def upload(
         cls, data: bytes, path: str, catalog: Optional["Catalog"] = None
@@ -851,6 +859,157 @@ class VideoFile(File):
             start += duration
+class AudioFile(File):
+    """
+    A data model for handling audio files.
+    This model inherits from the `File` model and provides additional functionality
+    for reading audio files, extracting audio fragments, and splitting audio into
+    fragments.
+    """
+    def get_info(self) -> "Audio":
+        """
+        Retrieves metadata and information about the audio file. It does not
+        download the file if possible, only reads its header. It is thus might be
+        a good idea to disable caching and prefetching for UDF if you only need
+        audio metadata.
+        Returns:
+            Audio: A Model containing audio metadata such as duration,
+                   sample rate, channels, and codec details.
+        """
+        from .audio import audio_info
+        return audio_info(self)
+    def get_fragment(self, start: float, end: float) -> "AudioFragment":
+        """
+        Returns an audio fragment from the specified time range. It does not
+        download the file, neither it actually extracts the fragment. It returns
+        a Model representing the audio fragment, which can be used to read or save
+        it later.
+        Args:
+            start (float): The start time of the fragment in seconds.
+            end (float): The end time of the fragment in seconds.
+        Returns:
+            AudioFragment: A Model representing the audio fragment.
+        """
+        if start < 0 or end < 0 or start >= end:
+            raise ValueError(f"Invalid time range: ({start:.3f}, {end:.3f})")
+        return AudioFragment(audio=self, start=start, end=end)
+    def get_fragments(
+        self,
+        duration: float,
+        start: float = 0,
+        end: Optional[float] = None,
+    ) -> "Iterator[AudioFragment]":
+        """
+        Splits the audio into multiple fragments of a specified duration.
+        Args:
+            duration (float): The duration of each audio fragment in seconds.
+            start (float): The starting time in seconds (default: 0).
+            end (float, optional): The ending time in seconds. If None, the entire
+                                   remaining audio is processed (default: None).
+        Returns:
+            Iterator[AudioFragment]: An iterator yielding audio fragments.
+        Note:
+            If end is not specified, number of samples will be taken from the
+            audio file, this means audio file needs to be downloaded.
+        """
+        if duration <= 0:
+            raise ValueError("duration must be a positive float")
+        if start < 0:
+            raise ValueError("start must be a non-negative float")
+        if end is None:
+            end = self.get_info().duration
+        if end < 0:
+            raise ValueError("end must be a non-negative float")
+        if start >= end:
+            raise ValueError("start must be less than end")
+        while start < end:
+            yield self.get_fragment(start, min(start + duration, end))
+            start += duration
+class AudioFragment(DataModel):
+    """
+    A data model for representing an audio fragment.
+    This model represents a specific fragment within an audio file with defined
+    start and end times. It allows access to individual fragments and provides
+    functionality for reading and saving audio fragments as separate audio files.
+    Attributes:
+        audio (AudioFile): The audio file containing the audio fragment.
+        start (float): The starting time of the audio fragment in seconds.
+        end (float): The ending time of the audio fragment in seconds.
+    """
+    audio: AudioFile
+    start: float
+    end: float
+    def get_np(self) -> tuple["ndarray", int]:
+        """
+        Returns the audio fragment as a NumPy array with sample rate.
+        Returns:
+            tuple[ndarray, int]: A tuple containing the audio data as a NumPy array
+                               and the sample rate.
+        """
+        from .audio import audio_fragment_np
+        duration = self.end - self.start
+        return audio_fragment_np(self.audio, self.start, duration)
+    def read_bytes(self, format: str = "wav") -> bytes:
+        """
+        Returns the audio fragment as audio bytes.
+        Args:
+            format (str): The desired audio format (e.g., 'wav', 'mp3').
+                         Defaults to 'wav'.
+        Returns:
+            bytes: The encoded audio fragment as bytes.
+        """
+        from .audio import audio_fragment_bytes
+        duration = self.end - self.start
+        return audio_fragment_bytes(self.audio, self.start, duration, format)
+    def save(self, output: str, format: Optional[str] = None) -> "AudioFile":
+        """
+        Saves the audio fragment as a new audio file.
+        If `output` is a remote path, the audio file will be uploaded to remote storage.
+        Args:
+            output (str): The destination path, which can be a local file path
+                          or a remote URL.
+            format (str, optional): The output audio format (e.g., 'wav', 'mp3').
+                                    If None, the format is inferred from the
+                                    file extension.
+        Returns:
+            AudioFile: A Model representing the saved audio file.
+        """
+        from .audio import save_audio_fragment
+        return save_audio_fragment(self.audio, self.start, self.end, output, format)
 class VideoFrame(DataModel):
     """
     A data model for representing a video frame.
@@ -981,6 +1140,34 @@ class Video(DataModel):
     codec: str = Field(default="")
+class Audio(DataModel):
+    """
+    A data model representing metadata for an audio file.
+    Attributes:
+        sample_rate (int): The sample rate of the audio (samples per second).
+                          Defaults to -1 if unknown.
+        channels (int): The number of audio channels. Defaults to -1 if unknown.
+        duration (float): The total duration of the audio in seconds.
+                         Defaults to -1.0 if unknown.
+        samples (int): The total number of samples in the audio.
+                      Defaults to -1 if unknown.
+        format (str): The format of the audio file (e.g., 'wav', 'mp3').
+                     Defaults to an empty string.
+        codec (str): The codec used for encoding the audio. Defaults to an empty string.
+        bit_rate (int): The bit rate of the audio in bits per second.
+                       Defaults to -1 if unknown.
+    """
+    sample_rate: int = Field(default=-1)
+    channels: int = Field(default=-1)
+    duration: float = Field(default=-1.0)
+    samples: int = Field(default=-1)
+    format: str = Field(default="")
+    codec: str = Field(default="")
+    bit_rate: int = Field(default=-1)
 class ArrowRow(DataModel):
     """`DataModel` for reading row from Arrow-supported file."""
@@ -1018,5 +1205,7 @@ def get_file_type(type_: FileType = "binary") -> type[File]:
         file = ImageFile  # type: ignore[assignment]
     elif type_ == "video":
         file = VideoFile
+    elif type_ == "audio":
+        file = AudioFile
     return file

datachain/lib/model_store.py CHANGED Viewed

@@ -81,3 +81,11 @@ class ModelStore:
         if val is None or not ModelStore.is_pydantic(val):
             return None
         return val
+    @staticmethod
+    def is_partial(parent_type) -> bool:
+        return (
+            parent_type
+            and ModelStore.is_pydantic(parent_type)
+            and "@" in ModelStore.get_name(parent_type)
+        )

datachain/lib/signal_schema.py CHANGED Viewed

@@ -446,14 +446,14 @@ class SignalSchema:
                 res[db_name] = python_to_sql(type_)
         return res
-    def row_to_objs(self, row: Sequence[Any]) -> list[DataValue]:
+    def row_to_objs(self, row: Sequence[Any]) -> list[Any]:
         self._init_setup_values()
-        objs: list[DataValue] = []
+        objs: list[Any] = []
         pos = 0
         for name, fr_type in self.values.items():
-            if self.setup_values and (val := self.setup_values.get(name, None)):
-                objs.append(val)
+            if self.setup_values and name in self.setup_values:
+                objs.append(self.setup_values.get(name))
             elif (fr := ModelStore.to_pydantic(fr_type)) is not None:
                 j, pos = unflatten_to_json_pos(fr, row, pos)
                 objs.append(fr(**j))
@@ -589,6 +589,9 @@ class SignalSchema:
         ]
         if name:
+            if "." in name:
+                name = name.replace(".", "__")
             signals = [
                 s
                 for s in signals
@@ -625,6 +628,15 @@ class SignalSchema:
         return curr_type
+    def group_by(
+        self, partition_by: Sequence[str], new_column: Sequence[Column]
+    ) -> "SignalSchema":
+        orig_schema = SignalSchema(copy.deepcopy(self.values))
+        schema = orig_schema.to_partial(*partition_by)
+        vals = {c.name: sql_to_python(c) for c in new_column}
+        return SignalSchema(schema.values | vals)
     def select_except_signals(self, *args: str) -> "SignalSchema":
         def has_signal(signal: str):
             signal = signal.replace(".", DEFAULT_DELIMITER)
@@ -888,7 +900,7 @@ class SignalSchema:
         return res
-    def to_partial(self, *columns: str) -> "SignalSchema":
+    def to_partial(self, *columns: str) -> "SignalSchema":  # noqa: C901
         """
         Convert the schema to a partial schema with only the specified columns.
@@ -931,9 +943,15 @@ class SignalSchema:
         partial_versions: dict[str, int] = {}
         def _type_name_to_partial(signal_name: str, type_name: str) -> str:
-            if "@" not in type_name:
+            # Check if we need to create a partial for this type
+            # Only create partials for custom types that are in the custom_types dict
+            if type_name not in custom_types:
                 return type_name
-            model_name, _ = ModelStore.parse_name_version(type_name)
+            if "@" in type_name:
+                model_name, _ = ModelStore.parse_name_version(type_name)
+            else:
+                model_name = type_name
             if signal_name not in signal_partials:
                 partial_versions.setdefault(model_name, 0)
@@ -957,6 +975,14 @@ class SignalSchema:
                     parent_type_partial = _type_name_to_partial(signal, parent_type)
                     schema[signal] = parent_type_partial
+                    # If this is a complex signal without field specifier (just "file")
+                    # and it's a custom type, include the entire complex signal
+                    if len(column_parts) == 1 and parent_type in custom_types:
+                        # Include the entire complex signal - no need to create partial
+                        schema[signal] = parent_type
+                        continue
                     continue
                 if parent_type not in custom_types:
@@ -971,6 +997,20 @@ class SignalSchema:
                         f"Field {signal} not found in custom type {parent_type}"
                     )
+                # Check if this is the last part and if the column type is a complex
+                is_last_part = i == len(column_parts) - 1
+                is_complex_signal = signal_type in custom_types
+                if is_last_part and is_complex_signal:
+                    schema[column] = signal_type
+                    # Also need to remove the partial schema entry we created for the
+                    # parent since we're promoting the nested complex column to root
+                    parent_signal = column_parts[0]
+                    schema.pop(parent_signal, None)
+                    # Don't create partial types for this case
+                    break
+                # Create partial type for this field
                 partial_type = _type_name_to_partial(
                     ".".join(column_parts[: i + 1]),
                     signal_type,

datachain/lib/udf.py CHANGED Viewed

@@ -13,8 +13,7 @@ from datachain.asyn import AsyncMapper
 from datachain.cache import temporary_cache
 from datachain.dataset import RowDict
 from datachain.lib.convert.flatten import flatten
-from datachain.lib.data_model import DataValue
-from datachain.lib.file import File
+from datachain.lib.file import DataModel, File
 from datachain.lib.utils import AbstractUDF, DataChainError, DataChainParamsError
 from datachain.query.batch import (
     Batch,
@@ -266,15 +265,28 @@ class UDFBase(AbstractUDF):
     def _parse_row(
         self, row_dict: RowDict, catalog: "Catalog", cache: bool, download_cb: Callback
-    ) -> list[DataValue]:
+    ) -> list[Any]:
         assert self.params
         row = [row_dict[p] for p in self.params.to_udf_spec()]
         obj_row = self.params.row_to_objs(row)
         for obj in obj_row:
-            if isinstance(obj, File):
-                obj._set_stream(catalog, caching_enabled=cache, download_cb=download_cb)
+            self._set_stream_recursive(obj, catalog, cache, download_cb)
         return obj_row
+    def _set_stream_recursive(
+        self, obj: Any, catalog: "Catalog", cache: bool, download_cb: Callback
+    ) -> None:
+        """Recursively set the catalog stream on all File objects within an object."""
+        if isinstance(obj, File):
+            obj._set_stream(catalog, caching_enabled=cache, download_cb=download_cb)
+        # Check all fields for nested File objects, but only for DataModel objects
+        if isinstance(obj, DataModel):
+            for field_name in obj.model_fields:
+                field_value = getattr(obj, field_name, None)
+                if isinstance(field_value, DataModel):
+                    self._set_stream_recursive(field_value, catalog, cache, download_cb)
     def _prepare_row(self, row, udf_fields, catalog, cache, download_cb):
         row_dict = RowDict(zip(udf_fields, row))
         return self._parse_row(row_dict, catalog, cache, download_cb)

datachain/query/dataset.py CHANGED Viewed

@@ -1031,16 +1031,22 @@ class SQLGroupBy(SQLClause):
             c.get_column() if isinstance(c, Function) else c for c in self.group_by
         ]
-        cols = [
-            c.get_column()
-            if isinstance(c, Function)
-            else subquery.c[str(c)]
-            if isinstance(c, (str, C))
-            else c
-            for c in (*group_by, *self.cols)
-        ]
+        cols_dict: dict[str, Any] = {}
+        for c in (*group_by, *self.cols):
+            if isinstance(c, Function):
+                key = c.name
+                value = c.get_column()
+            elif isinstance(c, (str, C)):
+                key = str(c)
+                value = subquery.c[str(c)]
+            else:
+                key = c.name
+                value = c  # type: ignore[assignment]
+            cols_dict[key] = value
+        unique_cols = cols_dict.values()
-        return sqlalchemy.select(*cols).select_from(subquery).group_by(*group_by)
+        return sqlalchemy.select(*unique_cols).select_from(subquery).group_by(*group_by)
 def _validate_columns(

{datachain-0.25.1.dist-info → datachain-0.26.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.25.1
+Version: 0.26.0
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License-Expression: Apache-2.0
@@ -63,6 +63,9 @@ Provides-Extra: torch
 Requires-Dist: torch>=2.1.0; extra == "torch"
 Requires-Dist: torchvision; extra == "torch"
 Requires-Dist: transformers>=4.36.0; extra == "torch"
+Provides-Extra: audio
+Requires-Dist: torchaudio; extra == "audio"
+Requires-Dist: soundfile; extra == "audio"
 Provides-Extra: remote
 Requires-Dist: lz4; extra == "remote"
 Requires-Dist: requests>=2.22.0; extra == "remote"
@@ -78,7 +81,7 @@ Requires-Dist: ffmpeg-python; extra == "video"
 Requires-Dist: imageio[ffmpeg,pyav]>=2.37.0; extra == "video"
 Requires-Dist: opencv-python; extra == "video"
 Provides-Extra: tests
-Requires-Dist: datachain[hf,remote,torch,vector,video]; extra == "tests"
+Requires-Dist: datachain[audio,hf,remote,torch,vector,video]; extra == "tests"
 Requires-Dist: pytest<9,>=8; extra == "tests"
 Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
 Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
@@ -108,6 +111,7 @@ Requires-Dist: accelerate; extra == "examples"
 Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
 Requires-Dist: ultralytics; extra == "examples"
 Requires-Dist: open_clip_torch; extra == "examples"
+Requires-Dist: openai; extra == "examples"
 Dynamic: license-file
 ================

{datachain-0.25.1.dist-info → datachain-0.26.0.dist-info}/RECORD RENAMED Viewed

@@ -1,4 +1,4 @@
-datachain/__init__.py,sha256=ofXacfzLKYzTqU1oyHz5xZi1L4skQCoJdUMC4YARenk,1616
+datachain/__init__.py,sha256=2TZ8ptSB9BtnYF31mDEhWG9N16EQ5pf9vNqQaFr2txs,1712
 datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
 datachain/asyn.py,sha256=RH_jFwJcTXxhEFomaI9yL6S3Onau6NZ6FSKfKFGtrJE,9689
 datachain/cache.py,sha256=ESVRaCJXEThMIfGEFVHx6wJPOZA7FYk9V6WxjyuqUBY,3626
@@ -71,24 +71,25 @@ datachain/func/string.py,sha256=X9u4ip97U63RCaKRhMddoze7HgPiY3LbPRn9G06UWWo,7311
 datachain/func/window.py,sha256=ImyRpc1QI8QUSPO7KdD60e_DPVo7Ja0G5kcm6BlyMcw,1584
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/arrow.py,sha256=hdEQ8I1JgNmEAaXTaqaU1qvZDi5dgtes1IC69ycthz8,10753
+datachain/lib/audio.py,sha256=J7XJ14ItPF9y6pN-tmMV9In9X9rgwlBwzyzdGOUkPGk,4376
 datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
 datachain/lib/data_model.py,sha256=ZwBXELtqROEdLL4DmxTipnwUZmhQvMz_UVDzyf7nQ9Y,2899
 datachain/lib/dataset_info.py,sha256=7w-DoKOyIVoOtWGCgciMLcP5CiAWJB3rVI-vUDF80k0,3311
-datachain/lib/file.py,sha256=gTzJXaGIyFOrw_B4yiOEs7U23n4oAQuWDI2v9KWwp2o,33889
+datachain/lib/file.py,sha256=tHBBacsh1580UPFC6fAINBNwNiyymNgzj89rpsz1LKc,40817
 datachain/lib/hf.py,sha256=_dCoGTv7n5cBgxhCDfZI-t3hnMCXGHd6sEsxRThcizE,5754
 datachain/lib/image.py,sha256=erWvZW5M3emnbl6_fGAOPyKm-1EKbt3vOdWPfe3Oo7U,3265
 datachain/lib/listing.py,sha256=U-2stsTEwEsq4Y80dqGfktGzkmB5-ZntnL1_rzXlH0k,7089
 datachain/lib/listing_info.py,sha256=9ua40Hw0aiQByUw3oAEeNzMavJYfW0Uhe8YdCTK-m_g,1110
 datachain/lib/meta_formats.py,sha256=zdyg6XLk3QIsSk3I7s0Ez5kaCJSlE3uq7JiGxf7UwtU,6348
-datachain/lib/model_store.py,sha256=DNIv8Y6Jtk1_idNLzIpsThOsdW2BMAudyUCbPUcgcxk,2515
+datachain/lib/model_store.py,sha256=dkL2rcT5ag-kbgkhQPL_byEs-TCYr29qvdltroL5NxM,2734
 datachain/lib/namespaces.py,sha256=it52UbbwB8dzhesO2pMs_nThXiPQ1Ph9sD9I3GQkg5s,2099
 datachain/lib/projects.py,sha256=8lN0qV8czX1LGtWURCUvRlSJk-RpO9w9Rra_pOZus6g,2595
 datachain/lib/pytorch.py,sha256=oBBd6cxYrcwaFz7IQajKqhGqDdNnwUZWs0wJPRizrjk,7712
 datachain/lib/settings.py,sha256=9wi0FoHxRxNiyn99pR28IYsMkoo47jQxeXuObQr2Ar0,2929
-datachain/lib/signal_schema.py,sha256=dVEqqrQQ_BS3yzU_49-Gari7IjVyMl1UT8h1WIsZabs,36489
+datachain/lib/signal_schema.py,sha256=UGbjG6yJKIU2i4B6z9AK1rqaPWtxRjsPnCV6GYbNqGg,38329
 datachain/lib/tar.py,sha256=MLcVjzIgBqRuJacCNpZ6kwSZNq1i2tLyROc8PVprHsA,999
 datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
-datachain/lib/udf.py,sha256=3uITkhO8IZnX49aePheObzd5ORYi2DIDYZVMQlBAJ-s,16687
+datachain/lib/udf.py,sha256=nkcB3HNtSteUspwsGmOKyy3mH2F-Sfo6iW64-Ep47-I,17299
 datachain/lib/udf_signature.py,sha256=Yz20iJ-WF1pijT3hvcDIKFzgWV9gFxZM73KZRx3NbPk,7560
 datachain/lib/utils.py,sha256=rG2y7NwTqZOuomZZRmrA-Q-ANM_j1cToQYqDJoOeGyU,1480
 datachain/lib/video.py,sha256=u6fLJWj5G6QqsVkpfHnKGklBNpG3BRRg6v3izngnNcU,6767
@@ -97,14 +98,14 @@ datachain/lib/webdataset_laion.py,sha256=xvT6m_r5y0KbOx14BUe7UC5mOgrktJq53Mh-H0E
 datachain/lib/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/convert/flatten.py,sha256=IZFiUYbgXSxXhPSG5Cqf5IjnJ4ZDZKXMr4o_yCR1NY4,1505
 datachain/lib/convert/python_to_sql.py,sha256=wg-O5FRKX3x3Wh8ZL1b9ntMlgf1zRO4djMP3t8CHJLo,3188
-datachain/lib/convert/sql_to_python.py,sha256=XXCBYDQFUXJIBNWkjEP944cnCfJ8GF2Tji0DLF3A_zQ,315
+datachain/lib/convert/sql_to_python.py,sha256=Gxc4FylWC_Pvvuawuc2MKZIiuAWI7wje8pyeN1MxRrU,670
 datachain/lib/convert/unflatten.py,sha256=ysMkstwJzPMWUlnxn-Z-tXJR3wmhjHeSN_P-sDcLS6s,2010
 datachain/lib/convert/values_to_tuples.py,sha256=j5yZMrVUH6W7b-7yUvdCTGI7JCUAYUOzHUGPoyZXAB0,4360
 datachain/lib/dc/__init__.py,sha256=TFci5HTvYGjBesNUxDAnXaX36PnzPEUSn5a6JxB9o0U,872
 datachain/lib/dc/csv.py,sha256=q6a9BpapGwP6nwy6c5cklxQumep2fUp9l2LAjtTJr6s,4411
 datachain/lib/dc/database.py,sha256=g5M6NjYR1T0vKte-abV-3Ejnm-HqxTIMir5cRi_SziE,6051
-datachain/lib/dc/datachain.py,sha256=_FJnpgNN_b2xz39MsgeS0NTto0hzpcFPbJlaUBLcqTs,87094
-datachain/lib/dc/datasets.py,sha256=eBhcybEeXHcQ_7RweRCh5uJyF5Ym1EEDPmD0YWYDPHw,15097
+datachain/lib/dc/datachain.py,sha256=YJYHp94yTWjd_ZmBXEUOHVeEvOb5jOhjIxgtqu1dnW4,91746
+datachain/lib/dc/datasets.py,sha256=P6CIJizD2IYFwOQG5D3VbQRjDmUiRH0ysdtb551Xdm8,15098
 datachain/lib/dc/hf.py,sha256=PJl2wiLjdRsMz0SYbLT-6H8b-D5i2WjeH7li8HHOk_0,2145
 datachain/lib/dc/json.py,sha256=dNijfJ-H92vU3soyR7X1IiDrWhm6yZIGG3bSnZkPdAE,2733
 datachain/lib/dc/listings.py,sha256=V379Cb-7ZyquM0w7sWArQZkzInZy4GB7QQ1ZfowKzQY,4544
@@ -125,7 +126,7 @@ datachain/model/ultralytics/pose.py,sha256=pBlmt63Qe68FKmexHimUGlNbNOoOlMHXG4fzX
 datachain/model/ultralytics/segment.py,sha256=63bDCj43E6iZ0hFI5J6uQfksdCmjEp6sEm1XzVaE8pw,2986
 datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
 datachain/query/batch.py,sha256=-goxLpE0EUvaDHu66rstj53UnfHpYfBUGux8GSpJ93k,4306
-datachain/query/dataset.py,sha256=t9EWZkJGPRPcBvKOsFO7ZiaTeUXc8YuTZydRbcv83_E,61350
+datachain/query/dataset.py,sha256=bhJpm53tNLQzGECuR1nC1tg2Vd6foq6AKST5h1rb41U,61606
 datachain/query/dispatch.py,sha256=A0nPxn6mEN5d9dDo6S8m16Ji_9IvJLXrgF2kqXdi4fs,15546
 datachain/query/metrics.py,sha256=DOK5HdNVaRugYPjl8qnBONvTkwjMloLqAr7Mi3TjCO0,858
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -157,9 +158,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
 datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.25.1.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.25.1.dist-info/METADATA,sha256=NaMV5K1wxCrOI7zW8agwmNfDMMkJJgaQ2fNX2PsuHnc,13385
-datachain-0.25.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-datachain-0.25.1.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.25.1.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.25.1.dist-info/RECORD,,
+datachain-0.26.0.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.26.0.dist-info/METADATA,sha256=4-DhUSU6ciIc8iUiB4UAh1ZKyFczvN5rHZnvd1x2Y9U,13543
+datachain-0.26.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+datachain-0.26.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.26.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.26.0.dist-info/RECORD,,

{datachain-0.25.1.dist-info → datachain-0.26.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{datachain-0.25.1.dist-info → datachain-0.26.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{datachain-0.25.1.dist-info → datachain-0.26.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{datachain-0.25.1.dist-info → datachain-0.26.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

datachain 0.25.1__py3-none-any.whl → 0.26.0__py3-none-any.whl

Potentially problematic release.

datachain 0.25.1__py3-none-any.whl → 0.26.0__py3-none-any.whl