PyPI - datachain - Versions diffs - 0.25.2__py3-none-any.whl → 0.26.1__py3-none-any.whl - Mend

datachain 0.25.2__py3-none-any.whl → 0.26.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of datachain might be problematic. Click here for more details.

Files changed (19) hide show

datachain/__init__.py +6 -0
datachain/catalog/loader.py +4 -0
datachain/func/__init__.py +2 -1
datachain/func/conditional.py +34 -0
datachain/lib/audio.py +151 -0
datachain/lib/convert/sql_to_python.py +8 -0
datachain/lib/dc/datachain.py +227 -67
datachain/lib/file.py +190 -1
datachain/lib/model_store.py +8 -0
datachain/lib/pytorch.py +4 -1
datachain/lib/signal_schema.py +56 -11
datachain/lib/udf.py +17 -5
datachain/query/dataset.py +37 -9
{datachain-0.25.2.dist-info → datachain-0.26.1.dist-info}/METADATA +6 -2
{datachain-0.25.2.dist-info → datachain-0.26.1.dist-info}/RECORD +19 -18
{datachain-0.25.2.dist-info → datachain-0.26.1.dist-info}/WHEEL +0 -0
{datachain-0.25.2.dist-info → datachain-0.26.1.dist-info}/entry_points.txt +0 -0
{datachain-0.25.2.dist-info → datachain-0.26.1.dist-info}/licenses/LICENSE +0 -0
{datachain-0.25.2.dist-info → datachain-0.26.1.dist-info}/top_level.txt +0 -0

datachain/__init__.py CHANGED Viewed

@@ -21,6 +21,9 @@ from datachain.lib.dc import (
 )
 from datachain.lib.file import (
     ArrowRow,
+    Audio,
+    AudioFile,
+    AudioFragment,
     File,
     FileError,
     Image,
@@ -43,6 +46,9 @@ __all__ = [
     "AbstractUDF",
     "Aggregator",
     "ArrowRow",
+    "Audio",
+    "AudioFile",
+    "AudioFragment",
     "C",
     "Column",
     "DataChain",

datachain/catalog/loader.py CHANGED Viewed

@@ -18,6 +18,7 @@ WAREHOUSE_IMPORT_PATH = "DATACHAIN_WAREHOUSE"
 WAREHOUSE_ARG_PREFIX = "DATACHAIN_WAREHOUSE_ARG_"
 DISTRIBUTED_IMPORT_PYTHONPATH = "DATACHAIN_DISTRIBUTED_PYTHONPATH"
 DISTRIBUTED_IMPORT_PATH = "DATACHAIN_DISTRIBUTED"
+DISTRIBUTED_DISABLED = "DATACHAIN_DISTRIBUTED_DISABLED"
 IN_MEMORY_ERROR_MESSAGE = "In-memory is only supported on SQLite"
@@ -103,6 +104,9 @@ def get_warehouse(in_memory: bool = False) -> "AbstractWarehouse":
 def get_udf_distributor_class() -> Optional[type["AbstractUDFDistributor"]]:
+    if os.environ.get(DISTRIBUTED_DISABLED) == "True":
+        return None
     if not (distributed_import_path := os.environ.get(DISTRIBUTED_IMPORT_PATH)):
         return None

datachain/func/__init__.py CHANGED Viewed

@@ -16,7 +16,7 @@ from .aggregate import (
     sum,
 )
 from .array import contains, cosine_distance, euclidean_distance, length, sip_hash_64
-from .conditional import and_, case, greatest, ifelse, isnone, least, or_
+from .conditional import and_, case, greatest, ifelse, isnone, least, not_, or_
 from .numeric import bit_and, bit_hamming_distance, bit_or, bit_xor, int_hash_64
 from .path import file_ext, file_stem, name, parent
 from .random import rand
@@ -54,6 +54,7 @@ __all__ = [
     "max",
     "min",
     "name",
+    "not_",
     "or_",
     "parent",
     "path",

datachain/func/conditional.py CHANGED Viewed

@@ -3,6 +3,7 @@ from typing import Optional, Union
 from sqlalchemy import ColumnElement
 from sqlalchemy import and_ as sql_and
 from sqlalchemy import case as sql_case
+from sqlalchemy import not_ as sql_not
 from sqlalchemy import or_ as sql_or
 from datachain.lib.utils import DataChainParamsError
@@ -288,3 +289,36 @@ def and_(*args: Union[ColumnElement, Func]) -> Func:
             func_args.append(arg)
     return Func("and", inner=sql_and, cols=cols, args=func_args, result_type=bool)
+def not_(arg: Union[ColumnElement, Func]) -> Func:
+    """
+    Returns the function that produces NOT of the given expressions.
+    Args:
+        arg (ColumnElement | Func): The expression for NOT statement.
+            If a string is provided, it is assumed to be the name of the column.
+            If a Column is provided, it is assumed to be a column in the dataset.
+            If a Func is provided, it is assumed to be a function returning a value.
+    Returns:
+        Func: A `Func` object that represents the NOT function.
+    Example:
+        ```py
+        dc.mutate(
+            test=not_(C("value") == 5)
+        )
+        ```
+    Notes:
+        - The result column will always be of type bool.
+    """
+    cols, func_args = [], []
+    if isinstance(arg, (str, Func)):
+        cols.append(arg)
+    else:
+        func_args.append(arg)
+    return Func("not", inner=sql_not, cols=cols, args=func_args, result_type=bool)

datachain/lib/audio.py ADDED Viewed

@@ -0,0 +1,151 @@
+import posixpath
+from typing import TYPE_CHECKING, Optional, Union
+from datachain.lib.file import FileError
+if TYPE_CHECKING:
+    from numpy import ndarray
+    from datachain.lib.file import Audio, AudioFile, File
+try:
+    import torchaudio
+except ImportError as exc:
+    raise ImportError(
+        "Missing dependencies for processing audio.\n"
+        "To install run:\n\n"
+        "  pip install 'datachain[audio]'\n"
+    ) from exc
+def audio_info(file: "Union[File, AudioFile]") -> "Audio":
+    """Extract metadata like sample rate, channels, duration, and format."""
+    from datachain.lib.file import Audio
+    file = file.as_audio_file()
+    try:
+        with file.open() as f:
+            info = torchaudio.info(f)
+            sample_rate = int(info.sample_rate)
+            channels = int(info.num_channels)
+            frames = int(info.num_frames)
+            duration = float(frames / sample_rate) if sample_rate > 0 else 0.0
+            # Get format information
+            format_name = getattr(info, "format", "")
+            codec_name = getattr(info, "encoding", "")
+            bit_rate = getattr(info, "bits_per_sample", 0) * sample_rate * channels
+    except Exception as exc:
+        raise FileError(
+            "unable to extract metadata from audio file", file.source, file.path
+        ) from exc
+    return Audio(
+        sample_rate=sample_rate,
+        channels=channels,
+        duration=duration,
+        samples=frames,
+        format=format_name,
+        codec=codec_name,
+        bit_rate=bit_rate,
+    )
+def audio_fragment_np(
+    audio: "AudioFile", start: float = 0, duration: Optional[float] = None
+) -> "tuple[ndarray, int]":
+    """Load audio fragment as numpy array.
+    Multi-channel audio is transposed to (samples, channels)."""
+    if start < 0:
+        raise ValueError("start must be a non-negative float")
+    if duration is not None and duration <= 0:
+        raise ValueError("duration must be a positive float")
+    if hasattr(audio, "as_audio_file"):
+        audio = audio.as_audio_file()
+    try:
+        with audio.open() as f:
+            info = torchaudio.info(f)
+            sample_rate = info.sample_rate
+            frame_offset = int(start * sample_rate)
+            num_frames = int(duration * sample_rate) if duration is not None else -1
+            # Reset file pointer to the beginning
+            # This is important to ensure we read from the correct position later
+            f.seek(0)
+            waveform, sr = torchaudio.load(
+                f, frame_offset=frame_offset, num_frames=num_frames
+            )
+            audio_np = waveform.numpy()
+            if audio_np.shape[0] > 1:
+                audio_np = audio_np.T
+            else:
+                audio_np = audio_np.squeeze()
+            return audio_np, int(sr)
+    except Exception as exc:
+        raise FileError(
+            "unable to read audio fragment", audio.source, audio.path
+        ) from exc
+def audio_fragment_bytes(
+    audio: "AudioFile",
+    start: float = 0,
+    duration: Optional[float] = None,
+    format: str = "wav",
+) -> bytes:
+    """Convert audio fragment to bytes using soundfile."""
+    y, sr = audio_fragment_np(audio, start, duration)
+    import io
+    import soundfile as sf
+    buffer = io.BytesIO()
+    sf.write(buffer, y, sr, format=format)
+    return buffer.getvalue()
+def save_audio_fragment(
+    audio: "AudioFile",
+    start: float,
+    end: float,
+    output: str,
+    format: Optional[str] = None,
+) -> "AudioFile":
+    """Save audio fragment with timestamped filename.
+    Supports local and remote storage upload."""
+    if start < 0 or end < 0 or start >= end:
+        raise ValueError(f"Invalid time range: ({start:.3f}, {end:.3f})")
+    if format is None:
+        format = audio.get_file_ext()
+    duration = end - start
+    start_ms = int(start * 1000)
+    end_ms = int(end * 1000)
+    output_file = posixpath.join(
+        output, f"{audio.get_file_stem()}_{start_ms:06d}_{end_ms:06d}.{format}"
+    )
+    try:
+        audio_bytes = audio_fragment_bytes(audio, start, duration, format)
+        from datachain.lib.file import AudioFile
+        return AudioFile.upload(audio_bytes, output_file, catalog=audio._catalog)
+    except Exception as exc:
+        raise FileError(
+            "unable to save audio fragment", audio.source, audio.path
+        ) from exc

datachain/lib/convert/sql_to_python.py CHANGED Viewed

@@ -9,6 +9,14 @@ def sql_to_python(sql_exp: ColumnElement) -> Any:
         type_ = sql_exp.type.python_type
         if type_ == Decimal:
             type_ = float
+        elif type_ is list:
+            if hasattr(sql_exp.type, "item_type") and hasattr(
+                sql_exp.type.item_type, "python_type"
+            ):
+                item_type = getattr(sql_exp.type.item_type, "python_type", Any)
+                type_ = list[item_type]  # type: ignore[valid-type]
+            else:
+                type_ = list
     except NotImplementedError:
         type_ = str
     return type_

datachain 0.25.2__py3-none-any.whl → 0.26.1__py3-none-any.whl

Potentially problematic release.

datachain 0.25.2__py3-none-any.whl → 0.26.1__py3-none-any.whl