torchrir 0.1.2__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
torchrir/__init__.py CHANGED
@@ -18,6 +18,11 @@ from .datasets import (
     CmuArcticDataset,
     CmuArcticSentence,
     choose_speakers,
+    CollateBatch,
+    collate_dataset_items,
+    DatasetItem,
+    LibriSpeechDataset,
+    LibriSpeechSentence,
     list_cmu_arctic_speakers,
     SentenceLike,
     load_dataset_sources,
@@ -26,7 +31,12 @@ from .datasets import (
     load_wav_mono,
     save_wav,
 )
-from .scene_utils import binaural_mic_positions, clamp_positions, linear_trajectory, sample_positions
+from .scene_utils import (
+    binaural_mic_positions,
+    clamp_positions,
+    linear_trajectory,
+    sample_positions,
+)
 from .utils import (
     att2t_SabineEstimation,
     att2t_sabine_estimation,
@@ -56,6 +66,11 @@ __all__ = [
     "CmuArcticDataset",
     "CmuArcticSentence",
     "choose_speakers",
+    "CollateBatch",
+    "collate_dataset_items",
+    "DatasetItem",
+    "LibriSpeechDataset",
+    "LibriSpeechSentence",
     "DynamicConvolver",
     "estimate_beta_from_t60",
     "estimate_t60_from_beta",
torchrir/animation.py CHANGED
@@ -104,15 +104,15 @@ def animate_scene_gif(
     mic_lines = []
     for _ in range(view_src_traj.shape[1]):
         if view_dim == 2:
-            line, = ax.plot([], [], color="tab:green", alpha=0.6)
+            (line,) = ax.plot([], [], color="tab:green", alpha=0.6)
         else:
-            line, = ax.plot([], [], [], color="tab:green", alpha=0.6)
+            (line,) = ax.plot([], [], [], color="tab:green", alpha=0.6)
         src_lines.append(line)
     for _ in range(view_mic_traj.shape[1]):
         if view_dim == 2:
-            line, = ax.plot([], [], color="tab:orange", alpha=0.6)
+            (line,) = ax.plot([], [], color="tab:orange", alpha=0.6)
         else:
-            line, = ax.plot([], [], [], color="tab:orange", alpha=0.6)
+            (line,) = ax.plot([], [], [], color="tab:orange", alpha=0.6)
         mic_lines.append(line)
 
     ax.legend(loc="best")
@@ -137,15 +137,15 @@ def animate_scene_gif(
                 xy = mic_frame[:, m_idx, :]
                 line.set_data(xy[:, 0], xy[:, 1])
         else:
-            src_scatter._offsets3d = (
-                src_pos_frame[:, 0],
-                src_pos_frame[:, 1],
-                src_pos_frame[:, 2],
+            setattr(
+                src_scatter,
+                "_offsets3d",
+                (src_pos_frame[:, 0], src_pos_frame[:, 1], src_pos_frame[:, 2]),
             )
-            mic_scatter._offsets3d = (
-                mic_pos_frame[:, 0],
-                mic_pos_frame[:, 1],
-                mic_pos_frame[:, 2],
+            setattr(
+                mic_scatter,
+                "_offsets3d",
+                (mic_pos_frame[:, 0], mic_pos_frame[:, 1], mic_pos_frame[:, 2]),
             )
             for s_idx, line in enumerate(src_lines):
                 xyz = src_frame[:, s_idx, :]
@@ -166,7 +166,10 @@ def animate_scene_gif(
         fps = frames / duration_s
     else:
         fps = 6.0
-    anim = animation.FuncAnimation(fig, _frame, frames=frames, interval=1000 / fps, blit=False)
-    anim.save(out_path, writer="pillow", fps=fps)
+    anim = animation.FuncAnimation(
+        fig, _frame, frames=frames, interval=1000 / fps, blit=False
+    )
+    fps_int = None if fps is None else max(1, int(round(fps)))
+    anim.save(out_path, writer="pillow", fps=fps_int)
     plt.close(fig)
     return out_path
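
A plausible reading of the fps change above: GIF frame durations are quantized and matplotlib's pillow writer works best with a small positive integer frame rate, so the fractional frames/duration value is clamped and rounded before saving. Illustrative arithmetic only:

    frames, duration_s = 25, 6.0
    fps = frames / duration_s             # 4.1666...
    fps_int = max(1, int(round(fps)))     # 4 is what reaches anim.save(..., fps=...)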
torchrir/core.py CHANGED
@@ -3,6 +3,7 @@ from __future__ import annotations
 """Core RIR simulation functions (static and dynamic)."""
 
 import math
+from collections.abc import Callable
 from typing import Optional, Tuple
 
 import torch
@@ -61,8 +62,8 @@ def simulate_rir(
 
     Example:
         >>> room = Room.shoebox(size=[6.0, 4.0, 3.0], fs=16000, beta=[0.9] * 6)
-        >>> sources = Source.positions([[1.0, 2.0, 1.5]])
-        >>> mics = MicrophoneArray.positions([[2.0, 2.0, 1.5]])
+        >>> sources = Source.from_positions([[1.0, 2.0, 1.5]])
+        >>> mics = MicrophoneArray.from_positions([[2.0, 2.0, 1.5]])
         >>> rir = simulate_rir(
         ...     room=room,
         ...     sources=sources,
@@ -90,9 +91,9 @@ def simulate_rir(
 
     if not isinstance(room, Room):
        raise TypeError("room must be a Room instance")
-    if nsample is None and tmax is None:
-        raise ValueError("nsample or tmax must be provided")
     if nsample is None:
+        if tmax is None:
+            raise ValueError("nsample or tmax must be provided")
        nsample = int(math.ceil(tmax * room.fs))
     if nsample <= 0:
        raise ValueError("nsample must be positive")
@@ -261,6 +262,11 @@ def simulate_dynamic_rir(
 
     src_traj = as_tensor(src_traj, device=device, dtype=dtype)
     mic_traj = as_tensor(mic_traj, device=device, dtype=dtype)
+    device, dtype = infer_device_dtype(
+        src_traj, mic_traj, room.size, device=device, dtype=dtype
+    )
+    src_traj = as_tensor(src_traj, device=device, dtype=dtype)
+    mic_traj = as_tensor(mic_traj, device=device, dtype=dtype)
 
     if src_traj.ndim == 2:
         src_traj = src_traj.unsqueeze(1)
@@ -273,24 +279,95 @@ def simulate_dynamic_rir(
     if src_traj.shape[0] != mic_traj.shape[0]:
         raise ValueError("src_traj and mic_traj must have the same time length")
 
-    t_steps = src_traj.shape[0]
-    rirs = []
-    for t_idx in range(t_steps):
-        rir = simulate_rir(
-            room=room,
-            sources=src_traj[t_idx],
-            mics=mic_traj[t_idx],
-            max_order=max_order,
-            nsample=nsample,
-            tmax=tmax,
-            directivity=directivity,
-            orientation=orientation,
-            config=config,
-            device=device,
-            dtype=dtype,
+    if not isinstance(room, Room):
+        raise TypeError("room must be a Room instance")
+    if nsample is None:
+        if tmax is None:
+            raise ValueError("nsample or tmax must be provided")
+        nsample = int(math.ceil(tmax * room.fs))
+    if nsample <= 0:
+        raise ValueError("nsample must be positive")
+    if max_order < 0:
+        raise ValueError("max_order must be non-negative")
+
+    room_size = as_tensor(room.size, device=device, dtype=dtype)
+    room_size = ensure_dim(room_size)
+    dim = room_size.numel()
+    if src_traj.shape[2] != dim:
+        raise ValueError("src_traj must match room dimension")
+    if mic_traj.shape[2] != dim:
+        raise ValueError("mic_traj must match room dimension")
+
+    src_ori = None
+    mic_ori = None
+    if orientation is not None:
+        if isinstance(orientation, (list, tuple)):
+            if len(orientation) != 2:
+                raise ValueError("orientation tuple must have length 2")
+            src_ori, mic_ori = orientation
+        else:
+            src_ori = orientation
+            mic_ori = orientation
+    if src_ori is not None:
+        src_ori = as_tensor(src_ori, device=device, dtype=dtype)
+    if mic_ori is not None:
+        mic_ori = as_tensor(mic_ori, device=device, dtype=dtype)
+
+    beta = _resolve_beta(room, room_size, device=device, dtype=dtype)
+    beta = _validate_beta(beta, dim)
+    n_vec = _image_source_indices(max_order, dim, device=device, nb_img=None)
+    refl = _reflection_coefficients(n_vec, beta)
+
+    src_pattern, mic_pattern = split_directivity(directivity)
+    mic_dir = None
+    if mic_pattern != "omni":
+        if mic_ori is None:
+            raise ValueError("mic orientation required for non-omni directivity")
+        mic_dir = orientation_to_unit(mic_ori, dim)
+
+    n_src = src_traj.shape[1]
+    n_mic = mic_traj.shape[1]
+    rirs = torch.zeros((src_traj.shape[0], n_src, n_mic, nsample), device=device, dtype=dtype)
+    fdl = cfg.frac_delay_length
+    fdl2 = (fdl - 1) // 2
+    img_chunk = cfg.image_chunk_size
+    if img_chunk <= 0:
+        img_chunk = n_vec.shape[0]
+
+    src_dirs = None
+    if src_pattern != "omni":
+        if src_ori is None:
+            raise ValueError("source orientation required for non-omni directivity")
+        src_dirs = orientation_to_unit(src_ori, dim)
+        if src_dirs.ndim == 1:
+            src_dirs = src_dirs.unsqueeze(0).repeat(n_src, 1)
+        if src_dirs.ndim != 2 or src_dirs.shape[0] != n_src:
+            raise ValueError("source orientation must match number of sources")
+
+    for start in range(0, n_vec.shape[0], img_chunk):
+        end = min(start + img_chunk, n_vec.shape[0])
+        n_vec_chunk = n_vec[start:end]
+        refl_chunk = refl[start:end]
+        sample_chunk, attenuation_chunk = _compute_image_contributions_time_batch(
+            src_traj,
+            mic_traj,
+            room_size,
+            n_vec_chunk,
+            refl_chunk,
+            room,
+            fdl2,
+            src_pattern=src_pattern,
+            mic_pattern=mic_pattern,
+            src_dirs=src_dirs,
+            mic_dir=mic_dir,
         )
-        rirs.append(rir)
-    return torch.stack(rirs, dim=0)
+        t_steps = src_traj.shape[0]
+        sample_flat = sample_chunk.reshape(t_steps * n_src, n_mic, -1)
+        attenuation_flat = attenuation_chunk.reshape(t_steps * n_src, n_mic, -1)
+        rir_flat = rirs.view(t_steps * n_src, n_mic, nsample)
+        _accumulate_rir_batch(rir_flat, sample_flat, attenuation_flat, cfg)
+
+    return rirs
 
 
 def _prepare_entities(
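
The rewrite above replaces the per-time-step loop over simulate_rir with a single batched path: image sources are processed in chunks, contributions for every time step are computed at once, and the result keeps one RIR per time step, source and microphone. A usage sketch under stated assumptions (import paths follow the module shown in this diff, the keyword names are the ones visible above, and the trajectory values are illustrative):

    import torch
    from torchrir.core import Room, simulate_dynamic_rir

    room = Room.shoebox(size=[6.0, 4.0, 3.0], fs=16000, beta=[0.9] * 6)
    T = 8
    # One source moving along x, one static microphone: shapes (T, n_src, 3) and (T, n_mic, 3).
    src_traj = torch.tensor([[[1.0 + 0.1 * t, 2.0, 1.5]] for t in range(T)])
    mic_traj = torch.tensor([[[2.0, 2.0, 1.5]]]).repeat(T, 1, 1)
    rirs = simulate_dynamic_rir(
        room=room, src_traj=src_traj, mic_traj=mic_traj, max_order=2, nsample=2048
    )
    # Per the allocation above, rirs has shape (T, n_src, n_mic, nsample) == (8, 1, 1, 2048).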
@@ -495,7 +572,11 @@ def _compute_image_contributions_batch(
     if mic_pattern != "omni":
         if mic_dir is None:
             raise ValueError("mic orientation required for non-omni directivity")
-        mic_dir = mic_dir[None, :, None, :] if mic_dir.ndim == 2 else mic_dir.view(1, 1, 1, -1)
+        mic_dir = (
+            mic_dir[None, :, None, :]
+            if mic_dir.ndim == 2
+            else mic_dir.view(1, 1, 1, -1)
+        )
         cos_theta = _cos_between(-vec, mic_dir)
         gain = gain * directivity_gain(mic_pattern, cos_theta)
 
@@ -503,6 +584,54 @@ def _compute_image_contributions_batch(
     return sample, attenuation
 
 
+def _compute_image_contributions_time_batch(
+    src_traj: Tensor,
+    mic_traj: Tensor,
+    room_size: Tensor,
+    n_vec: Tensor,
+    refl: Tensor,
+    room: Room,
+    fdl2: int,
+    *,
+    src_pattern: str,
+    mic_pattern: str,
+    src_dirs: Optional[Tensor],
+    mic_dir: Optional[Tensor],
+) -> Tuple[Tensor, Tensor]:
+    """Compute samples/attenuation for all time steps in batch."""
+    sign = torch.where((n_vec % 2) == 0, 1.0, -1.0).to(dtype=src_traj.dtype)
+    n = torch.floor_divide(n_vec + 1, 2).to(dtype=src_traj.dtype)
+    base = 2.0 * room_size * n
+    img = base[None, None, :, :] + sign[None, None, :, :] * src_traj[:, :, None, :]
+    vec = mic_traj[:, None, :, None, :] - img[:, :, None, :, :]
+    dist = torch.linalg.norm(vec, dim=-1)
+    dist = torch.clamp(dist, min=1e-6)
+    time = dist / room.c
+    time = time + (fdl2 / room.fs)
+    sample = time * room.fs
+
+    gain = refl.view(1, 1, 1, -1)
+    if src_pattern != "omni":
+        if src_dirs is None:
+            raise ValueError("source orientation required for non-omni directivity")
+        src_dirs_b = src_dirs[None, :, None, None, :]
+        cos_theta = _cos_between(vec, src_dirs_b)
+        gain = gain * directivity_gain(src_pattern, cos_theta)
+    if mic_pattern != "omni":
+        if mic_dir is None:
+            raise ValueError("mic orientation required for non-omni directivity")
+        mic_dir_b = (
+            mic_dir[None, None, :, None, :]
+            if mic_dir.ndim == 2
+            else mic_dir.view(1, 1, 1, 1, -1)
+        )
+        cos_theta = _cos_between(-vec, mic_dir_b)
+        gain = gain * directivity_gain(mic_pattern, cos_theta)
+
+    attenuation = gain / dist
+    return sample, attenuation
+
+
 def _select_orientation(orientation: Tensor, idx: int, count: int, dim: int) -> Tensor:
     """Pick the correct orientation vector for a given entity index."""
     if orientation.ndim == 0:
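
A numeric sketch (one axis, illustrative values, assuming c ≈ 343 m/s) of the image-source geometry the new batched helper evaluates: img = 2 * room_size * n + sign * src with sign = +1 for even image indices and -1 for odd ones, then the delay is the image-to-mic distance expressed in samples:

    L_x, src_x, mic_x = 6.0, 1.0, 2.0
    n_vec = 1                                 # first-order image across the far wall
    sign = 1.0 if n_vec % 2 == 0 else -1.0    # -1 for odd indices
    n = (n_vec + 1) // 2                      # 1
    img_x = 2.0 * L_x * n + sign * src_x      # 11.0, i.e. 2*L - src
    dist = abs(mic_x - img_x)                 # 9.0 m
    delay_samples = dist / 343.0 * 16000      # ≈ 419.8, before the fractional-delay offset fdl2/fs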
@@ -542,9 +671,9 @@ def _accumulate_rir(
     if use_lut:
         sinc_lut = _get_sinc_lut(fdl, lut_gran, device=rir.device, dtype=dtype)
 
-    mic_offsets = (torch.arange(n_mic, device=rir.device, dtype=torch.int64) * nsample).view(
-        n_mic, 1, 1
-    )
+    mic_offsets = (
+        torch.arange(n_mic, device=rir.device, dtype=torch.int64) * nsample
+    ).view(n_mic, 1, 1)
     rir_flat = rir.view(-1)
 
     chunk_size = cfg.accumulate_chunk_size
@@ -559,7 +688,9 @@
         x_off_frac = (1.0 - frac_m) * lut_gran
         lut_gran_off = torch.floor(x_off_frac).to(torch.int64)
         x_off = x_off_frac - lut_gran_off.to(dtype)
-        lut_pos = lut_gran_off[..., None] + (n[None, None, :].to(torch.int64) * lut_gran)
+        lut_pos = lut_gran_off[..., None] + (
+            n[None, None, :].to(torch.int64) * lut_gran
+        )
 
         s0 = torch.take(sinc_lut, lut_pos)
         s1 = torch.take(sinc_lut, lut_pos + 1)
@@ -618,9 +749,9 @@ def _accumulate_rir_batch_impl(
     if use_lut:
         sinc_lut = _get_sinc_lut(fdl, lut_gran, device=rir.device, dtype=sample.dtype)
 
-    sm_offsets = (torch.arange(n_sm, device=rir.device, dtype=torch.int64) * nsample).view(
-        n_sm, 1, 1
-    )
+    sm_offsets = (
+        torch.arange(n_sm, device=rir.device, dtype=torch.int64) * nsample
+    ).view(n_sm, 1, 1)
     rir_flat = rir.view(-1)
 
     n_img = idx0.shape[1]
@@ -634,7 +765,9 @@
         x_off_frac = (1.0 - frac_m) * lut_gran
         lut_gran_off = torch.floor(x_off_frac).to(torch.int64)
         x_off = x_off_frac - lut_gran_off.to(sample.dtype)
-        lut_pos = lut_gran_off[..., None] + (n[None, None, :].to(torch.int64) * lut_gran)
+        lut_pos = lut_gran_off[..., None] + (
+            n[None, None, :].to(torch.int64) * lut_gran
+        )
 
         s0 = torch.take(sinc_lut, lut_pos)
         s1 = torch.take(sinc_lut, lut_pos + 1)
@@ -660,12 +793,13 @@ _SINC_LUT_CACHE: dict[tuple[int, int, str, torch.dtype], Tensor] = {}
 _FDL_GRID_CACHE: dict[tuple[int, str, torch.dtype], Tensor] = {}
 _FDL_OFFSETS_CACHE: dict[tuple[int, str], Tensor] = {}
 _FDL_WINDOW_CACHE: dict[tuple[int, str, torch.dtype], Tensor] = {}
-_ACCUM_BATCH_COMPILED: dict[tuple[str, torch.dtype, int, int, bool, int], callable] = {}
+_AccumFn = Callable[[Tensor, Tensor, Tensor], None]
+_ACCUM_BATCH_COMPILED: dict[tuple[str, torch.dtype, int, int, bool, int], _AccumFn] = {}
 
 
 def _get_accumulate_fn(
     cfg: SimulationConfig, device: torch.device, dtype: torch.dtype
-) -> callable:
+) -> _AccumFn:
     """Return an accumulation function with config-bound constants."""
     use_lut = cfg.use_lut and device.type != "mps"
     fdl = cfg.frac_delay_length
@@ -721,7 +855,9 @@ def _get_fdl_window(fdl: int, *, device: torch.device, dtype: torch.dtype) -> Te
     return cached
 
 
-def _get_sinc_lut(fdl: int, lut_gran: int, *, device: torch.device, dtype: torch.dtype) -> Tensor:
+def _get_sinc_lut(
+    fdl: int, lut_gran: int, *, device: torch.device, dtype: torch.dtype
+) -> Tensor:
     """Create a sinc lookup table for fractional delays."""
     key = (fdl, lut_gran, str(device), dtype)
     cached = _SINC_LUT_CACHE.get(key)
@@ -765,7 +901,12 @@ def _apply_diffuse_tail(
 
     gen = torch.Generator(device=rir.device)
     gen.manual_seed(0 if seed is None else seed)
-    noise = torch.randn(rir[..., tdiff_idx:].shape, device=rir.device, dtype=rir.dtype, generator=gen)
-    scale = torch.linalg.norm(rir[..., tdiff_idx - 1 : tdiff_idx], dim=-1, keepdim=True) + 1e-8
+    noise = torch.randn(
+        rir[..., tdiff_idx:].shape, device=rir.device, dtype=rir.dtype, generator=gen
+    )
+    scale = (
+        torch.linalg.norm(rir[..., tdiff_idx - 1 : tdiff_idx], dim=-1, keepdim=True)
+        + 1e-8
+    )
     rir[..., tdiff_idx:] = noise * decay * scale
     return rir
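
The diffuse tail above draws its noise from a seeded torch.Generator, so the tail is reproducible for a given seed. An illustrative property check, not the package's exact call path:

    import torch

    gen_a = torch.Generator().manual_seed(0)
    gen_b = torch.Generator().manual_seed(0)
    assert torch.equal(torch.randn(4, generator=gen_a), torch.randn(4, generator=gen_b))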
torchrir/datasets/__init__.py CHANGED
@@ -1,14 +1,15 @@
 """Dataset helpers for torchrir."""
 
-from .base import BaseDataset, SentenceLike
-from .utils import choose_speakers, load_dataset_sources
+from .base import BaseDataset, DatasetItem, SentenceLike
+from .utils import choose_speakers, load_dataset_sources, load_wav_mono
+from .collate import CollateBatch, collate_dataset_items
 from .template import TemplateDataset, TemplateSentence
+from .librispeech import LibriSpeechDataset, LibriSpeechSentence
 
 from .cmu_arctic import (
     CmuArcticDataset,
     CmuArcticSentence,
     list_cmu_arctic_speakers,
-    load_wav_mono,
     save_wav,
 )
 
@@ -17,6 +18,9 @@ __all__ = [
     "CmuArcticDataset",
     "CmuArcticSentence",
     "choose_speakers",
+    "DatasetItem",
+    "CollateBatch",
+    "collate_dataset_items",
     "list_cmu_arctic_speakers",
     "SentenceLike",
     "load_dataset_sources",
@@ -24,4 +28,6 @@ __all__ = [
     "save_wav",
     "TemplateDataset",
     "TemplateSentence",
+    "LibriSpeechDataset",
+    "LibriSpeechSentence",
 ]
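
One compatibility note visible in this hunk: load_wav_mono is no longer imported from the cmu_arctic module but from torchrir.datasets.utils, while the package-level re-export is preserved. Existing call sites importing it from torchrir or torchrir.datasets keep working; only the deep import from the old location breaks. A sketch of the import paths per this diff:

    from torchrir.datasets import load_wav_mono          # still works in 0.2.0
    from torchrir.datasets.utils import load_wav_mono    # new home per this diff
    # from torchrir.datasets.cmu_arctic import load_wav_mono  # removed in 0.2.0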
torchrir/datasets/base.py CHANGED
@@ -2,9 +2,11 @@ from __future__ import annotations
 
 """Dataset protocol definitions."""
 
-from typing import Protocol, Sequence, Tuple
+from dataclasses import dataclass
+from typing import Optional, Protocol, Sequence, Tuple
 
 import torch
+from torch.utils.data import Dataset
 
 
 class SentenceLike(Protocol):
@@ -14,14 +16,52 @@ class SentenceLike(Protocol):
     text: str
 
 
-class BaseDataset(Protocol):
-    """Protocol for datasets used in torchrir examples and tools."""
+@dataclass(frozen=True)
+class DatasetItem:
+    """Dataset item for DataLoader consumption."""
+
+    audio: torch.Tensor
+    sample_rate: int
+    utterance_id: str
+    text: Optional[str] = None
+    speaker: Optional[str] = None
+
+
+class BaseDataset(Dataset[DatasetItem]):
+    """Base dataset class compatible with torch.utils.data.Dataset."""
+
+    _sentences_cache: Optional[list[SentenceLike]] = None
 
     def list_speakers(self) -> list[str]:
         """Return available speaker IDs."""
+        raise NotImplementedError
 
     def available_sentences(self) -> Sequence[SentenceLike]:
         """Return sentence entries that have audio available."""
+        raise NotImplementedError
 
     def load_wav(self, utterance_id: str) -> Tuple[torch.Tensor, int]:
         """Load audio for an utterance and return (audio, sample_rate)."""
+        raise NotImplementedError
+
+    def __len__(self) -> int:
+        return len(self._get_sentences())
+
+    def __getitem__(self, idx: int) -> DatasetItem:
+        sentences = self._get_sentences()
+        sentence = sentences[idx]
+        audio, sample_rate = self.load_wav(sentence.utterance_id)
+        speaker = getattr(self, "speaker", None)
+        text = getattr(sentence, "text", None)
+        return DatasetItem(
+            audio=audio,
+            sample_rate=sample_rate,
+            utterance_id=sentence.utterance_id,
+            text=text,
+            speaker=speaker,
+        )
+
+    def _get_sentences(self) -> list[SentenceLike]:
+        if self._sentences_cache is None:
+            self._sentences_cache = list(self.available_sentences())
+        return self._sentences_cache
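
With this change, a concrete dataset only has to implement the three methods that used to make up the protocol; length, indexing and DatasetItem construction come from BaseDataset, so any subclass is immediately usable as a map-style torch Dataset. A minimal sketch (the ToySentence/ToyDataset names are illustrative, not part of the package):

    from dataclasses import dataclass
    from typing import Sequence, Tuple

    import torch

    from torchrir.datasets import BaseDataset, DatasetItem, SentenceLike

    @dataclass
    class ToySentence:
        utterance_id: str
        text: str

    class ToyDataset(BaseDataset):
        speaker = "toy"  # picked up by __getitem__ via getattr(self, "speaker", None)

        def list_speakers(self) -> list[str]:
            return [self.speaker]

        def available_sentences(self) -> Sequence[SentenceLike]:
            return [ToySentence("utt0", "hello"), ToySentence("utt1", "world")]

        def load_wav(self, utterance_id: str) -> Tuple[torch.Tensor, int]:
            return torch.zeros(16000), 16000

    item: DatasetItem = ToyDataset()[0]
    print(len(ToyDataset()), item.utterance_id, item.sample_rate)  # 2 utt0 16000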
torchrir/datasets/cmu_arctic.py CHANGED
@@ -2,6 +2,7 @@ from __future__ import annotations
 
 """CMU ARCTIC dataset helpers."""
 
+import logging
 import tarfile
 import urllib.request
 from dataclasses import dataclass
@@ -9,7 +10,9 @@ from pathlib import Path
 from typing import List, Tuple
 
 import torch
-import logging
+
+from .base import BaseDataset
+from .utils import load_wav_mono
 
 BASE_URL = "http://www.festvox.org/cmu_arctic/packed"
 VALID_SPEAKERS = {
@@ -44,11 +47,12 @@ def list_cmu_arctic_speakers() -> List[str]:
 @dataclass
 class CmuArcticSentence:
     """Sentence metadata from CMU ARCTIC."""
+
     utterance_id: str
     text: str
 
 
-class CmuArcticDataset:
+class CmuArcticDataset(BaseDataset):
     """CMU ARCTIC dataset loader.
 
     Example:
@@ -56,7 +60,9 @@ class CmuArcticDataset:
         >>> audio, fs = dataset.load_wav("arctic_a0001")
     """
 
-    def __init__(self, root: Path, speaker: str = "bdl", download: bool = False) -> None:
+    def __init__(
+        self, root: Path, speaker: str = "bdl", download: bool = False
+    ) -> None:
         """Initialize a CMU ARCTIC dataset handle.
 
         Args:
@@ -188,23 +194,6 @@ def _parse_text_line(line: str) -> Tuple[str, str]:
     return utterance, text
 
 
-def load_wav_mono(path: Path) -> Tuple[torch.Tensor, int]:
-    """Load a wav file and return mono audio and sample rate.
-
-    Example:
-        >>> audio, fs = load_wav_mono(Path("datasets/cmu_arctic/ARCTIC/.../wav/arctic_a0001.wav"))
-    """
-    import soundfile as sf
-
-    audio, sample_rate = sf.read(str(path), dtype="float32", always_2d=True)
-    audio_t = torch.from_numpy(audio)
-    if audio_t.shape[1] > 1:
-        audio_t = audio_t.mean(dim=1)
-    else:
-        audio_t = audio_t.squeeze(1)
-    return audio_t, sample_rate
-
-
 def save_wav(path: Path, audio: torch.Tensor, sample_rate: int) -> None:
     """Save a mono or multi-channel wav to disk.
 
torchrir/datasets/collate.py ADDED
@@ -0,0 +1,90 @@
+from __future__ import annotations
+
+"""Collate helpers for DataLoader usage."""
+
+from dataclasses import dataclass
+from typing import Any, Iterable, List, Optional
+
+import torch
+from torch import Tensor
+
+from .base import DatasetItem
+
+
+@dataclass(frozen=True)
+class CollateBatch:
+    """Collated batch of dataset items.
+
+    Attributes:
+        audio: Padded audio tensor of shape (batch, max_len).
+        lengths: Original lengths for each item.
+        sample_rate: Sample rate shared across the batch.
+        utterance_ids: Utterance IDs per item.
+        texts: Optional text per item.
+        speakers: Optional speaker IDs per item.
+        metadata: Optional per-item metadata (pass-through).
+    """
+
+    audio: Tensor
+    lengths: Tensor
+    sample_rate: int
+    utterance_ids: list[str]
+    texts: list[Optional[str]]
+    speakers: list[Optional[str]]
+    metadata: Optional[list[Any]] = None
+
+
+def collate_dataset_items(
+    items: Iterable[DatasetItem],
+    *,
+    pad_value: float = 0.0,
+    keep_metadata: bool = False,
+) -> CollateBatch:
+    """Collate DatasetItem entries into a padded batch.
+
+    Args:
+        items: Iterable of DatasetItem.
+        pad_value: Value used for padding.
+        keep_metadata: Preserve item-level metadata field if present.
+
+    Returns:
+        CollateBatch with padded audio and metadata lists.
+    """
+    batch = list(items)
+    if not batch:
+        raise ValueError("collate_dataset_items received an empty batch")
+
+    sample_rate = batch[0].sample_rate
+    for item in batch[1:]:
+        if item.sample_rate != sample_rate:
+            raise ValueError("sample_rate must be consistent within a batch")
+
+    lengths = torch.tensor([item.audio.numel() for item in batch], dtype=torch.long)
+    max_len = int(lengths.max().item())
+    audio = torch.full(
+        (len(batch), max_len),
+        pad_value,
+        dtype=batch[0].audio.dtype,
+        device=batch[0].audio.device,
+    )
+
+    for idx, item in enumerate(batch):
+        audio[idx, : item.audio.numel()] = item.audio
+
+    utterance_ids = [item.utterance_id for item in batch]
+    texts = [item.text for item in batch]
+    speakers = [item.speaker for item in batch]
+
+    metadata: Optional[list[Any]] = None
+    if keep_metadata:
+        metadata = [getattr(item, "metadata", None) for item in batch]
+
+    return CollateBatch(
+        audio=audio,
+        lengths=lengths,
+        sample_rate=sample_rate,
+        utterance_ids=utterance_ids,
+        texts=texts,
+        speakers=speakers,
+        metadata=metadata,
+    )
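
A hedged end-to-end usage sketch of the new collate path (the dataset root is illustrative; the CmuArcticDataset constructor arguments and the pad_value keyword are taken from the hunks above):

    from functools import partial
    from pathlib import Path

    import torch
    from torch.utils.data import DataLoader

    from torchrir.datasets import CmuArcticDataset, collate_dataset_items

    dataset = CmuArcticDataset(root=Path("datasets/cmu_arctic"), speaker="bdl", download=True)
    loader = DataLoader(dataset, batch_size=4, collate_fn=partial(collate_dataset_items, pad_value=0.0))

    batch = next(iter(loader))
    # batch.audio is (4, max_len) padded with zeros; batch.lengths recovers a validity mask.
    mask = torch.arange(batch.audio.shape[1])[None, :] < batch.lengths[:, None]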