torchaudio-2.0.2-cp311-cp311-manylinux1_x86_64.whl → torchaudio-2.1.1-cp311-cp311-manylinux1_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of torchaudio might be problematic.
- torchaudio/__init__.py +22 -3
- torchaudio/_backend/__init__.py +55 -4
- torchaudio/_backend/backend.py +53 -0
- torchaudio/_backend/common.py +52 -0
- torchaudio/_backend/ffmpeg.py +373 -0
- torchaudio/_backend/soundfile.py +54 -0
- torchaudio/_backend/soundfile_backend.py +457 -0
- torchaudio/_backend/sox.py +91 -0
- torchaudio/_backend/utils.py +81 -323
- torchaudio/_extension/__init__.py +55 -36
- torchaudio/_extension/utils.py +109 -17
- torchaudio/_internal/__init__.py +4 -1
- torchaudio/_internal/module_utils.py +37 -6
- torchaudio/backend/__init__.py +7 -11
- torchaudio/backend/_no_backend.py +24 -0
- torchaudio/backend/_sox_io_backend.py +297 -0
- torchaudio/backend/common.py +12 -52
- torchaudio/backend/no_backend.py +11 -21
- torchaudio/backend/soundfile_backend.py +11 -448
- torchaudio/backend/sox_io_backend.py +11 -435
- torchaudio/backend/utils.py +9 -18
- torchaudio/datasets/__init__.py +2 -0
- torchaudio/datasets/cmuarctic.py +1 -1
- torchaudio/datasets/cmudict.py +61 -62
- torchaudio/datasets/dr_vctk.py +1 -1
- torchaudio/datasets/gtzan.py +1 -1
- torchaudio/datasets/librilight_limited.py +1 -1
- torchaudio/datasets/librispeech.py +1 -1
- torchaudio/datasets/librispeech_biasing.py +189 -0
- torchaudio/datasets/libritts.py +1 -1
- torchaudio/datasets/ljspeech.py +1 -1
- torchaudio/datasets/musdb_hq.py +1 -1
- torchaudio/datasets/quesst14.py +1 -1
- torchaudio/datasets/speechcommands.py +1 -1
- torchaudio/datasets/tedlium.py +1 -1
- torchaudio/datasets/vctk.py +1 -1
- torchaudio/datasets/voxceleb1.py +1 -1
- torchaudio/datasets/yesno.py +1 -1
- torchaudio/functional/__init__.py +6 -2
- torchaudio/functional/_alignment.py +128 -0
- torchaudio/functional/filtering.py +69 -92
- torchaudio/functional/functional.py +99 -148
- torchaudio/io/__init__.py +4 -1
- torchaudio/io/_effector.py +347 -0
- torchaudio/io/_stream_reader.py +158 -90
- torchaudio/io/_stream_writer.py +196 -10
- torchaudio/lib/_torchaudio.so +0 -0
- torchaudio/lib/_torchaudio_ffmpeg4.so +0 -0
- torchaudio/lib/_torchaudio_ffmpeg5.so +0 -0
- torchaudio/lib/_torchaudio_ffmpeg6.so +0 -0
- torchaudio/lib/_torchaudio_sox.so +0 -0
- torchaudio/lib/libctc_prefix_decoder.so +0 -0
- torchaudio/lib/libtorchaudio.so +0 -0
- torchaudio/lib/libtorchaudio_ffmpeg4.so +0 -0
- torchaudio/lib/libtorchaudio_ffmpeg5.so +0 -0
- torchaudio/lib/libtorchaudio_ffmpeg6.so +0 -0
- torchaudio/lib/libtorchaudio_sox.so +0 -0
- torchaudio/lib/pybind11_prefixctc.so +0 -0
- torchaudio/models/__init__.py +14 -0
- torchaudio/models/decoder/__init__.py +22 -7
- torchaudio/models/decoder/_ctc_decoder.py +123 -69
- torchaudio/models/decoder/_cuda_ctc_decoder.py +187 -0
- torchaudio/models/rnnt_decoder.py +10 -14
- torchaudio/models/squim/__init__.py +11 -0
- torchaudio/models/squim/objective.py +326 -0
- torchaudio/models/squim/subjective.py +150 -0
- torchaudio/models/wav2vec2/components.py +6 -10
- torchaudio/pipelines/__init__.py +9 -0
- torchaudio/pipelines/_squim_pipeline.py +176 -0
- torchaudio/pipelines/_wav2vec2/aligner.py +87 -0
- torchaudio/pipelines/_wav2vec2/impl.py +198 -68
- torchaudio/pipelines/_wav2vec2/utils.py +120 -0
- torchaudio/sox_effects/sox_effects.py +7 -30
- torchaudio/transforms/__init__.py +2 -0
- torchaudio/transforms/_transforms.py +99 -54
- torchaudio/utils/download.py +2 -2
- torchaudio/utils/ffmpeg_utils.py +20 -15
- torchaudio/utils/sox_utils.py +8 -9
- torchaudio/version.py +2 -2
- torchaudio-2.1.1.dist-info/METADATA +113 -0
- torchaudio-2.1.1.dist-info/RECORD +119 -0
- torchaudio/io/_compat.py +0 -241
- torchaudio/lib/_torchaudio_ffmpeg.so +0 -0
- torchaudio/lib/flashlight_lib_text_decoder.so +0 -0
- torchaudio/lib/flashlight_lib_text_dictionary.so +0 -0
- torchaudio/lib/libflashlight-text.so +0 -0
- torchaudio/lib/libtorchaudio_ffmpeg.so +0 -0
- torchaudio-2.0.2.dist-info/METADATA +0 -26
- torchaudio-2.0.2.dist-info/RECORD +0 -100
- {torchaudio-2.0.2.dist-info → torchaudio-2.1.1.dist-info}/LICENSE +0 -0
- {torchaudio-2.0.2.dist-info → torchaudio-2.1.1.dist-info}/WHEEL +0 -0
- {torchaudio-2.0.2.dist-info → torchaudio-2.1.1.dist-info}/top_level.txt +0 -0

torchaudio/models/decoder/_cuda_ctc_decoder.py
@@ -0,0 +1,187 @@
+from __future__ import annotations
+
+import math
+
+from typing import List, NamedTuple, Union
+
+import torch
+import torchaudio
+
+torchaudio._extension._load_lib("libctc_prefix_decoder")
+import torchaudio.lib.pybind11_prefixctc as cuctc
+
+
+__all__ = ["CUCTCHypothesis", "CUCTCDecoder", "cuda_ctc_decoder"]
+
+
+def _get_vocab_list(vocab_file):
+    vocab = []
+    with open(vocab_file, "r", encoding="utf-8") as f:
+        for line in f:
+            line = line.strip().split()
+            vocab.append(line[0])
+    return vocab
+
+
+class CUCTCHypothesis(NamedTuple):
+    r"""Represents hypothesis generated by CUCTC beam search decoder :class:`CUCTCDecoder`."""
+    tokens: List[int]
+    """Predicted sequence of token IDs. Shape `(L, )`, where `L` is the length of the output sequence"""
+
+    words: List[str]
+    """List of predicted tokens. Algin with modeling unit.
+    """
+
+    score: float
+    """Score corresponding to hypothesis"""
+
+
+_DEFAULT_BLANK_SKIP_THREASHOLD = 0.95
+
+
+class CUCTCDecoder:
+    """CUDA CTC beam search decoder.
+
+    .. devices:: CUDA
+
+    Note:
+        To build the decoder, please use the factory function :func:`cuda_ctc_decoder`.
+    """
+
+    def __init__(
+        self,
+        vocab_list: List[str],
+        blank_id: int = 0,
+        beam_size: int = 10,
+        nbest: int = 1,
+        blank_skip_threshold: float = _DEFAULT_BLANK_SKIP_THREASHOLD,
+        cuda_stream: torch.cuda.streams.Stream = None,
+    ):
+        """
+        Args:
+            blank_id (int): token id corresopnding to blank, only support 0 for now. (Default: 0)
+            vocab_list (List[str]): list of vocabulary tokens
+            beam_size (int, optional): max number of hypos to hold after each decode step (Default: 10)
+            nbest (int): number of best decodings to return
+            blank_skip_threshold (float):
+                skip frames if log_prob(blank) > log(blank_skip_threshold), to speed up decoding.
+                (Default: 0.95).
+            cuda_stream (torch.cuda.streams.Stream): using assigned cuda stream (Default: using default stream)
+
+        """
+        if cuda_stream:
+            if not isinstance(cuda_stream, torch.cuda.streams.Stream):
+                raise AssertionError("cuda_stream must be torch.cuda.streams.Stream")
+        cuda_stream_ = cuda_stream.cuda_stream if cuda_stream else torch.cuda.current_stream().cuda_stream
+        self.internal_data = cuctc.prefixCTC_alloc(cuda_stream_)
+        self.memory = torch.empty(0, dtype=torch.int8, device=torch.device("cuda"))
+        if blank_id != 0:
+            raise AssertionError("blank_id must be 0")
+        self.blank_id = blank_id
+        self.vocab_list = vocab_list
+        self.space_id = 0
+        self.nbest = nbest
+        if not (blank_skip_threshold >= 0 and blank_skip_threshold <= 1):
+            raise AssertionError("blank_skip_threshold must be between 0 and 1")
+        self.blank_skip_threshold = math.log(blank_skip_threshold)
+        self.beam_size = min(beam_size, len(vocab_list))  # beam size must be smaller than vocab size
+
+    def __del__(self):
+        if cuctc is not None:
+            cuctc.prefixCTC_free(self.internal_data)
+
+    def __call__(self, log_prob: torch.Tensor, encoder_out_lens: torch.Tensor):
+        """
+        Args:
+            log_prob (torch.FloatTensor): GPU tensor of shape `(batch, frame, num_tokens)` storing sequences of
+                probability distribution over labels; log_softmax(output of acoustic model).
+            lengths (dtype torch.int32): GPU tensor of shape `(batch, )` storing the valid length of
+                in time axis of the output Tensor in each batch.
+
+        Returns:
+            List[List[CUCTCHypothesis]]:
+                List of sorted best hypotheses for each audio sequence in the batch.
+        """
+        if not encoder_out_lens.dtype == torch.int32:
+            raise AssertionError("encoder_out_lens must be torch.int32")
+        if not log_prob.dtype == torch.float32:
+            raise AssertionError("log_prob must be torch.float32")
+        if not (log_prob.is_cuda and encoder_out_lens.is_cuda):
+            raise AssertionError("inputs must be cuda tensors")
+        if not (log_prob.is_contiguous() and encoder_out_lens.is_contiguous()):
+            raise AssertionError("input tensors must be contiguous")
+        required_size, score_hyps = cuctc.ctc_beam_search_decoder_batch_gpu_v2(
+            self.internal_data,
+            self.memory.data_ptr(),
+            self.memory.size(0),
+            log_prob.data_ptr(),
+            encoder_out_lens.data_ptr(),
+            log_prob.size(),
+            log_prob.stride(),
+            self.beam_size,
+            self.blank_id,
+            self.space_id,
+            self.blank_skip_threshold,
+        )
+        if required_size > 0:
+            self.memory = torch.empty(required_size, dtype=torch.int8, device=log_prob.device).contiguous()
+            _, score_hyps = cuctc.ctc_beam_search_decoder_batch_gpu_v2(
+                self.internal_data,
+                self.memory.data_ptr(),
+                self.memory.size(0),
+                log_prob.data_ptr(),
+                encoder_out_lens.data_ptr(),
+                log_prob.size(),
+                log_prob.stride(),
+                self.beam_size,
+                self.blank_id,
+                self.space_id,
+                self.blank_skip_threshold,
+            )
+        batch_size = len(score_hyps)
+        hypos = []
+        for i in range(batch_size):
+            hypos.append(
+                [
+                    CUCTCHypothesis(
+                        tokens=score_hyps[i][j][1],
+                        words=[self.vocab_list[word_id] for word_id in score_hyps[i][j][1]],
+                        score=score_hyps[i][j][0],
+                    )
+                    for j in range(self.nbest)
+                ]
+            )
+        return hypos
+
+
+def cuda_ctc_decoder(
+    tokens: Union[str, List[str]],
+    nbest: int = 1,
+    beam_size: int = 10,
+    blank_skip_threshold: float = _DEFAULT_BLANK_SKIP_THREASHOLD,
+) -> CUCTCDecoder:
+    """Builds an instance of :class:`CUCTCDecoder`.
+
+    Args:
+        tokens (str or List[str]): File or list containing valid tokens.
+            If using a file, the expected format is for tokens mapping to the same index to be on the same line
+        beam_size (int, optional): The maximum number of hypos to hold after each decode step (Default: 10)
+        nbest (int): The number of best decodings to return
+        blank_id (int): The token ID corresopnding to the blank symbol.
+        blank_skip_threshold (float): skip frames if log_prob(blank) > log(blank_skip_threshold), to speed up decoding
+            (Default: 0.95).
+
+    Returns:
+        CUCTCDecoder: decoder
+
+    Example
+        >>> decoder = cuda_ctc_decoder(
+        >>>     vocab_file="tokens.txt",
+        >>>     blank_skip_threshold=0.95,
+        >>> )
+        >>> results = decoder(log_probs, encoder_out_lens)  # List of shape (B, nbest) of Hypotheses
+    """
+    if type(tokens) == str:
+        tokens = _get_vocab_list(tokens)
+
+    return CUCTCDecoder(vocab_list=tokens, beam_size=beam_size, nbest=nbest, blank_skip_threshold=blank_skip_threshold)
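
Illustrative usage (not part of the diff): the cuda_ctc_decoder API added in the new file above could be exercised roughly as follows. The vocabulary file name, sizes, and random emissions are placeholders, and a CUDA-enabled build plus a CUDA device are assumed.

import torch
from torchaudio.models.decoder import cuda_ctc_decoder

# "tokens.txt" is a placeholder vocabulary file: one token per line, blank token on line 0.
# Its number of entries must match num_tokens below.
decoder = cuda_ctc_decoder("tokens.txt", nbest=3, beam_size=10, blank_skip_threshold=0.95)

num_tokens = 500
emissions = torch.randn(1, 200, num_tokens, device="cuda")         # stand-in acoustic-model output
log_probs = torch.log_softmax(emissions, dim=-1).contiguous()      # float32, contiguous, on CUDA
lengths = torch.full((1,), 200, dtype=torch.int32, device="cuda")  # valid frames per batch element

hypos = decoder(log_probs, lengths)  # List[List[CUCTCHypothesis]], one inner list per batch element
best = hypos[0][0]
print(best.score, best.tokens, best.words)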

torchaudio/models/rnnt_decoder.py
@@ -109,13 +109,9 @@ class RNNTBeamSearch(torch.nn.Module):
 
         self.step_max_tokens = step_max_tokens
 
-    def _init_b_hypos(self, hypo: Optional[Hypothesis], device: torch.device) -> List[Hypothesis]:
-        if hypo is not None:
-            token = _get_hypo_tokens(hypo)[-1]
-            state = _get_hypo_state(hypo)
-        else:
-            token = self.blank
-            state = None
+    def _init_b_hypos(self, device: torch.device) -> List[Hypothesis]:
+        token = self.blank
+        state = None
 
         one_tensor = torch.tensor([1], device=device)
         pred_out, _, pred_state = self.model.predict(torch.tensor([[token]], device=device), one_tensor, state)
@@ -230,14 +226,14 @@ class RNNTBeamSearch(torch.nn.Module):
     def _search(
         self,
         enc_out: torch.Tensor,
-        hypo: Optional[Hypothesis],
+        hypo: Optional[List[Hypothesis]],
         beam_width: int,
     ) -> List[Hypothesis]:
         n_time_steps = enc_out.shape[1]
         device = enc_out.device
 
         a_hypos: List[Hypothesis] = []
-        b_hypos = self._init_b_hypos(hypo, device)
+        b_hypos = self._init_b_hypos(device) if hypo is None else hypo
         for t in range(n_time_steps):
             a_hypos = b_hypos
             b_hypos = torch.jit.annotate(List[Hypothesis], [])
@@ -263,7 +259,7 @@ class RNNTBeamSearch(torch.nn.Module):
                 if a_hypos:
                     symbols_current_t += 1
 
-            _, sorted_idx = torch.tensor([self.hypo_sort_key(hypo) for hypo in b_hypos]).topk(beam_width)
+            _, sorted_idx = torch.tensor([self.hypo_sort_key(hyp) for hyp in b_hypos]).topk(beam_width)
             b_hypos = [b_hypos[idx] for idx in sorted_idx]
 
         return b_hypos
@@ -290,8 +286,8 @@ class RNNTBeamSearch(torch.nn.Module):
 
         if length.shape != () and length.shape != (1,):
             raise ValueError("length must be of shape () or (1,)")
-        if input.dim() == 0:
-            input = input.unsqueeze(0)
+        if length.dim() == 0:
+            length = length.unsqueeze(0)
 
         enc_out, _ = self.model.transcribe(input, length)
         return self._search(enc_out, None, beam_width)
@@ -303,7 +299,7 @@ class RNNTBeamSearch(torch.nn.Module):
         length: torch.Tensor,
         beam_width: int,
         state: Optional[List[List[torch.Tensor]]] = None,
-        hypothesis: Optional[Hypothesis] = None,
+        hypothesis: Optional[List[Hypothesis]] = None,
     ) -> Tuple[List[Hypothesis], List[List[torch.Tensor]]]:
         r"""Performs beam search for the given input sequence in streaming mode.
 
@@ -318,7 +314,7 @@ class RNNTBeamSearch(torch.nn.Module):
             state (List[List[torch.Tensor]] or None, optional): list of lists of tensors
                 representing transcription network internal state generated in preceding
                 invocation. (Default: ``None``)
-            hypothesis (Hypothesis or None): hypothesis from preceding invocation to seed
+            hypothesis (List[Hypothesis] or None): hypotheses from preceding invocation to seed
                 search with. (Default: ``None``)
 
         Returns:
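
Taken together, the hunks above change RNNTBeamSearch so that the streaming state carried between calls to infer is a List[Hypothesis] rather than a single Hypothesis (and _init_b_hypos no longer takes a seed hypothesis). A rough streaming sketch under the new signature follows; the Emformer RNN-T pipeline, chunk sizes, and random audio are illustrative stand-ins, not part of this diff.

import torch
import torchaudio

bundle = torchaudio.pipelines.EMFORMER_RNNT_BASE_LIBRISPEECH
feature_extractor = bundle.get_streaming_feature_extractor()
decoder = bundle.get_decoder()            # RNNTBeamSearch
token_processor = bundle.get_token_processor()

chunk_size = bundle.segment_length * bundle.hop_length            # samples per streaming chunk
waveform_chunks = torch.randn(10 * chunk_size).split(chunk_size)  # stand-in for streamed 16 kHz audio

state, hypotheses = None, None  # `hypotheses` is now a List[Hypothesis], not a single Hypothesis
for chunk in waveform_chunks:
    features, length = feature_extractor(chunk)
    hypotheses, state = decoder.infer(features, length, beam_width=10, state=state, hypothesis=hypotheses)
print(token_processor(hypotheses[0][0]))  # element 0 of a Hypothesis tuple holds the token ids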

torchaudio/models/squim/__init__.py
@@ -0,0 +1,11 @@
+from .objective import squim_objective_base, squim_objective_model, SquimObjective
+from .subjective import squim_subjective_base, squim_subjective_model, SquimSubjective
+
+__all__ = [
+    "squim_objective_base",
+    "squim_objective_model",
+    "squim_subjective_base",
+    "squim_subjective_model",
+    "SquimObjective",
+    "SquimSubjective",
+]

torchaudio/models/squim/objective.py
@@ -0,0 +1,326 @@
+import math
+from typing import List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+def transform_wb_pesq_range(x: float) -> float:
+    """The metric defined by ITU-T P.862 is often called 'PESQ score', which is defined
+    for narrow-band signals and has a value range of [-0.5, 4.5] exactly. Here, we use the metric
+    defined by ITU-T P.862.2, commonly known as 'wide-band PESQ' and will be referred to as "PESQ score".
+
+    Args:
+        x (float): Narrow-band PESQ score.
+
+    Returns:
+        (float): Wide-band PESQ score.
+    """
+    return 0.999 + (4.999 - 0.999) / (1 + math.exp(-1.3669 * x + 3.8224))
+
+
+PESQRange: Tuple[float, float] = (
+    1.0,  # P.862.2 uses a different input filter than P.862, and the lower bound of
+    # the raw score is not -0.5 anymore. It's hard to figure out the true lower bound.
+    # We are using 1.0 as a reasonable approximation.
+    transform_wb_pesq_range(4.5),
+)
+
+
+class RangeSigmoid(nn.Module):
+    def __init__(self, val_range: Tuple[float, float] = (0.0, 1.0)) -> None:
+        super(RangeSigmoid, self).__init__()
+        assert isinstance(val_range, tuple) and len(val_range) == 2
+        self.val_range: Tuple[float, float] = val_range
+        self.sigmoid: nn.modules.Module = nn.Sigmoid()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = self.sigmoid(x) * (self.val_range[1] - self.val_range[0]) + self.val_range[0]
+        return out
+
+
+class Encoder(nn.Module):
+    """Encoder module that transform 1D waveform to 2D representations.
+
+    Args:
+        feat_dim (int, optional): The feature dimension after Encoder module. (Default: 512)
+        win_len (int, optional): kernel size in the Conv1D layer. (Default: 32)
+    """
+
+    def __init__(self, feat_dim: int = 512, win_len: int = 32) -> None:
+        super(Encoder, self).__init__()
+
+        self.conv1d = nn.Conv1d(1, feat_dim, win_len, stride=win_len // 2, bias=False)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Apply waveforms to convolutional layer and ReLU layer.
+
+        Args:
+            x (torch.Tensor): Input waveforms. Tensor with dimensions `(batch, time)`.
+
+        Returns:
+            (torch,Tensor): Feature Tensor with dimensions `(batch, channel, frame)`.
+        """
+        out = x.unsqueeze(dim=1)
+        out = F.relu(self.conv1d(out))
+        return out
+
+
+class SingleRNN(nn.Module):
+    def __init__(self, rnn_type: str, input_size: int, hidden_size: int, dropout: float = 0.0) -> None:
+        super(SingleRNN, self).__init__()
+
+        self.rnn_type = rnn_type
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+
+        self.rnn: nn.modules.Module = getattr(nn, rnn_type)(
+            input_size,
+            hidden_size,
+            1,
+            dropout=dropout,
+            batch_first=True,
+            bidirectional=True,
+        )
+
+        self.proj = nn.Linear(hidden_size * 2, input_size)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # input shape: batch, seq, dim
+        out, _ = self.rnn(x)
+        out = self.proj(out)
+        return out
+
+
+class DPRNN(nn.Module):
+    """*Dual-path recurrent neural networks (DPRNN)* :cite:`luo2020dual`.
+
+    Args:
+        feat_dim (int, optional): The feature dimension after Encoder module. (Default: 64)
+        hidden_dim (int, optional): Hidden dimension in the RNN layer of DPRNN. (Default: 128)
+        num_blocks (int, optional): Number of DPRNN layers. (Default: 6)
+        rnn_type (str, optional): Type of RNN in DPRNN. Valid options are ["RNN", "LSTM", "GRU"]. (Default: "LSTM")
+        d_model (int, optional): The number of expected features in the input. (Default: 256)
+        chunk_size (int, optional): Chunk size of input for DPRNN. (Default: 100)
+        chunk_stride (int, optional): Stride of chunk input for DPRNN. (Default: 50)
+    """
+
+    def __init__(
+        self,
+        feat_dim: int = 64,
+        hidden_dim: int = 128,
+        num_blocks: int = 6,
+        rnn_type: str = "LSTM",
+        d_model: int = 256,
+        chunk_size: int = 100,
+        chunk_stride: int = 50,
+    ) -> None:
+        super(DPRNN, self).__init__()
+
+        self.num_blocks = num_blocks
+
+        self.row_rnn = nn.ModuleList([])
+        self.col_rnn = nn.ModuleList([])
+        self.row_norm = nn.ModuleList([])
+        self.col_norm = nn.ModuleList([])
+        for _ in range(num_blocks):
+            self.row_rnn.append(SingleRNN(rnn_type, feat_dim, hidden_dim))
+            self.col_rnn.append(SingleRNN(rnn_type, feat_dim, hidden_dim))
+            self.row_norm.append(nn.GroupNorm(1, feat_dim, eps=1e-8))
+            self.col_norm.append(nn.GroupNorm(1, feat_dim, eps=1e-8))
+        self.conv = nn.Sequential(
+            nn.Conv2d(feat_dim, d_model, 1),
+            nn.PReLU(),
+        )
+        self.chunk_size = chunk_size
+        self.chunk_stride = chunk_stride
+
+    def pad_chunk(self, x: torch.Tensor) -> Tuple[torch.Tensor, int]:
+        # input shape: (B, N, T)
+        seq_len = x.shape[-1]
+
+        rest = self.chunk_size - (self.chunk_stride + seq_len % self.chunk_size) % self.chunk_size
+        out = F.pad(x, [self.chunk_stride, rest + self.chunk_stride])
+
+        return out, rest
+
+    def chunking(self, x: torch.Tensor) -> Tuple[torch.Tensor, int]:
+        out, rest = self.pad_chunk(x)
+        batch_size, feat_dim, seq_len = out.shape
+
+        segments1 = out[:, :, : -self.chunk_stride].contiguous().view(batch_size, feat_dim, -1, self.chunk_size)
+        segments2 = out[:, :, self.chunk_stride :].contiguous().view(batch_size, feat_dim, -1, self.chunk_size)
+        out = torch.cat([segments1, segments2], dim=3)
+        out = out.view(batch_size, feat_dim, -1, self.chunk_size).transpose(2, 3).contiguous()
+
+        return out, rest
+
+    def merging(self, x: torch.Tensor, rest: int) -> torch.Tensor:
+        batch_size, dim, _, _ = x.shape
+        out = x.transpose(2, 3).contiguous().view(batch_size, dim, -1, self.chunk_size * 2)
+        out1 = out[:, :, :, : self.chunk_size].contiguous().view(batch_size, dim, -1)[:, :, self.chunk_stride :]
+        out2 = out[:, :, :, self.chunk_size :].contiguous().view(batch_size, dim, -1)[:, :, : -self.chunk_stride]
+        out = out1 + out2
+        if rest > 0:
+            out = out[:, :, :-rest]
+        out = out.contiguous()
+        return out
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x, rest = self.chunking(x)
+        batch_size, _, dim1, dim2 = x.shape
+        out = x
+        for row_rnn, row_norm, col_rnn, col_norm in zip(self.row_rnn, self.row_norm, self.col_rnn, self.col_norm):
+            row_in = out.permute(0, 3, 2, 1).contiguous().view(batch_size * dim2, dim1, -1).contiguous()
+            row_out = row_rnn(row_in)
+            row_out = row_out.view(batch_size, dim2, dim1, -1).permute(0, 3, 2, 1).contiguous()
+            row_out = row_norm(row_out)
+            out = out + row_out
+
+            col_in = out.permute(0, 2, 3, 1).contiguous().view(batch_size * dim1, dim2, -1).contiguous()
+            col_out = col_rnn(col_in)
+            col_out = col_out.view(batch_size, dim1, dim2, -1).permute(0, 3, 1, 2).contiguous()
+            col_out = col_norm(col_out)
+            out = out + col_out
+        out = self.conv(out)
+        out = self.merging(out, rest)
+        out = out.transpose(1, 2).contiguous()
+        return out
+
+
+class AutoPool(nn.Module):
+    def __init__(self, pool_dim: int = 1) -> None:
+        super(AutoPool, self).__init__()
+        self.pool_dim: int = pool_dim
+        self.softmax: nn.modules.Module = nn.Softmax(dim=pool_dim)
+        self.register_parameter("alpha", nn.Parameter(torch.ones(1)))
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        weight = self.softmax(torch.mul(x, self.alpha))
+        out = torch.sum(torch.mul(x, weight), dim=self.pool_dim)
+        return out
+
+
+class SquimObjective(nn.Module):
+    """Speech Quality and Intelligibility Measures (SQUIM) model that predicts **objective** metric scores
+    for speech enhancement (e.g., STOI, PESQ, and SI-SDR).
+
+    Args:
+        encoder (torch.nn.Module): Encoder module to transform 1D waveform to 2D feature representation.
+        dprnn (torch.nn.Module): DPRNN module to model sequential feature.
+        branches (torch.nn.ModuleList): Transformer branches in which each branch estimate one objective metirc score.
+    """
+
+    def __init__(
+        self,
+        encoder: nn.Module,
+        dprnn: nn.Module,
+        branches: nn.ModuleList,
+    ):
+        super(SquimObjective, self).__init__()
+        self.encoder = encoder
+        self.dprnn = dprnn
+        self.branches = branches
+
+    def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
+        """
+        Args:
+            x (torch.Tensor): Input waveforms. Tensor with dimensions `(batch, time)`.
+
+        Returns:
+            List(torch.Tensor): List of score Tenosrs. Each Tensor is with dimension `(batch,)`.
+        """
+        if x.ndim != 2:
+            raise ValueError(f"The input must be a 2D Tensor. Found dimension {x.ndim}.")
+        x = x / (torch.mean(x**2, dim=1, keepdim=True) ** 0.5 * 20)
+        out = self.encoder(x)
+        out = self.dprnn(out)
+        scores = []
+        for branch in self.branches:
+            scores.append(branch(out).squeeze(dim=1))
+        return scores
+
+
+def _create_branch(d_model: int, nhead: int, metric: str) -> nn.modules.Module:
+    """Create branch module after DPRNN model for predicting metric score.
+
+    Args:
+        d_model (int): The number of expected features in the input.
+        nhead (int): Number of heads in the multi-head attention model.
+        metric (str): The metric name to predict.
+
+    Returns:
+        (nn.Module): Returned module to predict corresponding metric score.
+    """
+    layer1 = nn.TransformerEncoderLayer(d_model, nhead, d_model * 4, dropout=0.0, batch_first=True)
+    layer2 = AutoPool()
+    if metric == "stoi":
+        layer3 = nn.Sequential(
+            nn.Linear(d_model, d_model),
+            nn.PReLU(),
+            nn.Linear(d_model, 1),
+            RangeSigmoid(),
+        )
+    elif metric == "pesq":
+        layer3 = nn.Sequential(
+            nn.Linear(d_model, d_model),
+            nn.PReLU(),
+            nn.Linear(d_model, 1),
+            RangeSigmoid(val_range=PESQRange),
+        )
+    else:
+        layer3: nn.modules.Module = nn.Sequential(nn.Linear(d_model, d_model), nn.PReLU(), nn.Linear(d_model, 1))
+    return nn.Sequential(layer1, layer2, layer3)
+
+
+def squim_objective_model(
+    feat_dim: int,
+    win_len: int,
+    d_model: int,
+    nhead: int,
+    hidden_dim: int,
+    num_blocks: int,
+    rnn_type: str,
+    chunk_size: int,
+    chunk_stride: Optional[int] = None,
+) -> SquimObjective:
+    """Build a custome :class:`torchaudio.prototype.models.SquimObjective` model.
+
+    Args:
+        feat_dim (int, optional): The feature dimension after Encoder module.
+        win_len (int): Kernel size in the Encoder module.
+        d_model (int): The number of expected features in the input.
+        nhead (int): Number of heads in the multi-head attention model.
+        hidden_dim (int): Hidden dimension in the RNN layer of DPRNN.
+        num_blocks (int): Number of DPRNN layers.
+        rnn_type (str): Type of RNN in DPRNN. Valid options are ["RNN", "LSTM", "GRU"].
+        chunk_size (int): Chunk size of input for DPRNN.
+        chunk_stride (int or None, optional): Stride of chunk input for DPRNN.
+    """
+    if chunk_stride is None:
+        chunk_stride = chunk_size // 2
+    encoder = Encoder(feat_dim, win_len)
+    dprnn = DPRNN(feat_dim, hidden_dim, num_blocks, rnn_type, d_model, chunk_size, chunk_stride)
+    branches = nn.ModuleList(
+        [
+            _create_branch(d_model, nhead, "stoi"),
+            _create_branch(d_model, nhead, "pesq"),
+            _create_branch(d_model, nhead, "sisdr"),
+        ]
+    )
+    return SquimObjective(encoder, dprnn, branches)
+
+
+def squim_objective_base() -> SquimObjective:
+    """Build :class:`torchaudio.prototype.models.SquimObjective` model with default arguments."""
+    return squim_objective_model(
+        feat_dim=256,
+        win_len=64,
+        d_model=256,
+        nhead=4,
+        hidden_dim=256,
+        num_blocks=2,
+        rnn_type="LSTM",
+        chunk_size=71,
+    )
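
Illustrative usage (not part of the diff): the squim_objective_base factory added at the end of this file can be exercised roughly as follows, assuming it is re-exported from torchaudio.models (the torchaudio/models/__init__.py entry in the file list above suggests this; otherwise import from torchaudio.models.squim). The random waveform is a stand-in for 16 kHz speech, and the randomly initialized weights give meaningless scores; pretrained weights ship separately via the pipeline added in torchaudio/pipelines/_squim_pipeline.py.

import torch
from torchaudio.models import squim_objective_base

model = squim_objective_base().eval()
waveform = torch.randn(1, 16000)          # (batch, time); one second of 16 kHz audio
with torch.inference_mode():
    stoi, pesq, si_sdr = model(waveform)  # branches are built in the order "stoi", "pesq", "sisdr"
print(stoi.item(), pesq.item(), si_sdr.item())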