PyPI - minicpmo-utils - Versions diffs - 0.1.0__py3-none-any.whl - Mend

minicpmo-utils 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (148) hide show

cosyvoice/__init__.py +17 -0
cosyvoice/bin/average_model.py +93 -0
cosyvoice/bin/export_jit.py +103 -0
cosyvoice/bin/export_onnx.py +120 -0
cosyvoice/bin/inference_deprecated.py +126 -0
cosyvoice/bin/train.py +195 -0
cosyvoice/cli/__init__.py +0 -0
cosyvoice/cli/cosyvoice.py +209 -0
cosyvoice/cli/frontend.py +238 -0
cosyvoice/cli/model.py +386 -0
cosyvoice/dataset/__init__.py +0 -0
cosyvoice/dataset/dataset.py +151 -0
cosyvoice/dataset/processor.py +434 -0
cosyvoice/flow/decoder.py +494 -0
cosyvoice/flow/flow.py +281 -0
cosyvoice/flow/flow_matching.py +227 -0
cosyvoice/flow/length_regulator.py +70 -0
cosyvoice/hifigan/discriminator.py +230 -0
cosyvoice/hifigan/f0_predictor.py +58 -0
cosyvoice/hifigan/generator.py +582 -0
cosyvoice/hifigan/hifigan.py +67 -0
cosyvoice/llm/llm.py +610 -0
cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +58836 -0
cosyvoice/tokenizer/tokenizer.py +279 -0
cosyvoice/transformer/__init__.py +0 -0
cosyvoice/transformer/activation.py +84 -0
cosyvoice/transformer/attention.py +330 -0
cosyvoice/transformer/convolution.py +145 -0
cosyvoice/transformer/decoder.py +396 -0
cosyvoice/transformer/decoder_layer.py +132 -0
cosyvoice/transformer/embedding.py +302 -0
cosyvoice/transformer/encoder.py +474 -0
cosyvoice/transformer/encoder_layer.py +236 -0
cosyvoice/transformer/label_smoothing_loss.py +96 -0
cosyvoice/transformer/positionwise_feed_forward.py +115 -0
cosyvoice/transformer/subsampling.py +383 -0
cosyvoice/transformer/upsample_encoder.py +320 -0
cosyvoice/utils/__init__.py +0 -0
cosyvoice/utils/class_utils.py +83 -0
cosyvoice/utils/common.py +186 -0
cosyvoice/utils/executor.py +176 -0
cosyvoice/utils/file_utils.py +129 -0
cosyvoice/utils/frontend_utils.py +136 -0
cosyvoice/utils/losses.py +57 -0
cosyvoice/utils/mask.py +265 -0
cosyvoice/utils/scheduler.py +738 -0
cosyvoice/utils/train_utils.py +367 -0
cosyvoice/vllm/cosyvoice2.py +103 -0
matcha/__init__.py +0 -0
matcha/app.py +357 -0
matcha/cli.py +418 -0
matcha/hifigan/__init__.py +0 -0
matcha/hifigan/config.py +28 -0
matcha/hifigan/denoiser.py +64 -0
matcha/hifigan/env.py +17 -0
matcha/hifigan/meldataset.py +217 -0
matcha/hifigan/models.py +368 -0
matcha/hifigan/xutils.py +60 -0
matcha/models/__init__.py +0 -0
matcha/models/baselightningmodule.py +209 -0
matcha/models/components/__init__.py +0 -0
matcha/models/components/decoder.py +443 -0
matcha/models/components/flow_matching.py +132 -0
matcha/models/components/text_encoder.py +410 -0
matcha/models/components/transformer.py +316 -0
matcha/models/matcha_tts.py +239 -0
matcha/onnx/__init__.py +0 -0
matcha/onnx/export.py +181 -0
matcha/onnx/infer.py +168 -0
matcha/text/__init__.py +53 -0
matcha/text/cleaners.py +116 -0
matcha/text/numbers.py +71 -0
matcha/text/symbols.py +17 -0
matcha/train.py +122 -0
matcha/utils/__init__.py +5 -0
matcha/utils/audio.py +82 -0
matcha/utils/generate_data_statistics.py +111 -0
matcha/utils/instantiators.py +56 -0
matcha/utils/logging_utils.py +53 -0
matcha/utils/model.py +90 -0
matcha/utils/monotonic_align/__init__.py +22 -0
matcha/utils/monotonic_align/setup.py +7 -0
matcha/utils/pylogger.py +21 -0
matcha/utils/rich_utils.py +101 -0
matcha/utils/utils.py +219 -0
minicpmo/__init__.py +24 -0
minicpmo/utils.py +636 -0
minicpmo/version.py +2 -0
minicpmo_utils-0.1.0.dist-info/METADATA +72 -0
minicpmo_utils-0.1.0.dist-info/RECORD +148 -0
minicpmo_utils-0.1.0.dist-info/WHEEL +5 -0
minicpmo_utils-0.1.0.dist-info/top_level.txt +5 -0
s3tokenizer/__init__.py +153 -0
s3tokenizer/assets/BAC009S0764W0121.wav +0 -0
s3tokenizer/assets/BAC009S0764W0122.wav +0 -0
s3tokenizer/assets/mel_filters.npz +0 -0
s3tokenizer/cli.py +183 -0
s3tokenizer/model.py +546 -0
s3tokenizer/model_v2.py +605 -0
s3tokenizer/utils.py +390 -0
stepaudio2/__init__.py +40 -0
stepaudio2/cosyvoice2/__init__.py +1 -0
stepaudio2/cosyvoice2/flow/__init__.py +0 -0
stepaudio2/cosyvoice2/flow/decoder_dit.py +585 -0
stepaudio2/cosyvoice2/flow/flow.py +230 -0
stepaudio2/cosyvoice2/flow/flow_matching.py +205 -0
stepaudio2/cosyvoice2/transformer/__init__.py +0 -0
stepaudio2/cosyvoice2/transformer/attention.py +328 -0
stepaudio2/cosyvoice2/transformer/embedding.py +119 -0
stepaudio2/cosyvoice2/transformer/encoder_layer.py +163 -0
stepaudio2/cosyvoice2/transformer/positionwise_feed_forward.py +56 -0
stepaudio2/cosyvoice2/transformer/subsampling.py +79 -0
stepaudio2/cosyvoice2/transformer/upsample_encoder_v2.py +483 -0
stepaudio2/cosyvoice2/utils/__init__.py +1 -0
stepaudio2/cosyvoice2/utils/class_utils.py +41 -0
stepaudio2/cosyvoice2/utils/common.py +101 -0
stepaudio2/cosyvoice2/utils/mask.py +49 -0
stepaudio2/flashcosyvoice/__init__.py +0 -0
stepaudio2/flashcosyvoice/cli.py +424 -0
stepaudio2/flashcosyvoice/config.py +80 -0
stepaudio2/flashcosyvoice/cosyvoice2.py +160 -0
stepaudio2/flashcosyvoice/cosyvoice3.py +1 -0
stepaudio2/flashcosyvoice/engine/__init__.py +0 -0
stepaudio2/flashcosyvoice/engine/block_manager.py +114 -0
stepaudio2/flashcosyvoice/engine/llm_engine.py +125 -0
stepaudio2/flashcosyvoice/engine/model_runner.py +310 -0
stepaudio2/flashcosyvoice/engine/scheduler.py +77 -0
stepaudio2/flashcosyvoice/engine/sequence.py +90 -0
stepaudio2/flashcosyvoice/modules/__init__.py +0 -0
stepaudio2/flashcosyvoice/modules/flow.py +198 -0
stepaudio2/flashcosyvoice/modules/flow_components/__init__.py +0 -0
stepaudio2/flashcosyvoice/modules/flow_components/estimator.py +974 -0
stepaudio2/flashcosyvoice/modules/flow_components/upsample_encoder.py +998 -0
stepaudio2/flashcosyvoice/modules/hifigan.py +249 -0
stepaudio2/flashcosyvoice/modules/hifigan_components/__init__.py +0 -0
stepaudio2/flashcosyvoice/modules/hifigan_components/layers.py +433 -0
stepaudio2/flashcosyvoice/modules/qwen2.py +92 -0
stepaudio2/flashcosyvoice/modules/qwen2_components/__init__.py +0 -0
stepaudio2/flashcosyvoice/modules/qwen2_components/layers.py +616 -0
stepaudio2/flashcosyvoice/modules/sampler.py +231 -0
stepaudio2/flashcosyvoice/utils/__init__.py +0 -0
stepaudio2/flashcosyvoice/utils/audio.py +77 -0
stepaudio2/flashcosyvoice/utils/context.py +28 -0
stepaudio2/flashcosyvoice/utils/loader.py +116 -0
stepaudio2/flashcosyvoice/utils/memory.py +19 -0
stepaudio2/stepaudio2.py +204 -0
stepaudio2/token2wav.py +248 -0
stepaudio2/utils.py +91 -0

matcha/models/baselightningmodule.py ADDED Viewed

@@ -0,0 +1,209 @@
+"""
+This is a base lightning module that can be used to train a model.
+The benefit of this abstraction is that all the logic outside of model definition can be reused for different models.
+"""
+import inspect
+from abc import ABC
+from typing import Any, Dict
+import torch
+from lightning import LightningModule
+from lightning.pytorch.utilities import grad_norm
+from matcha import utils
+from matcha.utils.utils import plot_tensor
+log = utils.get_pylogger(__name__)
+class BaseLightningClass(LightningModule, ABC):
+    def update_data_statistics(self, data_statistics):
+        if data_statistics is None:
+            data_statistics = {
+                "mel_mean": 0.0,
+                "mel_std": 1.0,
+            }
+        self.register_buffer("mel_mean", torch.tensor(data_statistics["mel_mean"]))
+        self.register_buffer("mel_std", torch.tensor(data_statistics["mel_std"]))
+    def configure_optimizers(self) -> Any:
+        optimizer = self.hparams.optimizer(params=self.parameters())
+        if self.hparams.scheduler not in (None, {}):
+            scheduler_args = {}
+            # Manage last epoch for exponential schedulers
+            if "last_epoch" in inspect.signature(self.hparams.scheduler.scheduler).parameters:
+                if hasattr(self, "ckpt_loaded_epoch"):
+                    current_epoch = self.ckpt_loaded_epoch - 1
+                else:
+                    current_epoch = -1
+            scheduler_args.update({"optimizer": optimizer})
+            scheduler = self.hparams.scheduler.scheduler(**scheduler_args)
+            scheduler.last_epoch = current_epoch
+            return {
+                "optimizer": optimizer,
+                "lr_scheduler": {
+                    "scheduler": scheduler,
+                    "interval": self.hparams.scheduler.lightning_args.interval,
+                    "frequency": self.hparams.scheduler.lightning_args.frequency,
+                    "name": "learning_rate",
+                },
+            }
+        return {"optimizer": optimizer}
+    def get_losses(self, batch):
+        x, x_lengths = batch["x"], batch["x_lengths"]
+        y, y_lengths = batch["y"], batch["y_lengths"]
+        spks = batch["spks"]
+        dur_loss, prior_loss, diff_loss = self(
+            x=x,
+            x_lengths=x_lengths,
+            y=y,
+            y_lengths=y_lengths,
+            spks=spks,
+            out_size=self.out_size,
+        )
+        return {
+            "dur_loss": dur_loss,
+            "prior_loss": prior_loss,
+            "diff_loss": diff_loss,
+        }
+    def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None:
+        self.ckpt_loaded_epoch = checkpoint["epoch"]  # pylint: disable=attribute-defined-outside-init
+    def training_step(self, batch: Any, batch_idx: int):
+        loss_dict = self.get_losses(batch)
+        self.log(
+            "step",
+            float(self.global_step),
+            on_step=True,
+            prog_bar=True,
+            logger=True,
+            sync_dist=True,
+        )
+        self.log(
+            "sub_loss/train_dur_loss",
+            loss_dict["dur_loss"],
+            on_step=True,
+            on_epoch=True,
+            logger=True,
+            sync_dist=True,
+        )
+        self.log(
+            "sub_loss/train_prior_loss",
+            loss_dict["prior_loss"],
+            on_step=True,
+            on_epoch=True,
+            logger=True,
+            sync_dist=True,
+        )
+        self.log(
+            "sub_loss/train_diff_loss",
+            loss_dict["diff_loss"],
+            on_step=True,
+            on_epoch=True,
+            logger=True,
+            sync_dist=True,
+        )
+        total_loss = sum(loss_dict.values())
+        self.log(
+            "loss/train",
+            total_loss,
+            on_step=True,
+            on_epoch=True,
+            logger=True,
+            prog_bar=True,
+            sync_dist=True,
+        )
+        return {"loss": total_loss, "log": loss_dict}
+    def validation_step(self, batch: Any, batch_idx: int):
+        loss_dict = self.get_losses(batch)
+        self.log(
+            "sub_loss/val_dur_loss",
+            loss_dict["dur_loss"],
+            on_step=True,
+            on_epoch=True,
+            logger=True,
+            sync_dist=True,
+        )
+        self.log(
+            "sub_loss/val_prior_loss",
+            loss_dict["prior_loss"],
+            on_step=True,
+            on_epoch=True,
+            logger=True,
+            sync_dist=True,
+        )
+        self.log(
+            "sub_loss/val_diff_loss",
+            loss_dict["diff_loss"],
+            on_step=True,
+            on_epoch=True,
+            logger=True,
+            sync_dist=True,
+        )
+        total_loss = sum(loss_dict.values())
+        self.log(
+            "loss/val",
+            total_loss,
+            on_step=True,
+            on_epoch=True,
+            logger=True,
+            prog_bar=True,
+            sync_dist=True,
+        )
+        return total_loss
+    def on_validation_end(self) -> None:
+        if self.trainer.is_global_zero:
+            one_batch = next(iter(self.trainer.val_dataloaders))
+            if self.current_epoch == 0:
+                log.debug("Plotting original samples")
+                for i in range(2):
+                    y = one_batch["y"][i].unsqueeze(0).to(self.device)
+                    self.logger.experiment.add_image(
+                        f"original/{i}",
+                        plot_tensor(y.squeeze().cpu()),
+                        self.current_epoch,
+                        dataformats="HWC",
+                    )
+            log.debug("Synthesising...")
+            for i in range(2):
+                x = one_batch["x"][i].unsqueeze(0).to(self.device)
+                x_lengths = one_batch["x_lengths"][i].unsqueeze(0).to(self.device)
+                spks = one_batch["spks"][i].unsqueeze(0).to(self.device) if one_batch["spks"] is not None else None
+                output = self.synthesise(x[:, :x_lengths], x_lengths, n_timesteps=10, spks=spks)
+                y_enc, y_dec = output["encoder_outputs"], output["decoder_outputs"]
+                attn = output["attn"]
+                self.logger.experiment.add_image(
+                    f"generated_enc/{i}",
+                    plot_tensor(y_enc.squeeze().cpu()),
+                    self.current_epoch,
+                    dataformats="HWC",
+                )
+                self.logger.experiment.add_image(
+                    f"generated_dec/{i}",
+                    plot_tensor(y_dec.squeeze().cpu()),
+                    self.current_epoch,
+                    dataformats="HWC",
+                )
+                self.logger.experiment.add_image(
+                    f"alignment/{i}",
+                    plot_tensor(attn.squeeze().cpu()),
+                    self.current_epoch,
+                    dataformats="HWC",
+                )
+    def on_before_optimizer_step(self, optimizer):
+        self.log_dict({f"grad_norm/{k}": v for k, v in grad_norm(self, norm_type=2).items()})

matcha/models/components/__init__.py ADDED Viewed

File without changes

matcha/models/components/decoder.py ADDED Viewed

@@ -0,0 +1,443 @@
+import math
+from typing import Optional
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from conformer import ConformerBlock
+from diffusers.models.activations import get_activation
+from einops import pack, rearrange, repeat
+from matcha.models.components.transformer import BasicTransformerBlock
+class SinusoidalPosEmb(torch.nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        self.dim = dim
+        assert self.dim % 2 == 0, "SinusoidalPosEmb requires dim to be even"
+    def forward(self, x, scale=1000):
+        if x.ndim < 1:
+            x = x.unsqueeze(0)
+        device = x.device
+        half_dim = self.dim // 2
+        emb = math.log(10000) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, device=device).float() * -emb)
+        emb = scale * x.unsqueeze(1) * emb.unsqueeze(0)
+        emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
+        return emb
+class Block1D(torch.nn.Module):
+    def __init__(self, dim, dim_out, groups=8):
+        super().__init__()
+        self.block = torch.nn.Sequential(
+            torch.nn.Conv1d(dim, dim_out, 3, padding=1),
+            torch.nn.GroupNorm(groups, dim_out),
+            nn.Mish(),
+        )
+    def forward(self, x, mask):
+        output = self.block(x * mask)
+        return output * mask
+class ResnetBlock1D(torch.nn.Module):
+    def __init__(self, dim, dim_out, time_emb_dim, groups=8):
+        super().__init__()
+        self.mlp = torch.nn.Sequential(nn.Mish(), torch.nn.Linear(time_emb_dim, dim_out))
+        self.block1 = Block1D(dim, dim_out, groups=groups)
+        self.block2 = Block1D(dim_out, dim_out, groups=groups)
+        self.res_conv = torch.nn.Conv1d(dim, dim_out, 1)
+    def forward(self, x, mask, time_emb):
+        h = self.block1(x, mask)
+        h += self.mlp(time_emb).unsqueeze(-1)
+        h = self.block2(h, mask)
+        output = h + self.res_conv(x * mask)
+        return output
+class Downsample1D(nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        self.conv = torch.nn.Conv1d(dim, dim, 3, 2, 1)
+    def forward(self, x):
+        return self.conv(x)
+class TimestepEmbedding(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        time_embed_dim: int,
+        act_fn: str = "silu",
+        out_dim: int = None,
+        post_act_fn: Optional[str] = None,
+        cond_proj_dim=None,
+    ):
+        super().__init__()
+        self.linear_1 = nn.Linear(in_channels, time_embed_dim)
+        if cond_proj_dim is not None:
+            self.cond_proj = nn.Linear(cond_proj_dim, in_channels, bias=False)
+        else:
+            self.cond_proj = None
+        self.act = get_activation(act_fn)
+        if out_dim is not None:
+            time_embed_dim_out = out_dim
+        else:
+            time_embed_dim_out = time_embed_dim
+        self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim_out)
+        if post_act_fn is None:
+            self.post_act = None
+        else:
+            self.post_act = get_activation(post_act_fn)
+    def forward(self, sample, condition=None):
+        if condition is not None:
+            sample = sample + self.cond_proj(condition)
+        sample = self.linear_1(sample)
+        if self.act is not None:
+            sample = self.act(sample)
+        sample = self.linear_2(sample)
+        if self.post_act is not None:
+            sample = self.post_act(sample)
+        return sample
+class Upsample1D(nn.Module):
+    """A 1D upsampling layer with an optional convolution.
+    Parameters:
+        channels (`int`):
+            number of channels in the inputs and outputs.
+        use_conv (`bool`, default `False`):
+            option to use a convolution.
+        use_conv_transpose (`bool`, default `False`):
+            option to use a convolution transpose.
+        out_channels (`int`, optional):
+            number of output channels. Defaults to `channels`.
+    """
+    def __init__(self, channels, use_conv=False, use_conv_transpose=True, out_channels=None, name="conv"):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        self.use_conv_transpose = use_conv_transpose
+        self.name = name
+        self.conv = None
+        if use_conv_transpose:
+            self.conv = nn.ConvTranspose1d(channels, self.out_channels, 4, 2, 1)
+        elif use_conv:
+            self.conv = nn.Conv1d(self.channels, self.out_channels, 3, padding=1)
+    def forward(self, inputs):
+        assert inputs.shape[1] == self.channels
+        if self.use_conv_transpose:
+            return self.conv(inputs)
+        outputs = F.interpolate(inputs, scale_factor=2.0, mode="nearest")
+        if self.use_conv:
+            outputs = self.conv(outputs)
+        return outputs
+class ConformerWrapper(ConformerBlock):
+    def __init__(  # pylint: disable=useless-super-delegation
+        self,
+        *,
+        dim,
+        dim_head=64,
+        heads=8,
+        ff_mult=4,
+        conv_expansion_factor=2,
+        conv_kernel_size=31,
+        attn_dropout=0,
+        ff_dropout=0,
+        conv_dropout=0,
+        conv_causal=False,
+    ):
+        super().__init__(
+            dim=dim,
+            dim_head=dim_head,
+            heads=heads,
+            ff_mult=ff_mult,
+            conv_expansion_factor=conv_expansion_factor,
+            conv_kernel_size=conv_kernel_size,
+            attn_dropout=attn_dropout,
+            ff_dropout=ff_dropout,
+            conv_dropout=conv_dropout,
+            conv_causal=conv_causal,
+        )
+    def forward(
+        self,
+        hidden_states,
+        attention_mask,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        timestep=None,
+    ):
+        return super().forward(x=hidden_states, mask=attention_mask.bool())
+class Decoder(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        channels=(256, 256),
+        dropout=0.05,
+        attention_head_dim=64,
+        n_blocks=1,
+        num_mid_blocks=2,
+        num_heads=4,
+        act_fn="snake",
+        down_block_type="transformer",
+        mid_block_type="transformer",
+        up_block_type="transformer",
+    ):
+        super().__init__()
+        channels = tuple(channels)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.time_embeddings = SinusoidalPosEmb(in_channels)
+        time_embed_dim = channels[0] * 4
+        self.time_mlp = TimestepEmbedding(
+            in_channels=in_channels,
+            time_embed_dim=time_embed_dim,
+            act_fn="silu",
+        )
+        self.down_blocks = nn.ModuleList([])
+        self.mid_blocks = nn.ModuleList([])
+        self.up_blocks = nn.ModuleList([])
+        output_channel = in_channels
+        for i in range(len(channels)):  # pylint: disable=consider-using-enumerate
+            input_channel = output_channel
+            output_channel = channels[i]
+            is_last = i == len(channels) - 1
+            resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
+            transformer_blocks = nn.ModuleList(
+                [
+                    self.get_block(
+                        down_block_type,
+                        output_channel,
+                        attention_head_dim,
+                        num_heads,
+                        dropout,
+                        act_fn,
+                    )
+                    for _ in range(n_blocks)
+                ]
+            )
+            downsample = (
+                Downsample1D(output_channel) if not is_last else nn.Conv1d(output_channel, output_channel, 3, padding=1)
+            )
+            self.down_blocks.append(nn.ModuleList([resnet, transformer_blocks, downsample]))
+        for i in range(num_mid_blocks):
+            input_channel = channels[-1]
+            out_channels = channels[-1]
+            resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
+            transformer_blocks = nn.ModuleList(
+                [
+                    self.get_block(
+                        mid_block_type,
+                        output_channel,
+                        attention_head_dim,
+                        num_heads,
+                        dropout,
+                        act_fn,
+                    )
+                    for _ in range(n_blocks)
+                ]
+            )
+            self.mid_blocks.append(nn.ModuleList([resnet, transformer_blocks]))
+        channels = channels[::-1] + (channels[0],)
+        for i in range(len(channels) - 1):
+            input_channel = channels[i]
+            output_channel = channels[i + 1]
+            is_last = i == len(channels) - 2
+            resnet = ResnetBlock1D(
+                dim=2 * input_channel,
+                dim_out=output_channel,
+                time_emb_dim=time_embed_dim,
+            )
+            transformer_blocks = nn.ModuleList(
+                [
+                    self.get_block(
+                        up_block_type,
+                        output_channel,
+                        attention_head_dim,
+                        num_heads,
+                        dropout,
+                        act_fn,
+                    )
+                    for _ in range(n_blocks)
+                ]
+            )
+            upsample = (
+                Upsample1D(output_channel, use_conv_transpose=True)
+                if not is_last
+                else nn.Conv1d(output_channel, output_channel, 3, padding=1)
+            )
+            self.up_blocks.append(nn.ModuleList([resnet, transformer_blocks, upsample]))
+        self.final_block = Block1D(channels[-1], channels[-1])
+        self.final_proj = nn.Conv1d(channels[-1], self.out_channels, 1)
+        self.initialize_weights()
+        # nn.init.normal_(self.final_proj.weight)
+    @staticmethod
+    def get_block(block_type, dim, attention_head_dim, num_heads, dropout, act_fn):
+        if block_type == "conformer":
+            block = ConformerWrapper(
+                dim=dim,
+                dim_head=attention_head_dim,
+                heads=num_heads,
+                ff_mult=1,
+                conv_expansion_factor=2,
+                ff_dropout=dropout,
+                attn_dropout=dropout,
+                conv_dropout=dropout,
+                conv_kernel_size=31,
+            )
+        elif block_type == "transformer":
+            block = BasicTransformerBlock(
+                dim=dim,
+                num_attention_heads=num_heads,
+                attention_head_dim=attention_head_dim,
+                dropout=dropout,
+                activation_fn=act_fn,
+            )
+        else:
+            raise ValueError(f"Unknown block type {block_type}")
+        return block
+    def initialize_weights(self):
+        for m in self.modules():
+            if isinstance(m, nn.Conv1d):
+                nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
+                if m.bias is not None:
+                    nn.init.constant_(m.bias, 0)
+            elif isinstance(m, nn.GroupNorm):
+                nn.init.constant_(m.weight, 1)
+                nn.init.constant_(m.bias, 0)
+            elif isinstance(m, nn.Linear):
+                nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
+                if m.bias is not None:
+                    nn.init.constant_(m.bias, 0)
+    def forward(self, x, mask, mu, t, spks=None, cond=None):
+        """Forward pass of the UNet1DConditional model.
+        Args:
+            x (torch.Tensor): shape (batch_size, in_channels, time)
+            mask (_type_): shape (batch_size, 1, time)
+            t (_type_): shape (batch_size)
+            spks (_type_, optional): shape: (batch_size, condition_channels). Defaults to None.
+            cond (_type_, optional): placeholder for future use. Defaults to None.
+        Raises:
+            ValueError: _description_
+            ValueError: _description_
+        Returns:
+            _type_: _description_
+        """
+        t = self.time_embeddings(t)
+        t = self.time_mlp(t)
+        x = pack([x, mu], "b * t")[0]
+        if spks is not None:
+            spks = repeat(spks, "b c -> b c t", t=x.shape[-1])
+            x = pack([x, spks], "b * t")[0]
+        hiddens = []
+        masks = [mask]
+        for resnet, transformer_blocks, downsample in self.down_blocks:
+            mask_down = masks[-1]
+            x = resnet(x, mask_down, t)
+            x = rearrange(x, "b c t -> b t c")
+            mask_down = rearrange(mask_down, "b 1 t -> b t")
+            for transformer_block in transformer_blocks:
+                x = transformer_block(
+                    hidden_states=x,
+                    attention_mask=mask_down,
+                    timestep=t,
+                )
+            x = rearrange(x, "b t c -> b c t")
+            mask_down = rearrange(mask_down, "b t -> b 1 t")
+            hiddens.append(x)  # Save hidden states for skip connections
+            x = downsample(x * mask_down)
+            masks.append(mask_down[:, :, ::2])
+        masks = masks[:-1]
+        mask_mid = masks[-1]
+        for resnet, transformer_blocks in self.mid_blocks:
+            x = resnet(x, mask_mid, t)
+            x = rearrange(x, "b c t -> b t c")
+            mask_mid = rearrange(mask_mid, "b 1 t -> b t")
+            for transformer_block in transformer_blocks:
+                x = transformer_block(
+                    hidden_states=x,
+                    attention_mask=mask_mid,
+                    timestep=t,
+                )
+            x = rearrange(x, "b t c -> b c t")
+            mask_mid = rearrange(mask_mid, "b t -> b 1 t")
+        for resnet, transformer_blocks, upsample in self.up_blocks:
+            mask_up = masks.pop()
+            x = resnet(pack([x, hiddens.pop()], "b * t")[0], mask_up, t)
+            x = rearrange(x, "b c t -> b t c")
+            mask_up = rearrange(mask_up, "b 1 t -> b t")
+            for transformer_block in transformer_blocks:
+                x = transformer_block(
+                    hidden_states=x,
+                    attention_mask=mask_up,
+                    timestep=t,
+                )
+            x = rearrange(x, "b t c -> b c t")
+            mask_up = rearrange(mask_up, "b t -> b 1 t")
+            x = upsample(x * mask_up)
+        x = self.final_block(x, mask_up)
+        output = self.final_proj(x * mask_up)
+        return output * mask