PyPI - lt-tensor - Versions diffs - 0.0.1a14__py3-none-any.whl → 0.0.1a16__py3-none-any.whl - Mend

lt-tensor 0.0.1a14__py3-none-any.whl → 0.0.1a16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

lt_tensor/datasets/audio.py +23 -6
lt_tensor/model_base.py +163 -123
lt_tensor/model_zoo/__init__.py +8 -6
lt_tensor/model_zoo/audio_models/__init__.py +1 -0
lt_tensor/model_zoo/audio_models/diffwave/__init__.py +3 -0
lt_tensor/model_zoo/audio_models/diffwave/model.py +201 -0
lt_tensor/model_zoo/audio_models/hifigan/__init__.py +393 -0
lt_tensor/model_zoo/audio_models/istft/__init__.py +409 -0
lt_tensor/model_zoo/basic.py +139 -0
lt_tensor/model_zoo/features.py +102 -11
lt_tensor/model_zoo/residual.py +133 -64
{lt_tensor-0.0.1a14.dist-info → lt_tensor-0.0.1a16.dist-info}/METADATA +1 -1
{lt_tensor-0.0.1a14.dist-info → lt_tensor-0.0.1a16.dist-info}/RECORD +16 -16
lt_tensor/model_zoo/discriminator.py +0 -196
lt_tensor/model_zoo/istft/__init__.py +0 -5
lt_tensor/model_zoo/istft/generator.py +0 -90
lt_tensor/model_zoo/istft/sg.py +0 -142
lt_tensor/model_zoo/istft/trainer.py +0 -618
{lt_tensor-0.0.1a14.dist-info → lt_tensor-0.0.1a16.dist-info}/WHEEL +0 -0
{lt_tensor-0.0.1a14.dist-info → lt_tensor-0.0.1a16.dist-info}/licenses/LICENSE +0 -0
{lt_tensor-0.0.1a14.dist-info → lt_tensor-0.0.1a16.dist-info}/top_level.txt +0 -0

lt_tensor/model_zoo/audio_models/diffwave/model.py ADDED Viewed

@@ -0,0 +1,201 @@
+__all__ = ["DiffWave", "SpectrogramUpsampler", "DiffusionEmbedding"]
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from math import sqrt
+class AttrDict(dict):
+    """Dictionary whose entries can also be read and written as attributes."""
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # Point the instance namespace at the mapping itself so that
+        # ``obj.key`` and ``obj["key"]`` share one storage.
+        self.__dict__ = self
+    def override(self, attrs):
+        """
+        Merge ``attrs`` into this mapping and return ``self``.
+        :param attrs: a dict of new values, a list/tuple/set of further
+            overrides (applied recursively, in iteration order), or ``None``
+            for a no-op
+        :raises NotImplementedError: for any other kind of ``attrs``
+        """
+        if attrs is None:
+            return self
+        if isinstance(attrs, dict):
+            self.__dict__.update(**attrs)
+            return self
+        if isinstance(attrs, (list, tuple, set)):
+            for entry in attrs:
+                self.override(entry)
+            return self
+        raise NotImplementedError
+# Default hyper-parameters, exposed as an attribute-accessible mapping.
+params = AttrDict(
+    {
+        # Training params
+        "batch_size": 16,
+        "learning_rate": 2e-4,
+        "max_grad_norm": None,
+        # Data params
+        "sample_rate": 22050,
+        "n_mels": 80,
+        "n_fft": 1024,
+        "hop_samples": 256,
+        "crop_mel_frames": 62,  # Probably an error in paper.
+        # Model params
+        "residual_layers": 30,
+        "residual_channels": 64,
+        "dilation_cycle_length": 10,
+        "unconditional": False,
+        # 50 evenly spaced values from 1e-4 to 0.05, stored as a plain list.
+        "noise_schedule": np.linspace(1e-4, 0.05, 50).tolist(),
+        "inference_noise_schedule": [0.0001, 0.001, 0.01, 0.05, 0.2, 0.5],
+        # unconditional sample len
+        "audio_len": 22050 * 5,  # unconditional_synthesis_samples
+    }
+)
+def Conv1d(*args, **kwargs):
+    """Build an ``nn.Conv1d`` (arguments forwarded unchanged) with a Kaiming-normal weight."""
+    conv = nn.Conv1d(*args, **kwargs)
+    # Only the weight is re-initialised; the bias keeps its default init.
+    nn.init.kaiming_normal_(conv.weight)
+    return conv
+class DiffusionEmbedding(nn.Module):
+    """
+    Map diffusion steps to 512-dimensional conditioning vectors.
+    A fixed sinusoidal table with one 128-wide row per step is indexed (integer
+    steps) or linearly interpolated (non-integer steps), then passed through
+    two linear layers, each followed by SiLU.
+    """
+    def __init__(self, max_steps):
+        """
+        :param max_steps: number of rows in the sinusoidal lookup table
+        """
+        super().__init__()
+        # Registered with persistent=False, so the table is a buffer that is
+        # left out of the state dict.
+        self.register_buffer(
+            "embedding", self._build_embedding(max_steps), persistent=False
+        )
+        self.projection1 = nn.Linear(128, 512)
+        self.projection2 = nn.Linear(512, 512)
+        self.activation = nn.SiLU()
+    def forward(self, diffusion_step):
+        """
+        :param diffusion_step: tensor of step positions; int32/int64 values
+            index the table directly, any other dtype is interpolated
+        :return: tensor shaped like ``diffusion_step`` plus a trailing axis of 512
+        """
+        if diffusion_step.dtype in [torch.int32, torch.int64]:
+            x = self.embedding[diffusion_step]
+        else:
+            x = self._lerp_embedding(diffusion_step)
+        x = self.projection1(x)
+        x = self.activation(x)
+        x = self.projection2(x)
+        x = self.activation(x)
+        return x
+    def _lerp_embedding(self, t):
+        """Linearly interpolate table rows at the fractional positions ``t``."""
+        low_idx = torch.floor(t).long()
+        high_idx = torch.ceil(t).long()
+        low = self.embedding[low_idx]
+        high = self.embedding[high_idx]
+        # The looked-up rows have a trailing feature axis that ``t`` lacks.
+        # Give the interpolation weight that axis too, otherwise a batch of
+        # steps cannot broadcast against its [batch, 128] rows.
+        weight = (t - low_idx).unsqueeze(-1)
+        return low + (high - low) * weight
+    def _build_embedding(self, max_steps):
+        """Return the ``[max_steps, 128]`` table: 64 sines followed by 64 cosines."""
+        steps = torch.arange(max_steps).unsqueeze(1)  # [T,1]
+        dims = torch.arange(64).unsqueeze(0)  # [1,64]
+        # Frequencies grow geometrically from 10**0 to 10**4 across the 64 dims.
+        table = steps * 10.0 ** (dims * 4.0 / 63.0)  # [T,64]
+        table = torch.cat([torch.sin(table), torch.cos(table)], dim=1)
+        return table
+class SpectrogramUpsampler(nn.Module):
+    """Stretch the last axis of a spectrogram 16x twice (256x overall)."""
+    def __init__(self, n_mels):
+        """
+        :param n_mels: accepted by the constructor but not used by it
+        """
+        super().__init__()
+        # Kernel 32, stride 16, padding 8 on the last axis: output length is 16x the input.
+        self.conv1 = nn.ConvTranspose2d(
+            1, 1, kernel_size=[3, 32], stride=[1, 16], padding=[1, 8]
+        )
+        self.conv2 = nn.ConvTranspose2d(
+            1, 1, kernel_size=[3, 32], stride=[1, 16], padding=[1, 8]
+        )
+    def forward(self, x):
+        """Insert a channel axis at dim 1, run both stages, then squeeze dim 1 again."""
+        x = x.unsqueeze(1)
+        for stage in (self.conv1, self.conv2):
+            x = F.leaky_relu(stage(x), 0.4)
+        return x.squeeze(1)
+class ResidualBlock(nn.Module):
+    """Gated dilated-convolution block that returns a residual output and a skip output."""
+    def __init__(self, n_mels, residual_channels, dilation, uncond=False):
+        """
+        :param n_mels: input channels of the 1x1 conditioner convolution
+        :param residual_channels: channel width of the audio path
+        :param dilation: dilation (and padding) of the 3-tap audio convolution
+        :param uncond: when true, no spectrogram conditioner is created
+        """
+        super().__init__()
+        doubled = 2 * residual_channels
+        self.dilated_conv = Conv1d(
+            residual_channels, doubled, 3, padding=dilation, dilation=dilation
+        )
+        self.diffusion_projection = nn.Linear(512, residual_channels)
+        # Conditional models project the spectrogram onto the gate/filter channels.
+        self.conditioner_projection = None if uncond else Conv1d(n_mels, doubled, 1)
+        self.output_projection = Conv1d(residual_channels, doubled, 1)
+    def forward(self, x, diffusion_step, conditioner=None):
+        """
+        :param x: input of the audio path
+        :param diffusion_step: step embedding with 512 features on its last axis
+        :param conditioner: spectrogram conditioning; required exactly when the
+            block was built with a conditioner projection
+        :return: ``(residual_output, skip)``
+        """
+        # A conditioner must be passed if and only if the block has a projection for it.
+        assert (conditioner is None) == (self.conditioner_projection is None)
+        step_bias = self.diffusion_projection(diffusion_step).unsqueeze(-1)
+        hidden = self.dilated_conv(x + step_bias)
+        if self.conditioner_projection is not None:
+            hidden = hidden + self.conditioner_projection(conditioner)
+        # Split the doubled channels into the two halves of the gated activation.
+        gate, filt = torch.chunk(hidden, 2, dim=1)
+        projected = self.output_projection(torch.sigmoid(gate) * torch.tanh(filt))
+        residual, skip = torch.chunk(projected, 2, dim=1)
+        return (x + residual) / sqrt(2.0), skip
+class DiffWave(nn.Module):
+    """DiffWave network: a stack of residual blocks whose skip outputs are summed and projected."""
+    def __init__(self, params):
+        """
+        :param params: attribute-style config providing residual_channels,
+            noise_schedule, unconditional, n_mels, dilation_cycle_length and
+            residual_layers
+        """
+        super().__init__()
+        self.params = params
+        width = params.residual_channels
+        self.input_projection = Conv1d(1, width, 1)
+        self.diffusion_embedding = DiffusionEmbedding(len(params.noise_schedule))
+        # The spectrogram upsampler only exists for the conditional variant.
+        self.spectrogram_upsampler = (
+            None if self.params.unconditional else SpectrogramUpsampler(params.n_mels)
+        )
+        blocks = []
+        for index in range(params.residual_layers):
+            # Dilation doubles per layer and restarts every dilation_cycle_length layers.
+            blocks.append(
+                ResidualBlock(
+                    params.n_mels,
+                    width,
+                    2 ** (index % params.dilation_cycle_length),
+                    uncond=params.unconditional,
+                )
+            )
+        self.residual_layers = nn.ModuleList(blocks)
+        self.skip_projection = Conv1d(width, width, 1)
+        self.output_projection = Conv1d(width, 1, 1)
+        # Zero the final weight (the bias is left at its default init).
+        nn.init.zeros_(self.output_projection.weight)
+    def forward(self, audio, diffusion_step, spectrogram=None):
+        """
+        :param audio: input signal; a channel axis is inserted at dim 1
+        :param diffusion_step: step positions handed to the diffusion embedding
+        :param spectrogram: conditioning input; required exactly when the model
+            was built with a spectrogram upsampler
+        :return: output of the final 1-channel projection
+        """
+        assert (spectrogram is None) == (self.spectrogram_upsampler is None)
+        x = F.relu(self.input_projection(audio.unsqueeze(1)))
+        diffusion_step = self.diffusion_embedding(diffusion_step)
+        if self.spectrogram_upsampler is not None:  # use conditional model
+            spectrogram = self.spectrogram_upsampler(spectrogram)
+        skip_total = None
+        for block in self.residual_layers:
+            x, skip_out = block(x, diffusion_step, spectrogram)
+            if skip_total is None:
+                skip_total = skip_out
+            else:
+                skip_total = skip_out + skip_total
+        # Scale the summed skips by 1/sqrt(number of layers).
+        x = skip_total / sqrt(len(self.residual_layers))
+        x = F.relu(self.skip_projection(x))
+        return self.output_projection(x)

lt_tensor/model_zoo/audio_models/hifigan/__init__.py ADDED Viewed

@@ -0,0 +1,393 @@
+__all__ = ["HifiganGenerator"]
+from lt_utils.common import *
+from lt_tensor.torch_commons import *
+from lt_tensor.model_zoo.residual import ConvNets
+from torch.nn import functional as F
+import torch
+import torch.nn.functional as F
+import torch.nn as nn
+def get_padding(kernel_size, dilation=1):
+    """Return half of ``dilation * (kernel_size - 1)``, truncated to an int."""
+    span = kernel_size * dilation - dilation
+    return int(span / 2)
+class ResBlock1(ConvNets):
+    """Residual block of three (dilated conv, plain conv) pairs, each wrapped in a skip connection."""
+    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
+        """
+        :param channels: input and output channels of every convolution
+        :param kernel_size: kernel size shared by all convolutions
+        :param dilation: dilation rates of the three first-stage convolutions
+        """
+        super().__init__()
+        def make_conv(rate):
+            # Stride-1, weight-normalised conv padded via get_padding for this dilation.
+            return weight_norm(
+                nn.Conv1d(
+                    channels,
+                    channels,
+                    kernel_size,
+                    1,
+                    dilation=rate,
+                    padding=get_padding(kernel_size, rate),
+                )
+            )
+        # First stage: one conv per dilation rate (only the first three rates are read).
+        self.convs1 = nn.ModuleList([make_conv(dilation[idx]) for idx in range(3)])
+        self.convs1.apply(self.init_weights)
+        # Second stage: three undilated convs.
+        self.convs2 = nn.ModuleList([make_conv(1) for _ in range(3)])
+        self.convs2.apply(self.init_weights)
+        self.activation = nn.LeakyReLU(0.1)
+    def forward(self, x):
+        """Apply each conv pair to the activated input and add the result back to ``x``."""
+        for dilated, plain in zip(self.convs1, self.convs2):
+            branch = dilated(self.activation(x))
+            branch = plain(self.activation(branch))
+            x = branch + x
+        return x
+    def remove_weight_norm(self):
+        """Call the module-level ``remove_weight_norm`` on every convolution of both stages."""
+        for conv in (*self.convs1, *self.convs2):
+            remove_weight_norm(conv)
+class ResBlock2(ConvNets):
+    """Residual block of two dilated convolutions, each wrapped in a skip connection."""
+    def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
+        """
+        :param channels: input and output channels of both convolutions
+        :param kernel_size: kernel size shared by both convolutions
+        :param dilation: dilation rates; only the first two entries are read
+        """
+        super().__init__()
+        layers = []
+        for rate in (dilation[0], dilation[1]):
+            layers.append(
+                weight_norm(
+                    nn.Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        1,
+                        dilation=rate,
+                        padding=get_padding(kernel_size, rate),
+                    )
+                )
+            )
+        self.convs = nn.ModuleList(layers)
+        self.convs.apply(self.init_weights)
+        self.activation = nn.LeakyReLU(0.1)
+    def forward(self, x):
+        """Apply each convolution to the activated input and add the result back to ``x``."""
+        for conv in self.convs:
+            x = conv(self.activation(x)) + x
+        return x
+class HifiganGenerator(ConvNets):
+    """
+    HiFi-GAN style generator.
+    An input convolution is followed by transposed-convolution upsampling
+    stages; after each stage the outputs of ``num_kernels`` residual blocks are
+    averaged. A final convolution and ``tanh`` produce a single channel.
+    """
+    def __init__(self, h):
+        """
+        :param h: attribute-style config providing resblock, resblock_kernel_sizes,
+            resblock_dilation_sizes, upsample_rates, upsample_kernel_sizes and
+            upsample_initial_channel
+        """
+        super().__init__()
+        self.h = h
+        self.num_kernels = len(h.resblock_kernel_sizes)
+        self.num_upsamples = len(h.upsample_rates)
+        # NOTE(review): the input width is fixed at 80 channels and is not read from ``h``.
+        self.conv_pre = weight_norm(
+            nn.Conv1d(80, h.upsample_initial_channel, 7, 1, padding=3)
+        )
+        resblock = ResBlock1 if h.resblock == "1" else ResBlock2
+        self.activation = nn.LeakyReLU(0.1)
+        self.ups = nn.ModuleList()
+        for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
+            # Each stage halves the channel count while upsampling by ``u``.
+            self.ups.append(
+                weight_norm(
+                    nn.ConvTranspose1d(
+                        h.upsample_initial_channel // (2**i),
+                        h.upsample_initial_channel // (2 ** (i + 1)),
+                        k,
+                        u,
+                        padding=(k - u) // 2,
+                    )
+                )
+            )
+        self.resblocks = nn.ModuleList()
+        # Width fed to conv_post; stays at the initial width when there are
+        # no upsampling stages (previously ``ch`` was unbound in that case).
+        ch = h.upsample_initial_channel
+        for i in range(len(self.ups)):
+            ch = h.upsample_initial_channel // (2 ** (i + 1))
+            for k, d in zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes):
+                # ResBlock1/ResBlock2 take (channels, kernel_size, dilation);
+                # passing ``h`` as an extra leading argument raised TypeError.
+                self.resblocks.append(resblock(ch, k, d))
+        self.conv_post = weight_norm(nn.Conv1d(ch, 1, 7, 1, padding=3))
+        self.ups.apply(self.init_weights)
+        self.conv_post.apply(self.init_weights)
+    def forward(self, x):
+        """
+        :param x: input with 80 channels on dim 1
+        :return: single-channel output squashed by ``tanh``
+        """
+        x = self.conv_pre(x)
+        for i in range(self.num_upsamples):
+            x = self.ups[i](self.activation(x))
+            # Average the residual blocks that belong to this stage.
+            xs = None
+            for j in range(self.num_kernels):
+                if xs is None:
+                    xs = self.resblocks[i * self.num_kernels + j](x)
+                else:
+                    xs += self.resblocks[i * self.num_kernels + j](x)
+            x = xs / self.num_kernels
+        x = self.conv_post(self.activation(x))
+        x = torch.tanh(x)
+        return x
+class DiscriminatorP(ConvNets):
+    """Period discriminator: folds the signal into a 2-D grid of width ``period`` and applies 2-D convolutions."""
+    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
+        """
+        :param period: width of the 2-D grid the signal is folded into
+        :param kernel_size: kernel height of the convolutions in ``convs``
+        :param stride: stride along the height axis of the first four convolutions
+        :param use_spectral_norm: use ``spectral_norm`` instead of ``weight_norm``
+        """
+        super().__init__()
+        self.period = period
+        # Equality test kept as-is so every argument value picks the same wrapper as before.
+        if use_spectral_norm == False:
+            norm_f = weight_norm
+        else:
+            norm_f = spectral_norm
+        widths = (1, 32, 128, 512, 1024)
+        layers = []
+        for c_in, c_out in zip(widths[:-1], widths[1:]):
+            layers.append(
+                norm_f(
+                    nn.Conv2d(
+                        c_in,
+                        c_out,
+                        (kernel_size, 1),
+                        (stride, 1),
+                        # Padding is computed for a kernel of 5, independent of kernel_size.
+                        padding=(get_padding(5, 1), 0),
+                    )
+                )
+            )
+        layers.append(
+            norm_f(nn.Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0)))
+        )
+        self.convs = nn.ModuleList(layers)
+        self.conv_post = norm_f(nn.Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
+        self.activation = nn.LeakyReLU(0.1)
+    def forward(self, x):
+        """
+        :param x: 3-D input unpacked as ``(b, c, t)``
+        :return: ``(flattened_output, feature_maps)``; the list holds every
+            activated layer output plus the ``conv_post`` output
+        """
+        features = []
+        # 1d to 2d
+        b, c, t = x.shape
+        remainder = t % self.period
+        if remainder:  # reflect-pad so the length is a multiple of period
+            n_pad = self.period - remainder
+            x = F.pad(x, (0, n_pad), "reflect")
+            t = t + n_pad
+        x = x.view(b, c, t // self.period, self.period)
+        for conv in self.convs:
+            x = self.activation(conv(x))
+            features.append(x)
+        x = self.conv_post(x)
+        features.append(x)
+        return torch.flatten(x, 1, -1), features
+class MultiPeriodDiscriminator(ConvNets):
+    """Bundle of period discriminators with periods 2, 3, 5, 7 and 11."""
+    def __init__(self):
+        super().__init__()
+        self.discriminators = nn.ModuleList(
+            [DiscriminatorP(period) for period in (2, 3, 5, 7, 11)]
+        )
+    def forward(self, y, y_hat):
+        """
+        :param y: first input, scored by every discriminator
+        :param y_hat: second input, scored by the same discriminators
+        :return: ``(y_outputs, y_hat_outputs, y_feature_maps, y_hat_feature_maps)``,
+            each a list with one entry per discriminator
+        """
+        real_scores, fake_scores = [], []
+        real_fmaps, fake_fmaps = [], []
+        for disc in self.discriminators:
+            score_r, maps_r = disc(y)
+            score_g, maps_g = disc(y_hat)
+            real_scores.append(score_r)
+            real_fmaps.append(maps_r)
+            fake_scores.append(score_g)
+            fake_fmaps.append(maps_g)
+        return real_scores, fake_scores, real_fmaps, fake_fmaps
+class DiscriminatorS(ConvNets):
+    """Scale discriminator: a stack of 1-D convolutions that also returns its feature maps."""
+    def __init__(self, use_spectral_norm=False):
+        """
+        :param use_spectral_norm: use ``spectral_norm`` instead of ``weight_norm``
+        """
+        super().__init__()
+        # Equality test kept as-is so every argument value picks the same wrapper as before.
+        if use_spectral_norm == False:
+            norm_f = weight_norm
+        else:
+            norm_f = spectral_norm
+        # (in_channels, out_channels, kernel_size, stride, groups, padding)
+        layout = (
+            (1, 128, 15, 1, 1, 7),
+            (128, 128, 41, 2, 4, 20),
+            (128, 256, 41, 2, 16, 20),
+            (256, 512, 41, 4, 16, 20),
+            (512, 1024, 41, 4, 16, 20),
+            (1024, 1024, 41, 1, 16, 20),
+            (1024, 1024, 5, 1, 1, 2),
+        )
+        self.convs = nn.ModuleList(
+            [
+                norm_f(nn.Conv1d(c_in, c_out, k, s, groups=g, padding=p))
+                for c_in, c_out, k, s, g, p in layout
+            ]
+        )
+        self.conv_post = norm_f(nn.Conv1d(1024, 1, 3, 1, padding=1))
+        self.activation = nn.LeakyReLU(0.1)
+    def forward(self, x):
+        """
+        :param x: input with one channel on dim 1
+        :return: ``(flattened_output, feature_maps)``; the list holds every
+            activated layer output plus the ``conv_post`` output
+        """
+        features = []
+        for conv in self.convs:
+            x = self.activation(conv(x))
+            features.append(x)
+        x = self.conv_post(x)
+        features.append(x)
+        return torch.flatten(x, 1, -1), features
+class MultiScaleDiscriminator(ConvNets):
+    """Three scale discriminators; the second and third see average-pooled inputs."""
+    def __init__(self):
+        super().__init__()
+        # Only the first discriminator uses spectral norm.
+        members = [DiscriminatorS(use_spectral_norm=True)]
+        members.append(DiscriminatorS())
+        members.append(DiscriminatorS())
+        self.discriminators = nn.ModuleList(members)
+        self.meanpools = nn.ModuleList(
+            [nn.AvgPool1d(4, 2, padding=2) for _ in range(2)]
+        )
+    def forward(self, y, y_hat):
+        """
+        :param y: first input, scored by every discriminator
+        :param y_hat: second input, scored by the same discriminators
+        :return: ``(y_outputs, y_hat_outputs, y_feature_maps, y_hat_feature_maps)``,
+            each a list with one entry per discriminator
+        """
+        real_scores, fake_scores = [], []
+        real_fmaps, fake_fmaps = [], []
+        for idx, disc in enumerate(self.discriminators):
+            if idx > 0:
+                # Pooling is cumulative: each later scale pools the previous scale's input.
+                pool = self.meanpools[idx - 1]
+                y = pool(y)
+                y_hat = pool(y_hat)
+            score_r, maps_r = disc(y)
+            score_g, maps_g = disc(y_hat)
+            real_scores.append(score_r)
+            real_fmaps.append(maps_r)
+            fake_scores.append(score_g)
+            fake_fmaps.append(maps_g)
+        return real_scores, fake_scores, real_fmaps, fake_fmaps
+def feature_loss(fmap_r, fmap_g):
+    """
+    Sum the mean absolute differences of paired feature maps, times two.
+    :param fmap_r: list (one per discriminator) of lists of feature maps
+    :param fmap_g: matching structure for the second input
+    :return: the doubled sum; plain ``0`` when there are no pairs
+    """
+    total = sum(
+        torch.mean(torch.abs(real - fake))
+        for real_maps, fake_maps in zip(fmap_r, fmap_g)
+        for real, fake in zip(real_maps, fake_maps)
+    )
+    return total * 2
+def discriminator_loss(disc_real_outputs, disc_generated_outputs):
+    """
+    Least-squares discriminator loss summed over paired outputs.
+    :param disc_real_outputs: outputs that are pushed towards 1
+    :param disc_generated_outputs: outputs that are pushed towards 0
+    :return: ``(total, real_terms, generated_terms)``; the two lists hold
+        Python floats, one per output pair
+    """
+    total = 0
+    real_terms = []
+    generated_terms = []
+    for real_out, generated_out in zip(disc_real_outputs, disc_generated_outputs):
+        real_term = torch.mean((1 - real_out) ** 2)
+        generated_term = torch.mean(generated_out**2)
+        total = total + (real_term + generated_term)
+        real_terms.append(real_term.item())
+        generated_terms.append(generated_term.item())
+    return total, real_terms, generated_terms
+def generator_loss(disc_outputs):
+    """
+    Least-squares generator loss summed over the discriminator outputs.
+    :param disc_outputs: outputs that are pushed towards 1
+    :return: ``(total, terms)``; ``terms`` holds one tensor per output
+    """
+    gen_losses = [torch.mean((1 - output) ** 2) for output in disc_outputs]
+    # Starts from the int 0, so an empty input yields plain 0.
+    return sum(gen_losses), gen_losses

lt-tensor 0.0.1a14__py3-none-any.whl → 0.0.1a16__py3-none-any.whl

lt-tensor 0.0.1a14__py3-none-any.whl → 0.0.1a16__py3-none-any.whl