PyPI - lt-tensor - Versions diffs - 0.0.1a14__py3-none-any.whl → 0.0.1a15__py3-none-any.whl - Mend

lt-tensor 0.0.1a14__py3-none-any.whl → 0.0.1a15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15)

lt_tensor/datasets/audio.py +23 -6
lt_tensor/model_base.py +163 -123
lt_tensor/model_zoo/diffwave/__init__.py +0 -0
lt_tensor/model_zoo/diffwave/model.py +200 -0
lt_tensor/model_zoo/diffwave/params.py +58 -0
lt_tensor/model_zoo/discriminator.py +269 -151
lt_tensor/model_zoo/features.py +102 -11
lt_tensor/model_zoo/istft/generator.py +6 -2
lt_tensor/model_zoo/istft/trainer.py +16 -7
lt_tensor/model_zoo/residual.py +133 -64
{lt_tensor-0.0.1a14.dist-info → lt_tensor-0.0.1a15.dist-info}/METADATA +1 -1
{lt_tensor-0.0.1a14.dist-info → lt_tensor-0.0.1a15.dist-info}/RECORD +15 -12
{lt_tensor-0.0.1a14.dist-info → lt_tensor-0.0.1a15.dist-info}/WHEEL +0 -0
{lt_tensor-0.0.1a14.dist-info → lt_tensor-0.0.1a15.dist-info}/licenses/LICENSE +0 -0
{lt_tensor-0.0.1a14.dist-info → lt_tensor-0.0.1a15.dist-info}/top_level.txt +0 -0

lt_tensor/model_zoo/diffwave/params.py ADDED Viewed

@@ -0,0 +1,58 @@
+# Copyright 2020 LMNT, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+import numpy as np
+class AttrDict(dict):
+  def __init__(self, *args, **kwargs):
+      super(AttrDict, self).__init__(*args, **kwargs)
+      self.__dict__ = self
+  def override(self, attrs):
+    if isinstance(attrs, dict):
+      self.__dict__.update(**attrs)
+    elif isinstance(attrs, (list, tuple, set)):
+      for attr in attrs:
+        self.override(attr)
+    elif attrs is not None:
+      raise NotImplementedError
+    return self
+params = AttrDict(
+    # Training params
+    batch_size=16,
+    learning_rate=2e-4,
+    max_grad_norm=None,
+    # Data params
+    sample_rate=22050,
+    n_mels=80,
+    n_fft=1024,
+    hop_samples=256,
+    crop_mel_frames=62,  # Probably an error in paper.
+    # Model params
+    residual_layers=30,
+    residual_channels=64,
+    dilation_cycle_length=10,
+    unconditional = False,
+    noise_schedule=np.linspace(1e-4, 0.05, 50).tolist(),
+    inference_noise_schedule=[0.0001, 0.001, 0.01, 0.05, 0.2, 0.5],
+    # unconditional sample len
+    audio_len = 22050*5, # unconditional_synthesis_samples
+)

lt_tensor/model_zoo/discriminator.py CHANGED Viewed

@@ -2,85 +2,118 @@ from lt_tensor.torch_commons import *
 import torch.nn.functional as F
 from lt_tensor.model_base import Model
 from lt_utils.common import *
+from einops import rearrange
+import torchaudio
-class PeriodDiscriminator(Model):
-    def __init__(
-        self,
-        period: int,
-        use_spectral_norm=False,
-        kernel_size: int = 5,
-        stride: int = 3,
-    ):
+def get_padding(ks, d):
+    return int((ks * d - d) / 2)
+class DiscriminatorP(Model):
+    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
         super().__init__()
         self.period = period
-        self.stride = stride
-        self.kernel_size = kernel_size
-        self.norm_f = weight_norm if use_spectral_norm == False else spectral_norm
-        self.channels = [32, 128, 512, 1024, 1024]
-        self.first_pass = nn.Sequential(
-            self.norm_f(
-                nn.Conv2d(
-                    1, self.channels[0], (kernel_size, 1), (stride, 1), padding=(2, 0)
-                )
-            ),
-            nn.LeakyReLU(0.1),
-        )
+        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
         self.convs = nn.ModuleList(
             [
-                self._get_next(self.channels[i + 1], self.channels[i], i == 3)
-                for i in range(4)
+                norm_f(
+                    nn.Conv2d(
+                        1,
+                        32,
+                        (kernel_size, 1),
+                        (stride, 1),
+                        padding=(get_padding(5, 1), 0),
+                    )
+                ),
+                norm_f(
+                    nn.Conv2d(
+                        32,
+                        128,
+                        (kernel_size, 1),
+                        (stride, 1),
+                        padding=(get_padding(5, 1), 0),
+                    )
+                ),
+                norm_f(
+                    nn.Conv2d(
+                        128,
+                        512,
+                        (kernel_size, 1),
+                        (stride, 1),
+                        padding=(get_padding(5, 1), 0),
+                    )
+                ),
+                norm_f(
+                    nn.Conv2d(
+                        512,
+                        1024,
+                        (kernel_size, 1),
+                        (stride, 1),
+                        padding=(get_padding(5, 1), 0),
+                    )
+                ),
+                norm_f(nn.Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))),
             ]
         )
+        self.conv_post = norm_f(nn.Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
+        self.activation = nn.LeakyReLU(0.1)
-        self.post_conv = nn.Conv2d(1024, 1, (stride, 1), 1, padding=(1, 0))
-    def _get_next(self, out_dim: int, last_in: int, is_last: bool = False):
-        stride = (self.stride, 1) if not is_last else 1
-        return nn.Sequential(
-            self.norm_f(
-                nn.Conv2d(
-                    last_in,
-                    out_dim,
-                    (self.kernel_size, 1),
-                    stride,
-                    padding=(2, 0),
-                )
-            ),
-            nn.LeakyReLU(0.1),
+    def forward(self, x):
+        fmap = []
+        # 1d to 2d
+        b, c, t = x.shape
+        if t % self.period != 0:  # pad first
+            n_pad = self.period - (t % self.period)
+            x = F.pad(x, (0, n_pad), "reflect")
+            t = t + n_pad
+        x = x.view(b, c, t // self.period, self.period)
+        for l in self.convs:
+            x = l(x)
+            x = self.activation(x)
+            fmap.append(x)
+        x = self.conv_post(x)
+        fmap.append(x)
+        x = torch.flatten(x, 1, -1)
+        return x, fmap
+class MultiPeriodDiscriminator(Model):
+    def __init__(self):
+        super().__init__()
+        self.discriminators = nn.ModuleList(
+            [
+                DiscriminatorP(2),
+                DiscriminatorP(3),
+                DiscriminatorP(5),
+                DiscriminatorP(7),
+                DiscriminatorP(11),
+            ]
         )
-    def forward(self, x: torch.Tensor):
-        """
-        x: (B, T)
-        """
-        b, t = x.shape
-        if t % self.period != 0:
-            pad_len = self.period - (t % self.period)
-            x = F.pad(x, (0, pad_len), mode="reflect")
-            t = t + pad_len
-        x = x.view(b, 1, t // self.period, self.period)  # (B, 1, T//P, P)
-        f_map = []
-        x = self.first_pass(x)
-        f_map.append(x)
-        for conv in self.convs:
-            x = conv(x)
-            f_map.append(x)
-        x = self.post_conv(x)
-        f_map.append(x)
-        return x.flatten(1, -1), f_map
-class ScaleDiscriminator(nn.Module):
+    def forward(self, y, y_hat):
+        y_d_rs = []
+        y_d_gs = []
+        fmap_rs = []
+        fmap_gs = []
+        for i, d in enumerate(self.discriminators):
+            y_d_r, fmap_r = d(y)
+            y_d_g, fmap_g = d(y_hat)
+            y_d_rs.append(y_d_r)
+            fmap_rs.append(fmap_r)
+            y_d_gs.append(y_d_g)
+            fmap_gs.append(fmap_g)
+        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+class DiscriminatorS(Model):
     def __init__(self, use_spectral_norm=False):
         super().__init__()
         norm_f = weight_norm if use_spectral_norm == False else spectral_norm
-        self.activation = nn.LeakyReLU(0.1)
         self.convs = nn.ModuleList(
             [
                 norm_f(nn.Conv1d(1, 128, 15, 1, padding=7)),
@@ -92,105 +125,190 @@ class ScaleDiscriminator(nn.Module):
                 norm_f(nn.Conv1d(1024, 1024, 5, 1, padding=2)),
             ]
         )
-        self.post_conv = norm_f(nn.Conv1d(1024, 1, 3, 1, padding=1))
+        self.activation = nn.LeakyReLU(0.1)
+        self.conv_post = norm_f(nn.Conv1d(1024, 1, 3, 1, padding=1))
-    def forward(self, x: torch.Tensor):
-        """
-        x: (B, T)
-        """
-        f_map = []
-        x = x.unsqueeze(1)  # (B, 1, T)
-        for conv in self.convs:
-            x = self.activation(conv(x))
-            f_map.append(x)
-        x = self.post_conv(x)
-        f_map.append(x)
-        return x.flatten(1, -1), f_map
+    def forward(self, x):
+        fmap = []
+        for l in self.convs:
+            x = l(x)
+            x = self.activation(x)
+            fmap.append(x)
+        x = self.conv_post(x)
+        fmap.append(x)
+        x = torch.flatten(x, 1, -1)
+        return x, fmap
 class MultiScaleDiscriminator(Model):
-    def __init__(self, layers: int = 3):
+    def __init__(self):
         super().__init__()
-        self.pooling = nn.AvgPool1d(4, 2, padding=2)
         self.discriminators = nn.ModuleList(
-            [ScaleDiscriminator(i == 0) for i in range(layers)]
+            [
+                DiscriminatorS(use_spectral_norm=True),
+                DiscriminatorS(),
+                DiscriminatorS(),
+            ]
+        )
+        self.meanpools = nn.ModuleList(
+            [nn.AvgPool1d(4, 2, padding=2), nn.AvgPool1d(4, 2, padding=2)]
         )
-    def forward(self, x: torch.Tensor):
-        """
-        x: (B, T)
-        Returns: list of outputs from each scale discriminator
-        """
-        outputs = []
-        features = []
+    def forward(self, y, y_hat):
+        y_d_rs = []
+        y_d_gs = []
+        fmap_rs = []
+        fmap_gs = []
         for i, d in enumerate(self.discriminators):
             if i != 0:
-                x = self.pooling(x)
-            out, f_map = d(x)
-            outputs.append(out)
-            features.append(f_map)
-        return outputs, features
+                y = self.meanpools[i - 1](y)
+                y_hat = self.meanpools[i - 1](y_hat)
+            y_d_r, fmap_r = d(y)
+            y_d_g, fmap_g = d(y_hat)
+            y_d_rs.append(y_d_r)
+            fmap_rs.append(fmap_r)
+            y_d_gs.append(y_d_g)
+            fmap_gs.append(fmap_g)
+        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
-class MultiPeriodDiscriminator(Model):
-    def __init__(self, periods: List[int] = [2, 3, 5, 7, 11]):
-        super().__init__()
-        self.discriminators = nn.ModuleList([PeriodDiscriminator(p) for p in periods])
-    def forward(self, x: torch.Tensor):
+class MultiResolutionDiscriminator(Model):
+    """Source: https://github.com/gemelo-ai/vocos/blob/main/vocos/discriminators.py"""
+    def __init__(
+        self,
+        fft_sizes: Tuple[int, ...] = (2048, 1024, 512),
+        num_embeddings: Optional[int] = None,
+    ):
         """
-        x: (B, T)
-        Returns: list of tuples of outputs from each period discriminator and the f_map.
+        Args:
+            fft_sizes (tuple[int]): Tuple of window lengths for FFT. Defaults to (2048, 1024, 512).
+            num_embeddings (int, optional): Number of embeddings. None means non-conditional discriminator.
+                Defaults to None.
         """
-        # torch.log(torch.clip(x, min=clip_val))
-        out_map = []
-        feat_map = []
+        super().__init__()
+        self.discriminators = nn.ModuleList(
+            [
+                DiscriminatorR(window_length=w, num_embeddings=num_embeddings)
+                for w in fft_sizes
+            ]
+        )
+    def forward(
+        self, y: torch.Tensor, y_hat: torch.Tensor, bandwidth_id: torch.Tensor = None
+    ) -> Tuple[
+        List[torch.Tensor],
+        List[torch.Tensor],
+        List[List[torch.Tensor]],
+        List[List[torch.Tensor]],
+    ]:
+        y_d_rs = []
+        y_d_gs = []
+        fmap_rs = []
+        fmap_gs = []
         for d in self.discriminators:
-            out, feat = d(x)
-            out_map.append(out)
-            feat_map.append(feat)
-        return out_map, feat_map
-def discriminator_loss(real_out_map, fake_out_map):
-    loss = 0.0
-    rl, fl = [], []
-    for real_out, fake_out in zip(real_out_map, fake_out_map):
-        real_loss = torch.mean((1.0 - real_out) ** 2)
-        fake_loss = torch.mean(fake_out**2)
-        loss += real_loss + fake_loss
-        rl.append(real_loss.item())
-        fl.append(fake_loss.item())
-    return loss, sum(rl), sum(fl)
-def generator_adv_loss(fake_disc_outputs: List[Tensor]):
-    loss = 0.0
-    for fake_out in fake_disc_outputs:
-        fake_score = fake_out[0]
-        loss += -torch.mean(fake_score)
-    return loss
-def feature_loss(
-    fmap_r,
-    fmap_g,
-    weight=2.0,
-    loss_fn: Callable[[Tensor, Tensor], Tensor] = F.l1_loss,
-):
-    loss = 0.0
-    for dr, dg in zip(fmap_r, fmap_g):
-        for rl, gl in zip(dr, dg):
-            loss += loss_fn(rl - gl)
-    return loss * weight
-def generator_loss(disc_generated_outputs):
-    loss = 0.0
-    gen_losses = []
-    for dg in disc_generated_outputs:
-        l = torch.mean((1.0 - dg) ** 2)
-        gen_losses.append(l.item())
-        loss += l
-    return loss, gen_losses
+            y_d_r, fmap_r = d(x=y, cond_embedding_id=bandwidth_id)
+            y_d_g, fmap_g = d(x=y_hat, cond_embedding_id=bandwidth_id)
+            y_d_rs.append(y_d_r)
+            fmap_rs.append(fmap_r)
+            y_d_gs.append(y_d_g)
+            fmap_gs.append(fmap_g)
+        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+class DiscriminatorR(Model):
+    def __init__(
+        self,
+        window_length: int,
+        num_embeddings: Optional[int] = None,
+        channels: int = 32,
+        hop_factor: float = 0.25,
+        bands: Tuple[Tuple[float, float], ...] = (
+            (0.0, 0.1),
+            (0.1, 0.25),
+            (0.25, 0.5),
+            (0.5, 0.75),
+            (0.75, 1.0),
+        ),
+    ):
+        super().__init__()
+        self.window_length = window_length
+        self.hop_factor = hop_factor
+        self.spec_fn = torchaudio.transforms.Spectrogram(
+            n_fft=window_length,
+            hop_length=int(window_length * hop_factor),
+            win_length=window_length,
+            power=None,
+        )
+        n_fft = window_length // 2 + 1
+        bands = [(int(b[0] * n_fft), int(b[1] * n_fft)) for b in bands]
+        self.bands = bands
+        convs = lambda: nn.ModuleList(
+            [
+                weight_norm(nn.Conv2d(2, channels, (3, 9), (1, 1), padding=(1, 4))),
+                weight_norm(
+                    nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))
+                ),
+                weight_norm(
+                    nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))
+                ),
+                weight_norm(
+                    nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))
+                ),
+                weight_norm(
+                    nn.Conv2d(channels, channels, (3, 3), (1, 1), padding=(1, 1))
+                ),
+            ]
+        )
+        self.band_convs = nn.ModuleList([convs() for _ in range(len(self.bands))])
+        if num_embeddings is not None:
+            self.emb = torch.nn.Embedding(
+                num_embeddings=num_embeddings, embedding_dim=channels
+            )
+            torch.nn.init.zeros_(self.emb.weight)
+        self.conv_post = weight_norm(
+            nn.Conv2d(channels, 1, (3, 3), (1, 1), padding=(1, 1))
+        )
+    def spectrogram(self, x):
+        # Remove DC offset
+        x = x - x.mean(dim=-1, keepdims=True)
+        # Peak normalize the volume of input audio
+        x = 0.8 * x / (x.abs().max(dim=-1, keepdim=True)[0] + 1e-9)
+        x = self.spec_fn(x)
+        x = torch.view_as_real(x)
+        x = rearrange(x, "b f t c -> b c t f")
+        # Split into bands
+        x_bands = [x[..., b[0] : b[1]] for b in self.bands]
+        return x_bands
+    def forward(self, x: torch.Tensor, cond_embedding_id: torch.Tensor = None):
+        x_bands = self.spectrogram(x)
+        fmap = []
+        x = []
+        for band, stack in zip(x_bands, self.band_convs):
+            for i, layer in enumerate(stack):
+                band = layer(band)
+                band = torch.nn.functional.leaky_relu(band, 0.1)
+                if i > 0:
+                    fmap.append(band)
+            x.append(band)
+        x = torch.cat(x, dim=-1)
+        if cond_embedding_id is not None:
+            emb = self.emb(cond_embedding_id)
+            h = (emb.view(1, -1, 1, 1) * x).sum(dim=1, keepdims=True)
+        else:
+            h = 0
+        x = self.conv_post(x)
+        fmap.append(x)
+        x += h
+        return x, fmap

lt_tensor/model_zoo/features.py CHANGED Viewed

@@ -323,8 +323,11 @@ class AudioEncoder(Model):
     def __init__(
         self,
-        channels: int = 80,
+        channels: int,
         alpha: float = 4.0,
+        feat_channels: int = 64,
+        out_features: Optional[int] = None,
+        out_channels: int = 1,
         interp_mode: Literal[
             "nearest",
             "linear",
@@ -338,16 +341,60 @@ class AudioEncoder(Model):
         self.net = nn.Sequential(
             nn.Conv1d(
-                channels, channels, kernel_size=3, stride=2, padding=5, groups=channels
+                channels, feat_channels, kernel_size=3, stride=1, padding=5, groups=1
             ),
             nn.LeakyReLU(0.1),
-            nn.Conv1d(channels, channels, kernel_size=7, stride=1, padding=1, groups=1),
+            nn.Conv1d(
+                feat_channels,
+                feat_channels,
+                kernel_size=3,
+                stride=2,
+                padding=1,
+                groups=feat_channels,
+            ),
+            nn.LeakyReLU(0.1),
+            nn.Conv1d(
+                feat_channels,
+                feat_channels,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                groups=feat_channels // 8,
+            ),
+            nn.LeakyReLU(0.1),
+            nn.Conv1d(
+                feat_channels,
+                feat_channels,
+                kernel_size=7,
+                stride=1,
+                padding=1,
+                groups=1,
+            ),
         )
-        self.fc = nn.Linear(channels, channels)
+        self.fc = nn.Linear(feat_channels, channels)
+        self.feat_channels = feat_channels
         self.activation = activation
         self.channels = channels
         self.mode = interp_mode
         self.alpha = alpha
+        self.post_conv = nn.Conv1d(
+            channels,
+            out_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            dilation=1,
+            groups=1,
+            bias=True,
+        )
+        if out_features is not None:
+            self.format_out = lambda tensor: F.interpolate(
+                tensor,
+                size=out_features,
+                mode=interp_mode,
+            )
+        else:
+            self.format_out = nn.Identity()
     def forward(self, mels: Tensor, cr_audio: Tensor):
         sin = torch.asin(cr_audio)
@@ -367,14 +414,20 @@ class AudioEncoder(Model):
             .contiguous()
         )
         x = self.activation(x)
-        return self.fc(x).transpose(-1, -2)
+        xt = self.fc(x).transpose(-1, -2)
+        out = self.post_conv(xt)
+        return self.format_out(out)
 class AudioEncoderAttn(Model):
     def __init__(
         self,
-        channels: int = 80,
+        channels: int,
+        feat_channels: int = 64,
         alpha: float = 4.0,
+        out_channels: Optional[int] = None,
+        out_features: int = 1,
         interp_mode: Literal[
             "nearest",
             "linear",
@@ -388,16 +441,54 @@ class AudioEncoderAttn(Model):
         self.net = nn.Sequential(
             nn.Conv1d(
-                channels, channels, kernel_size=3, stride=2, padding=5, groups=channels
+                channels, feat_channels, kernel_size=3, stride=1, padding=1, groups=1
+            ),
+            nn.LeakyReLU(0.1),
+            nn.Conv1d(
+                feat_channels,
+                feat_channels,
+                kernel_size=3,
+                stride=2,
+                padding=5,
+                groups=feat_channels,
             ),
             nn.LeakyReLU(0.1),
-            nn.Conv1d(channels, channels, kernel_size=7, stride=1, padding=1, groups=1),
+            nn.Conv1d(
+                feat_channels,
+                feat_channels,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                groups=feat_channels // 8,
+            ),
+            nn.LeakyReLU(0.1),
+            nn.Conv1d(
+                feat_channels, channels, kernel_size=7, stride=1, padding=1, groups=1
+            ),
         )
         self.fusion = CrossAttentionFusion(channels, channels, 2, d_model=channels)
         self.channels = channels
         self.mode = interp_mode
         self.alpha = alpha
         self.activation = activation
+        self.post_conv = nn.Conv1d(
+            channels,
+            out_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            dilation=1,
+            groups=1,
+            bias=True,
+        )
+        if out_features is not None:
+            self.format_out = lambda tensor: F.interpolate(
+                tensor,
+                size=out_features,
+                mode=interp_mode,
+            )
+        else:
+            self.format_out = nn.Identity()
     def forward(self, mels: Tensor, cr_audio: Tensor):
         sin = torch.asin(cr_audio)
@@ -408,9 +499,9 @@ class AudioEncoderAttn(Model):
         )
         x = self.activation(self.net(mod))
         x = F.interpolate(x, size=mels.shape[-1], mode=self.mode)
-        # Ensure contiguous before transpose
         x_t = x.transpose(-2, -1).contiguous()
         mels_t = mels.transpose(-2, -1).contiguous()
-        return self.fusion(x_t, mels_t).transpose(-2, -1)
+        xt = self.fusion(x_t, mels_t).transpose(-2, -1)
+        out = self.post_conv(xt)
+        return self.format_out(out)

lt-tensor 0.0.1a14__py3-none-any.whl → 0.0.1a15__py3-none-any.whl

lt-tensor 0.0.1a14__py3-none-any.whl → 0.0.1a15__py3-none-any.whl