PyPI - lt-tensor - Versions diffs - 0.0.1a37__tar.gz → 0.0.1a39__tar.gz - Mend

lt-tensor 0.0.1a37__tar.gz → 0.0.1a39__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (49)

{lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: lt-tensor
-Version: 0.0.1a37
+Version: 0.0.1a39
 Summary: General utilities for PyTorch and others. Built for general use.
 Home-page: https://github.com/gr1336/lt-tensor/
 Author: gr1336

{lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/__init__.py RENAMED Viewed

@@ -1,4 +1,4 @@
-__version__ = "0.0.1a37"
+__version__ = "0.0.1a39"
 from . import (
     lr_schedulers,

{lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/losses.py RENAMED Viewed

@@ -133,7 +133,7 @@ class MultiMelScaleLoss(Model):
         loss_mel_fn: Callable[[Tensor, Tensor], Tensor] = nn.L1Loss(),
         loss_pitch_fn: Callable[[Tensor, Tensor], Tensor] = nn.L1Loss(),
         loss_rms_fn: Callable[[Tensor, Tensor], Tensor] = nn.L1Loss(),
-        center: bool = True,
+        center: bool = False,
         power: float = 1.0,
         normalized: bool = False,
         pad_mode: str = "reflect",
@@ -149,6 +149,7 @@ class MultiMelScaleLoss(Model):
         lambda_rms: float = 1.0,
         lambda_pitch: float = 1.0,
         weight: float = 1.0,
+        mel: Literal["librosa", "torch"] = "torch",
     ):
         super().__init__()
         assert (
@@ -188,6 +189,7 @@ class MultiMelScaleLoss(Model):
             onesided,
             std,
             mean,
+            mel,
         )
     def _setup_mels(
@@ -206,6 +208,7 @@ class MultiMelScaleLoss(Model):
         onesided: Optional[bool],
         std: int,
         mean: int,
+        mel: str,
     ):
         assert (
             len(n_mels)
@@ -224,6 +227,7 @@ class MultiMelScaleLoss(Model):
             pad_mode=pad_mode,
             std=std,
             mean=mean,
+            mel_default=mel,
         )
         self.mel_spectrograms: List[AudioProcessor] = nn.ModuleList(
             [
@@ -247,12 +251,14 @@ class MultiMelScaleLoss(Model):
     def forward(
         self, input_wave: torch.Tensor, target_wave: torch.Tensor
     ) -> torch.Tensor:
-        assert self.use_istft_norm or input_wave.shape[-1] == target_wave.shape[-1]
+        assert self.use_istft_norm or input_wave.shape[-1] == target_wave.shape[-1], (
+            f"Size mismatch! input_wave {input_wave.shape[-1]} must match target_wave: {target_wave.shape[-1]}. "
+            "Alternatively 'use_istft_norm' can be set to Trie with will automatically force the audio to that size."
+        )
         target_wave = target_wave.to(input_wave.device)
         losses = 0.0
         for M in self.mel_spectrograms:
-            # Apply normalization if requested
-            if self.use_istft_norm:
+            if self.use_istft_norm and input_proc.shape[-1] != target_proc.shape[-1]:
                 input_proc = M.istft_norm(input_wave, length=target_wave.shape[-1])
                 target_proc = M.istft_norm(target_wave, length=target_wave.shape[-1])
             else:

{lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/model_zoo/audio_models/diffwave/__init__.py RENAMED Viewed

@@ -1,14 +1,15 @@
-__all__ = ["DiffWave", "DiffWaveConfig", "SpectrogramUpsample", "DiffusionEmbedding"]
+__all__ = ["DiffWave", "DiffWaveConfig", "SpectrogramUpsampler", "DiffusionEmbedding"]
 import numpy as np
 from lt_tensor.torch_commons import *
 from torch.nn import functional as F
 from lt_tensor.config_templates import ModelConfig
 from lt_tensor.torch_commons import *
-from lt_tensor.model_zoo.convs import ConvNets, Conv1dEXT
+from lt_tensor.model_zoo.convs import ConvNets, ConvEXT
 from lt_tensor.model_base import Model
 from math import sqrt
 from lt_utils.common import *
+from lt_tensor.misc_utils import log_tensor
 class DiffWaveConfig(ModelConfig):
@@ -21,12 +22,8 @@ class DiffWaveConfig(ModelConfig):
     unconditional = False
     apply_norm: Optional[Literal["weight", "spectral"]] = None
     apply_norm_resblock: Optional[Literal["weight", "spectral"]] = None
-    noise_schedule: list[int] = np.linspace(1e-4, 0.05, 50).tolist()
+    noise_schedule: list[int] = np.linspace(1e-4, 0.05, 25).tolist()
     # settings for auto-fixes
-    interpolate = False
-    interpolation_mode: Literal[
-        "nearest", "linear", "bilinear", "bicubic", "trilinear", "area", "nearest-exact"
-    ] = "nearest"
     def __init__(
         self,
@@ -37,16 +34,6 @@ class DiffWaveConfig(ModelConfig):
         dilation_cycle_length=10,
         unconditional=False,
         noise_schedule: list[int] = np.linspace(1e-4, 0.05, 50).tolist(),
-        interpolate_cond=False,
-        interpolation_mode: Literal[
-            "nearest",
-            "linear",
-            "bilinear",
-            "bicubic",
-            "trilinear",
-            "area",
-            "nearest-exact",
-        ] = "nearest",
         apply_norm: Optional[Literal["weight", "spectral"]] = None,
         apply_norm_resblock: Optional[Literal["weight", "spectral"]] = None,
     ):
@@ -58,8 +45,6 @@ class DiffWaveConfig(ModelConfig):
             "residual_channels": residual_channels,
             "unconditional": unconditional,
             "noise_schedule": noise_schedule,
-            "interpolate": interpolate_cond,
-            "interpolation_mode": interpolation_mode,
             "apply_norm": apply_norm,
             "apply_norm_resblock": apply_norm_resblock,
         }
@@ -102,19 +87,34 @@ class DiffusionEmbedding(Model):
         return table
-class SpectrogramUpsample(Model):
+class SpectrogramUpsampler(Model):
     def __init__(self):
         super().__init__()
-        self.conv1 = nn.ConvTranspose2d(1, 1, [3, 32], stride=[1, 16], padding=[1, 8])
-        self.conv2 = nn.ConvTranspose2d(1, 1, [3, 32], stride=[1, 16], padding=[1, 8])
-        self.activation = nn.LeakyReLU(0.4)
+        self.conv_net = nn.Sequential(
+            ConvEXT(
+                1,
+                1,
+                [3, 32],
+                stride=[1, 16],
+                padding=[1, 8],
+                module_type="2d",
+                transpose=True,
+            ),
+            nn.LeakyReLU(0.1),
+            ConvEXT(
+                1,
+                1,
+                [3, 32],
+                stride=[1, 16],
+                padding=[1, 8],
+                module_type="2d",
+                transpose=True,
+            ),
+            nn.LeakyReLU(0.1),
+        )
-    def forward(self, x):
-        x = torch.unsqueeze(x, 1)
-        x = self.activation(self.conv1(x))
-        x = self.activation(self.conv2(x))
-        x = torch.squeeze(x, 1)
-        return x
+    def forward(self, x: Tensor):
+        return self.conv_net(x.unsqueeze(0)).squeeze(1)
 class ResidualBlock(Model):
@@ -133,7 +133,7 @@ class ResidualBlock(Model):
         :param uncond: disable spectrogram conditional
         """
         super().__init__()
-        self.dilated_conv = Conv1dEXT(
+        self.dilated_conv = ConvEXT(
             residual_channels,
             2 * residual_channels,
             3,
@@ -142,18 +142,18 @@ class ResidualBlock(Model):
             apply_norm=apply_norm,
         )
         self.diffusion_projection = nn.Linear(512, residual_channels)
-        if not uncond:  # conditional model
-            self.conditioner_projection = Conv1dEXT(
+        self.uncoditional = uncond
+        self.conditioner_projection = None
+        if not uncond:
+            self.conditioner_projection = ConvEXT(
                 n_mels,
                 2 * residual_channels,
                 1,
                 apply_norm=apply_norm,
             )
-        else:  # unconditional model
-            self.conditioner_projection = None
-        self.output_projection = Conv1dEXT(
-            residual_channels, 2 * residual_channels, 1, apply_norm == apply_norm
+        self.output_projection = ConvEXT(
+            residual_channels, 2 * residual_channels, 1, apply_norm=apply_norm
         )
     def forward(
@@ -164,20 +164,15 @@ class ResidualBlock(Model):
     ):
         diffusion_step = self.diffusion_projection(diffusion_step).unsqueeze(-1)
-        y = x + diffusion_step
-        if (
-            conditioner is None or self.conditioner_projection is None
-        ):  # using a unconditional model
-            y = self.dilated_conv(y)
-        else:
-            conditioner = self.conditioner_projection(conditioner)
-            y = self.dilated_conv(y) + conditioner
-        gate, filter = torch.chunk(y, 2, dim=1)
-        y = torch.sigmoid(gate) * torch.tanh(filter)
+        y = (x + diffusion_step).squeeze(1)
+        y = self.dilated_conv(y)
+        if not self.uncoditional and conditioner is not None:
+            y = y + self.conditioner_projection(conditioner)
+        gate, _filter = y.chunk(2, dim=1)
+        y = gate.sigmoid() * _filter.tanh()
         y = self.output_projection(y)
-        residual, skip = torch.chunk(y, 2, dim=1)
+        residual, skip = y.chunk(2, dim=1)
         return (x + residual) / sqrt(2.0), skip
@@ -186,19 +181,17 @@ class DiffWave(Model):
         super().__init__()
         self.params = params
         self.n_hop = self.params.hop_samples
-        self.interpolate = self.params.interpolate
-        self.interpolate_mode = self.params.interpolation_mode
-        self.input_projection = Conv1dEXT(
+        self.input_projection = ConvEXT(
             in_channels=1,
             out_channels=params.residual_channels,
             kernel_size=1,
             apply_norm=self.params.apply_norm,
+            activation_out=nn.LeakyReLU(0.1),
         )
         self.diffusion_embedding = DiffusionEmbedding(len(params.noise_schedule))
-        if self.params.unconditional:  # use unconditional model
-            self.spectrogram_upsample = None
-        else:
-            self.spectrogram_upsample = SpectrogramUpsample()
+        self.spectrogram_upsampler = (
+            SpectrogramUpsampler() if not self.params.unconditional else None
+        )
         self.residual_layers = nn.ModuleList(
             [
@@ -212,18 +205,18 @@ class DiffWave(Model):
                 for i in range(params.residual_layers)
             ]
         )
-        self.skip_projection = Conv1dEXT(
+        self.skip_projection = ConvEXT(
             in_channels=params.residual_channels,
             out_channels=params.residual_channels,
             kernel_size=1,
             apply_norm=self.params.apply_norm,
+            activation_out=nn.LeakyReLU(0.1),
         )
-        self.output_projection = Conv1dEXT(
-            params.residual_channels, 1, 1, apply_norm=self.params.apply_norm
+        self.output_projection = ConvEXT(
+            params.residual_channels, 1, 1, apply_norm=self.params.apply_norm, init_weights=True,
         )
         self.activation = nn.LeakyReLU(0.1)
-        self.r_sqrt = sqrt(len(self.residual_layers))
-        nn.init.zeros_(self.output_projection.weight)
+        self._res_d = sqrt(len(self.residual_layers))
     def forward(
         self,
@@ -231,31 +224,25 @@ class DiffWave(Model):
         diffusion_step: Tensor,
         spectrogram: Optional[Tensor] = None,
     ):
-        T = x.shape[-1]
-        if x.ndim == 2:
-            x = audio.unsqueeze(1)
-        x = self.activation(self.input_projection(x))
+        if not self.params.unconditional:
+            assert spectrogram is not None
+        if audio.ndim < 3:
+            if audio.ndim == 2:
+                audio = audio.unsqueeze(1)
+            else:
+                audio = audio.unsqueeze(0).unsqueeze(0)
+        x = self.input_projection(audio)
         diffusion_step = self.diffusion_embedding(diffusion_step)
-        if spectrogram is not None and self.spectrogram_upsample is not None:
-            if self.auto_interpolate:
-                # a little heavy, but helps a lot to fix mismatched shapes,
-                # not always recommended due to data loss
-                spectrogram = F.interpolate(
-                    input=spectrogram,
-                    size=int(T * self.n_hop),
-                    mode=self.interpolate_mode,
-                )
-            spectrogram = self.spectrogram_upsample(spectrogram)
+        if not self.params.unconditional:  # use conditional model
+            spectrogram = self.spectrogram_upsampler(spectrogram)
-        skip = None
+        skip = torch.zeros_like(x, device=x.device)
         for i, layer in enumerate(self.residual_layers):
             x, skip_connection = layer(x, diffusion_step, spectrogram)
-            if i == 0:
-                skip = skip_connection
-            else:
-                skip = skip_connection + skip
-        x = skip / self.r_sqrt
-        x = self.activation(self.skip_projection(x))
+            skip += skip_connection
+        x = skip / self._res_d
+        x = self.skip_projection(x)
         x = self.output_projection(x)
         return x

{lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/model_zoo/convs.py RENAMED Viewed

@@ -1,4 +1,4 @@
-__all__ = ["ConvNets", "Conv1dEXT"]
+__all__ = ["ConvNets", "ConvEXT"]
 import math
 from lt_utils.common import *
 import torch.nn.functional as F
@@ -6,6 +6,7 @@ from lt_tensor.torch_commons import *
 from lt_tensor.model_base import Model
 from lt_tensor.misc_utils import log_tensor
 from lt_tensor.model_zoo.fusion import AdaFusion1D, AdaIN1D
+from lt_utils.misc_utils import default
 def spectral_norm_select(module: nn.Module, enabled: bool):
@@ -52,10 +53,7 @@ class ConvNets(Model):
             m.weight.data.normal_(mean, std)
-class Conv1dEXT(ConvNets):
-    # TODO: Use this module to replace all that are using normalizations, mostly those in `audio_models`
+class ConvEXT(ConvNets):
     def __init__(
         self,
         in_channels: int,
@@ -72,6 +70,10 @@ class Conv1dEXT(ConvNets):
         apply_norm: Optional[Literal["weight", "spectral"]] = None,
         activation_in: nn.Module = nn.Identity(),
         activation_out: nn.Module = nn.Identity(),
+        module_type: Literal["1d", "2d", "3d"] = "1d",
+        transpose: bool = False,
+        weight_init: Optional[Callable[[nn.Module], None]] = None,
+        init_weights: bool = True,
         *args,
         **kwargs,
     ):
@@ -91,23 +93,30 @@ class Conv1dEXT(ConvNets):
             device=device,
             dtype=dtype,
         )
+        match module_type.lower():
+            case "1d":
+                md = nn.Conv1d if not transpose else nn.ConvTranspose1d
+            case "2d":
+                md = nn.Conv2d if not transpose else nn.ConvTranspose2d
+            case "3d":
+                md = nn.Conv3d if not transpose else nn.ConvTranspose3d
+            case _:
+                raise ValueError(
+                    f"module_type {module_type} is not a valid module type! use '1d', '2d' or '3d'"
+                )
         if apply_norm is None:
-            self.cnn = nn.Conv1d(**cnn_kwargs)
-            self.has_wn = False
+            self.cnn = md(**cnn_kwargs)
         else:
-            self.has_wn = True
             if apply_norm == "spectral":
-                self.cnn = spectral_norm(nn.Conv1d(**cnn_kwargs))
+                self.cnn = spectral_norm(md(**cnn_kwargs))
             else:
-                self.cnn = weight_norm(nn.Conv1d(**cnn_kwargs))
+                self.cnn = weight_norm(md(**cnn_kwargs))
         self.actv_in = activation_in
         self.actv_out = activation_out
-        self.cnn.apply(self.init_weights)
+        if init_weights:
+            weight_init = default(weight_init, self.init_weights)
+            self.cnn.apply(weight_init)
     def forward(self, input: Tensor):
         return self.actv_out(self.cnn(self.actv_in(input)))
-    def remove_norms(self, name="weight"):
-        if self.has_wn:
-            remove_norm(self.cnn, name)
-            self.has_wn = False

lt_tensor-0.0.1a39/lt_tensor/model_zoo/losses/_envelope_disc/__init__.py ADDED Viewed

@@ -0,0 +1,116 @@
+""" Modified from: https://github.com/dinhoitt/BemaGANv2/blob/9560ae9df153c956f259c261c57c4f84f89e3d72/envelope.py
+MIT License
+Copyright (c) 2025 Taseoo Park
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+from lt_utils.common import *
+from lt_tensor.torch_commons import *
+from lt_tensor.model_base import Model
+class Envelope(Model):
+    def __init__(self, max_freq: int, sample_rate: Number = 24000, cut_off: int = 0):
+        super().__init__()
+        self.sr = sample_rate
+        self.max_freq = max_freq
+        self.setup_low_pass_fn(max_freq, cut_off)
+    def forward(self, x: torch.Tensor):
+        if not self.max_freq:
+            return x
+        return self.lp_fn(x)
+    def _ft_signal(self, signal: torch.Tensor):
+        filtered_signal = self.butterwort_lowpass_filter(signal)
+        return torch.abs(self.hilbert(filtered_signal))
+    def setup_low_pass_fn(self, max_freq: int, cutoff: int = 0):
+        self.max_freq = int(max_freq)
+        cutoff = self.max_freq if cutoff == 0 else cutoff
+        self.lp_fn = self.hilbert if self.max_freq in [-1, 1] else self._ft_signal
+        self.setup_butterwort_lowpass_coefficients(cutoff)
+    def hilbert(self, signal: Tensor) -> Tensor:
+        """Implementing the Hilbert transform manually"""
+        N = signal.shape[2]  # Signal length
+        FFT_signal = torch.fft.fft(signal, axis=2)
+        h = torch.zeros_like(
+            signal
+        )  # Generate an array with the same shape as the signal
+        if N % 2 == 0:
+            h[:, 0, 0] = 1
+            h[:, 0, N // 2] = 1
+            h[:, 0, 1 : N // 2] = 2
+        else:
+            h[:, 0, 0] = 1
+            h[:, 0, 1 : (N + 1) // 2] = 2
+        out: Tensor = torch.fft.ifft(FFT_signal * h, axis=2)
+        if self.max_freq == -1:
+            return -out.abs()
+        return -out.abs()
+    def butterwort_lowpass_filter(self, signal):
+        filtered_signal = torch.zeros_like(signal)
+        # Applying the filter to the signal
+        for n in range(len(signal)):
+            if n < 2:
+                filtered_signal[n] = self.lp_coef_a[0] * signal[n]
+            else:
+                filtered_signal[n] = (
+                    self.lp_coef_b[0] * signal[n]
+                    + self.lp_coef_b[1] * signal[n - 1]
+                    + self.lp_coef_b[2] * signal[n - 2]
+                    - self.lp_coef_a[1] * filtered_signal[n - 1]
+                    - self.lp_coef_a[2] * filtered_signal[n - 2]
+                )
+        return filtered_signal
+    def setup_butterwort_lowpass_coefficients(self, cutoff: int):
+        cutoff = torch.tensor([cutoff], dtype=torch.float64)
+        fs = torch.tensor([self.sr], dtype=torch.float64)
+        omega = torch.tan(torch.pi * cutoff / fs)
+        # Convert float 2 to tensor
+        sqrt2 = torch.tensor(2.0, dtype=torch.float64).sqrt()
+        sq_omega = sqrt2 * omega + omega**2
+        # Transfer function coefficients using the bilinear transform
+        a = 2 * (omega**2 - 1) / (1 + sq_omega)
+        self.register_buffer(
+            "lp_coef_a",
+            torch.tensor(
+                [1.0, a.item(), ((1 - sq_omega) / (1 + sq_omega)).item()],
+                dtype=torch.float64,
+                device=self.device,
+            ),
+        )
+        b = omega**2 / (1 + sq_omega)
+        self.register_buffer(
+            "lp_coef_b",
+            torch.tensor(
+                [b.item(), (2 * b).item(), b.item()],
+                dtype=torch.float64,
+                device=self.device,
+            ),
+        )

{lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/model_zoo/losses/discriminators.py RENAMED Viewed

@@ -7,6 +7,7 @@ from lt_tensor.model_base import Model
 from lt_tensor.model_zoo.convs import ConvNets
 from torch.nn import functional as F
 from torchaudio import transforms as T
+from lt_tensor.model_zoo.losses._envelope_disc import Envelope
 MULTI_DISC_OUT_TYPE: TypeAlias = Tuple[
     List[Tensor],
@@ -313,7 +314,7 @@ class DiscriminatorS(ConvNets):
         return x.flatten(1, -1), fmap
-class MultiScaleDiscriminator(ConvNets):
+class MultiScaleDiscriminator(_MultiDiscriminatorT):
     def __init__(
         self,
         discriminator_channel_multi: Number = 1,
@@ -352,102 +353,71 @@ class MultiScaleDiscriminator(ConvNets):
         return y_d_rs, y_d_gs, fmap_rs, fmap_gs
-class EnvelopeExtractor(Model):
-    """Extracts the amplitude envelope of the audio signal."""
+class DiscriminatorE(ConvNets):
+    """Modified from: https://github.com/dinhoitt/BemaGANv2/blob/9560ae9df153c956f259c261c57c4f84f89e3d72/models.py"""
-    def __init__(self, kernel_size=101):
-        super().__init__()
-        # Lowpass filter for smoothing envelope (moving average)
-        self.kernel_size = kernel_size
-        self.register_buffer("kernel", torch.ones(1, 1, kernel_size) / kernel_size)
-    def forward(self, x: Tensor):
-        # x: (B, 1, T) -> abs(x)
-        envelope = torch.abs(x)
-        # Apply low-pass smoothing (via conv1d)
-        envelope = F.pad(
-            envelope, (self.kernel_size // 2, self.kernel_size // 2), mode="reflect"
-        )
-        envelope = F.conv1d(envelope, self.kernel)
-        return envelope
-class DiscriminatorEnvelope(ConvNets):
     def __init__(
         self,
-        use_spectral_norm=False,
+        max_freq: int,
         discriminator_channel_multi: Number = 1,
-        kernel_size: int = 101,
+        sample_rate: int = 24000,
+        use_spectral_norm: bool = False,
     ):
         super().__init__()
-        norm_f = weight_norm if not use_spectral_norm else spectral_norm
-        self.extractor = EnvelopeExtractor(kernel_size=kernel_size)
+        self.max_freq = max_freq
+        norm_f = spectral_norm if use_spectral_norm else weight_norm
         dsc = lambda x: int(x * discriminator_channel_multi)
         self.convs = nn.ModuleList(
             [
-                norm_f(nn.Conv1d(1, dsc(64), 15, stride=1, padding=7)),
-                norm_f(
-                    nn.Conv1d(dsc(64), dsc(128), 41, stride=2, groups=4, padding=20)
-                ),
-                norm_f(
-                    nn.Conv1d(dsc(128), dsc(256), 41, stride=2, groups=16, padding=20)
-                ),
-                norm_f(
-                    nn.Conv1d(dsc(256), dsc(512), 41, stride=4, groups=16, padding=20)
-                ),
-                norm_f(
-                    nn.Conv1d(dsc(512), dsc(512), 41, stride=4, groups=16, padding=20)
-                ),
-                norm_f(nn.Conv1d(dsc(512), dsc(512), 5, stride=1, padding=2)),
+                norm_f(nn.Conv1d(1, dsc(128), 15, 1, padding=7)),
+                norm_f(nn.Conv1d(dsc(128), dsc(128), 41, 2, groups=4, padding=20)),
+                norm_f(nn.Conv1d(dsc(128), dsc(256), 41, 2, groups=16, padding=20)),
+                norm_f(nn.Conv1d(dsc(256), dsc(512), 41, 4, groups=16, padding=20)),
+                norm_f(nn.Conv1d(dsc(512), dsc(1024), 41, 4, groups=16, padding=20)),
+                norm_f(nn.Conv1d(dsc(1024), dsc(1024), 41, 1, groups=16, padding=20)),
+                norm_f(nn.Conv1d(dsc(1024), dsc(1024), 5, 1, padding=2)),
             ]
         )
-        self.conv_post = norm_f(nn.Conv1d(dsc(512), 1, 3, stride=1, padding=1))
+        self.conv_post = norm_f(nn.Conv1d(dsc(1024), 1, 3, 1, padding=1))
+        self.envelope = Envelope(max_freq=self.max_freq, sample_rate=sample_rate)
         self.activation = nn.LeakyReLU(0.1)
-    def forward(self, x):
-        # Input: raw audio (B, 1, T)
-        x = self.extractor(x)
+    def forward(self, x: Tensor):
         fmap = []
-        for layer in self.convs:
-            x = self.activation(layer(x))
+        for l in self.convs:
+            x = self.envelope(x)
+            x = self.activation(l(x))
             fmap.append(x)
         x = self.conv_post(x)
         fmap.append(x)
-        return x.flatten(1), fmap
+        return x.flatten(start_dim=1, end_dim=-1), fmap
 class MultiEnvelopeDiscriminator(_MultiDiscriminatorT):
+    """Modified from: https://github.com/dinhoitt/BemaGANv2/blob/9560ae9df153c956f259c261c57c4f84f89e3d72/models.py"""
     def __init__(
         self,
-        use_spectral_norm: bool = False,
         discriminator_channel_multi: Number = 1,
     ):
         super().__init__()
+        f_times_values = [-1, 0, 1, 300, 500]
         self.discriminators = nn.ModuleList(
-            [
-                DiscriminatorEnvelope(
-                    use_spectral_norm, discriminator_channel_multi
-                ),  # raw envelope
-                DiscriminatorEnvelope(use_spectral_norm),  # downsampled once
-                DiscriminatorEnvelope(use_spectral_norm),  # downsampled twice
-            ]
-        )
-        self.meanpools = nn.ModuleList(
-            [nn.AvgPool1d(4, 2, padding=2), nn.AvgPool1d(4, 2, padding=2)]
+            [DiscriminatorE(f, discriminator_channel_multi) for f in f_times_values]
         )
     def forward(self, y, y_hat):
-        y_d_rs, y_d_gs = [], []
-        fmap_rs, fmap_gs = [], []
-        for i, d in enumerate(self.discriminators):
-            if i != 0:
-                y = self.meanpools[i - 1](y)
-                y_hat = self.meanpools[i - 1](y_hat)
+        y_d_rs = []
+        y_d_gs = []
+        fmap_rs = []
+        fmap_gs = []
+        for d in self.discriminators:
             y_d_r, fmap_r = d(y)
             y_d_g, fmap_g = d(y_hat)
             y_d_rs.append(y_d_r)
-            y_d_gs.append(y_d_g)
             fmap_rs.append(fmap_r)
+            y_d_gs.append(y_d_g)
             fmap_gs.append(fmap_g)
         return y_d_rs, y_d_gs, fmap_rs, fmap_gs

{lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/processors/audio.py RENAMED Viewed

@@ -77,7 +77,7 @@ class AudioProcessorConfig(ModelConfig):
     def post_process(self):
         self.n_stft = self.n_fft // 2 + 1
         # some functions needs this to be a non-zero or not None value.
-        self.f_min = max(self.f_min, (self.sample_rate / (self.n_fft - 1)) * 2)
+        self.default_f_min = max(self.f_min, (self.sample_rate / (self.n_fft - 1)) * 2)
         self.default_f_max = min(
             default(self.f_max, self.sample_rate // 2), self.sample_rate // 2
         )
@@ -202,6 +202,8 @@ class AudioProcessor(Model):
         *args,
         **kwargs,
     ):
+        if wave.ndim == 1:
+            wave = wave.unsqueeze(0)
         wave = torch.nn.functional.pad(
             wave.unsqueeze(1),
             (self.mel_lib_padding, self.mel_lib_padding),
@@ -352,7 +354,7 @@ class AudioProcessor(Model):
         sr = default(sr, self.cfg.sample_rate)
         frame_length = default(frame_length, self.cfg.n_fft)
         fmin = max(
-            default(fmin, self.cfg.f_min), self.calc_pitch_fmin(sr, frame_length)
+            default(fmin, self.cfg.default_f_min), self.calc_pitch_fmin(sr, frame_length)
         )
         fmax = min(max(default(fmax, self.cfg.default_f_max), fmin + 1), sr // 2)
         hop_length = default(hop_length, self.cfg.hop_length)

{lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: lt-tensor
-Version: 0.0.1a37
+Version: 0.0.1a39
 Summary: General utilities for PyTorch and others. Built for general use.
 Home-page: https://github.com/gr1336/lt-tensor/
 Author: gr1336

{lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor.egg-info/SOURCES.txt RENAMED Viewed

@@ -42,5 +42,6 @@ lt_tensor/model_zoo/losses/discriminators.py
 lt_tensor/model_zoo/losses/CQT/__init__.py
 lt_tensor/model_zoo/losses/CQT/transforms.py
 lt_tensor/model_zoo/losses/CQT/utils.py
+lt_tensor/model_zoo/losses/_envelope_disc/__init__.py
 lt_tensor/processors/__init__.py
 lt_tensor/processors/audio.py

{lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/setup.py RENAMED Viewed

@@ -4,7 +4,7 @@ with open("README.md", "r", encoding="utf-8") as f:
     long_description = f.read()
 setup(
-    version="0.0.1a37",
+    version="0.0.1a39",
     name="lt-tensor",
     description="General utilities for PyTorch and others. Built for general use.",
     long_description=long_description,