lt-tensor 0.0.1a36__py3-none-any.whl → 0.0.1a38__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lt_tensor/__init__.py +1 -1
- lt_tensor/losses.py +10 -4
- lt_tensor/model_zoo/audio_models/diffwave/__init__.py +68 -81
- lt_tensor/model_zoo/convs.py +25 -16
- lt_tensor/processors/audio.py +218 -74
- {lt_tensor-0.0.1a36.dist-info → lt_tensor-0.0.1a38.dist-info}/METADATA +2 -2
- {lt_tensor-0.0.1a36.dist-info → lt_tensor-0.0.1a38.dist-info}/RECORD +10 -10
- {lt_tensor-0.0.1a36.dist-info → lt_tensor-0.0.1a38.dist-info}/WHEEL +0 -0
- {lt_tensor-0.0.1a36.dist-info → lt_tensor-0.0.1a38.dist-info}/licenses/LICENSE +0 -0
- {lt_tensor-0.0.1a36.dist-info → lt_tensor-0.0.1a38.dist-info}/top_level.txt +0 -0
lt_tensor/__init__.py
CHANGED
lt_tensor/losses.py
CHANGED
@@ -133,7 +133,7 @@ class MultiMelScaleLoss(Model):
         loss_mel_fn: Callable[[Tensor, Tensor], Tensor] = nn.L1Loss(),
         loss_pitch_fn: Callable[[Tensor, Tensor], Tensor] = nn.L1Loss(),
         loss_rms_fn: Callable[[Tensor, Tensor], Tensor] = nn.L1Loss(),
-        center: bool =
+        center: bool = False,
         power: float = 1.0,
         normalized: bool = False,
         pad_mode: str = "reflect",
@@ -149,6 +149,7 @@ class MultiMelScaleLoss(Model):
         lambda_rms: float = 1.0,
         lambda_pitch: float = 1.0,
         weight: float = 1.0,
+        mel: Literal["librosa", "torch"] = "torch",
     ):
         super().__init__()
         assert (
@@ -188,6 +189,7 @@ class MultiMelScaleLoss(Model):
             onesided,
             std,
             mean,
+            mel,
         )

     def _setup_mels(
@@ -206,6 +208,7 @@ class MultiMelScaleLoss(Model):
         onesided: Optional[bool],
         std: int,
         mean: int,
+        mel: str,
     ):
         assert (
             len(n_mels)
@@ -224,6 +227,7 @@ class MultiMelScaleLoss(Model):
             pad_mode=pad_mode,
             std=std,
             mean=mean,
+            mel_default=mel,
         )
         self.mel_spectrograms: List[AudioProcessor] = nn.ModuleList(
             [
@@ -247,12 +251,14 @@ class MultiMelScaleLoss(Model):
     def forward(
         self, input_wave: torch.Tensor, target_wave: torch.Tensor
     ) -> torch.Tensor:
-        assert self.use_istft_norm or input_wave.shape[-1] == target_wave.shape[-1]
+        assert self.use_istft_norm or input_wave.shape[-1] == target_wave.shape[-1], (
+            f"Size mismatch! input_wave {input_wave.shape[-1]} must match target_wave: {target_wave.shape[-1]}. "
+            "Alternatively 'use_istft_norm' can be set to Trie with will automatically force the audio to that size."
+        )
         target_wave = target_wave.to(input_wave.device)
         losses = 0.0
         for M in self.mel_spectrograms:
-
-            if self.use_istft_norm:
+            if self.use_istft_norm and input_proc.shape[-1] != target_proc.shape[-1]:
                 input_proc = M.istft_norm(input_wave, length=target_wave.shape[-1])
                 target_proc = M.istft_norm(target_wave, length=target_wave.shape[-1])
             else:
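The new `mel` argument selects the mel-spectrogram backend and is forwarded to each `AudioProcessor` as `mel_default`. A minimal usage sketch, assuming the remaining `MultiMelScaleLoss` parameters keep their defaults (illustrative, not canonical):

import torch
from lt_tensor.losses import MultiMelScaleLoss

criterion = MultiMelScaleLoss(mel="librosa")  # or mel="torch" (the default)
pred = torch.randn(1, 22050, requires_grad=True)  # generated waveform
target = torch.randn(1, 22050)                    # reference of the same length
loss = criterion(pred, target)                    # summed multi-scale mel (+ pitch/RMS) loss
loss.backward()

One caution visible in the hunk: the new loop condition reads `input_proc`/`target_proc` before their first assignment, so the `use_istft_norm` branch as shipped looks like it would raise a NameError on the first iteration.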
lt_tensor/model_zoo/audio_models/diffwave/__init__.py
CHANGED
@@ -1,14 +1,15 @@
-__all__ = ["DiffWave", "DiffWaveConfig", "
+__all__ = ["DiffWave", "DiffWaveConfig", "SpectrogramUpsampler", "DiffusionEmbedding"]

 import numpy as np
 from lt_tensor.torch_commons import *
 from torch.nn import functional as F
 from lt_tensor.config_templates import ModelConfig
 from lt_tensor.torch_commons import *
-from lt_tensor.model_zoo.convs import ConvNets,
+from lt_tensor.model_zoo.convs import ConvNets, ConvEXT
 from lt_tensor.model_base import Model
 from math import sqrt
 from lt_utils.common import *
+from lt_tensor.misc_utils import log_tensor


 class DiffWaveConfig(ModelConfig):
@@ -21,12 +22,8 @@ class DiffWaveConfig(ModelConfig):
     unconditional = False
     apply_norm: Optional[Literal["weight", "spectral"]] = None
     apply_norm_resblock: Optional[Literal["weight", "spectral"]] = None
-    noise_schedule: list[int] = np.linspace(1e-4, 0.05,
+    noise_schedule: list[int] = np.linspace(1e-4, 0.05, 25).tolist()
     # settings for auto-fixes
-    interpolate = False
-    interpolation_mode: Literal[
-        "nearest", "linear", "bilinear", "bicubic", "trilinear", "area", "nearest-exact"
-    ] = "nearest"

     def __init__(
         self,
@@ -37,16 +34,6 @@ class DiffWaveConfig(ModelConfig):
         dilation_cycle_length=10,
         unconditional=False,
         noise_schedule: list[int] = np.linspace(1e-4, 0.05, 50).tolist(),
-        interpolate_cond=False,
-        interpolation_mode: Literal[
-            "nearest",
-            "linear",
-            "bilinear",
-            "bicubic",
-            "trilinear",
-            "area",
-            "nearest-exact",
-        ] = "nearest",
         apply_norm: Optional[Literal["weight", "spectral"]] = None,
         apply_norm_resblock: Optional[Literal["weight", "spectral"]] = None,
     ):
@@ -58,8 +45,6 @@ class DiffWaveConfig(ModelConfig):
             "residual_channels": residual_channels,
             "unconditional": unconditional,
             "noise_schedule": noise_schedule,
-            "interpolate": interpolate_cond,
-            "interpolation_mode": interpolation_mode,
             "apply_norm": apply_norm,
             "apply_norm_resblock": apply_norm_resblock,
         }
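The class-level default schedule drops to 25 steps while the `__init__` default stays at 50, and the interpolation auto-fix knobs are removed outright. A hedged construction sketch (argument names are taken only from the hunks above):

import numpy as np
from lt_tensor.model_zoo.audio_models.diffwave import DiffWaveConfig

cfg = DiffWaveConfig(
    unconditional=False,
    noise_schedule=np.linspace(1e-4, 0.05, 50).tolist(),  # __init__ still defaults to 50
)
# interpolate_cond / interpolation_mode no longer exist in 0.0.1a38; callers
# must now hand DiffWave a spectrogram whose frame count already matches the audio.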
@@ -102,19 +87,34 @@ class DiffusionEmbedding(Model):
         return table


-class
+class SpectrogramUpsampler(Model):
     def __init__(self):
         super().__init__()
-        self.
-
-
+        self.conv_net = nn.Sequential(
+            ConvEXT(
+                1,
+                1,
+                [3, 32],
+                stride=[1, 16],
+                padding=[1, 8],
+                module_type="2d",
+                transpose=True,
+            ),
+            nn.LeakyReLU(0.1),
+            ConvEXT(
+                1,
+                1,
+                [3, 32],
+                stride=[1, 16],
+                padding=[1, 8],
+                module_type="2d",
+                transpose=True,
+            ),
+            nn.LeakyReLU(0.1),
+        )

-    def forward(self, x):
-
-        x = self.activation(self.conv1(x))
-        x = self.activation(self.conv2(x))
-        x = torch.squeeze(x, 1)
-        return x
+    def forward(self, x: Tensor):
+        return self.conv_net(x.unsqueeze(0)).squeeze(1)


 class ResidualBlock(Model):
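Each transposed conv uses a time-axis kernel of 32, stride 16, and padding 8, so every layer stretches the frame axis 16x and the pair gives 256x, i.e. one waveform sample per frame at a 256-sample hop. A quick check of the transposed-conv length formula L_out = (L_in - 1) * stride - 2 * padding + kernel, using plain torch with the same time-axis hyperparameters:

import torch
import torch.nn as nn

up = nn.ConvTranspose2d(1, 1, kernel_size=(3, 32), stride=(1, 16), padding=(1, 8))
mel = torch.randn(1, 1, 80, 100)  # (batch, channel, n_mels, frames)
print(up(mel).shape)              # torch.Size([1, 1, 80, 1600]): (100-1)*16 - 16 + 32 = 1600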
@@ -133,7 +133,7 @@ class ResidualBlock(Model):
         :param uncond: disable spectrogram conditional
         """
         super().__init__()
-        self.dilated_conv =
+        self.dilated_conv = ConvEXT(
             residual_channels,
             2 * residual_channels,
             3,
@@ -142,18 +142,18 @@ class ResidualBlock(Model):
             apply_norm=apply_norm,
         )
         self.diffusion_projection = nn.Linear(512, residual_channels)
-
-
+        self.uncoditional = uncond
+        self.conditioner_projection = None
+        if not uncond:
+            self.conditioner_projection = ConvEXT(
                 n_mels,
                 2 * residual_channels,
                 1,
                 apply_norm=apply_norm,
             )
-        else:  # unconditional model
-            self.conditioner_projection = None

-        self.output_projection =
-            residual_channels, 2 * residual_channels, 1, apply_norm
+        self.output_projection = ConvEXT(
+            residual_channels, 2 * residual_channels, 1, apply_norm=apply_norm
         )

     def forward(
@@ -164,20 +164,15 @@ class ResidualBlock(Model):
     ):

         diffusion_step = self.diffusion_projection(diffusion_step).unsqueeze(-1)
-        y = x + diffusion_step
-
-
-
-            y = self.dilated_conv(y)
-        else:
-            conditioner = self.conditioner_projection(conditioner)
-            y = self.dilated_conv(y) + conditioner
-
-        gate, filter = torch.chunk(y, 2, dim=1)
-        y = torch.sigmoid(gate) * torch.tanh(filter)
+        y = (x + diffusion_step).squeeze(1)
+        y = self.dilated_conv(y)
+        if not self.uncoditional and conditioner is not None:
+            y = y + self.conditioner_projection(conditioner)

+        gate, _filter = y.chunk(2, dim=1)
+        y = gate.sigmoid() * _filter.tanh()
         y = self.output_projection(y)
-        residual, skip =
+        residual, skip = y.chunk(2, dim=1)
         return (x + residual) / sqrt(2.0), skip

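The rewritten forward keeps the WaveNet-style gated unit: the dilated conv doubles the channel count, the halves are split into gate and filter, and they recombine as sigmoid(gate) * tanh(filter). The step in isolation:

import torch

residual_channels = 64
y = torch.randn(1, 2 * residual_channels, 256)  # dilated-conv output, doubled channels
gate, _filter = y.chunk(2, dim=1)               # two (1, 64, 256) halves
y = gate.sigmoid() * _filter.tanh()             # gated activation, back to 64 channels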
|
@@ -186,19 +181,17 @@ class DiffWave(Model):
         super().__init__()
         self.params = params
         self.n_hop = self.params.hop_samples
-        self.
-        self.interpolate_mode = self.params.interpolation_mode
-        self.input_projection = Conv1dEXT(
+        self.input_projection = ConvEXT(
            in_channels=1,
            out_channels=params.residual_channels,
            kernel_size=1,
            apply_norm=self.params.apply_norm,
+            activation_out=nn.LeakyReLU(0.1),
        )
         self.diffusion_embedding = DiffusionEmbedding(len(params.noise_schedule))
-
-        self.
-
-        self.spectrogram_upsample = SpectrogramUpsample()
+        self.spectrogram_upsampler = (
+            SpectrogramUpsampler() if not self.params.unconditional else None
+        )

         self.residual_layers = nn.ModuleList(
             [
@@ -212,18 +205,18 @@ class DiffWave(Model):
                 for i in range(params.residual_layers)
             ]
         )
-        self.skip_projection =
+        self.skip_projection = ConvEXT(
            in_channels=params.residual_channels,
            out_channels=params.residual_channels,
            kernel_size=1,
            apply_norm=self.params.apply_norm,
+            activation_out=nn.LeakyReLU(0.1),
        )
-        self.output_projection =
-            params.residual_channels, 1, 1, apply_norm=self.params.apply_norm
+        self.output_projection = ConvEXT(
+            params.residual_channels, 1, 1, apply_norm=self.params.apply_norm, init_weights=True,
         )
         self.activation = nn.LeakyReLU(0.1)
-        self.
-        nn.init.zeros_(self.output_projection.weight)
+        self._res_d = sqrt(len(self.residual_layers))

     def forward(
         self,
@@ -231,31 +224,25 @@ class DiffWave(Model):
         diffusion_step: Tensor,
         spectrogram: Optional[Tensor] = None,
     ):
-
-
-
-
+        if not self.params.unconditional:
+            assert spectrogram is not None
+        if audio.ndim < 3:
+            if audio.ndim == 2:
+                audio = audio.unsqueeze(1)
+            else:
+                audio = audio.unsqueeze(0).unsqueeze(0)

+        x = self.input_projection(audio)
         diffusion_step = self.diffusion_embedding(diffusion_step)
-        if
-
-            # a little heavy, but helps a lot to fix mismatched shapes,
-            # not always recommended due to data loss
-            spectrogram = F.interpolate(
-                input=spectrogram,
-                size=int(T * self.n_hop),
-                mode=self.interpolate_mode,
-            )
-        spectrogram = self.spectrogram_upsample(spectrogram)
+        if not self.params.unconditional:  # use conditional model
+            spectrogram = self.spectrogram_upsampler(spectrogram)

-        skip =
+        skip = torch.zeros_like(x, device=x.device)
         for i, layer in enumerate(self.residual_layers):
             x, skip_connection = layer(x, diffusion_step, spectrogram)
-
-
-
-
-        x = skip / self.r_sqrt
-        x = self.activation(self.skip_projection(x))
+            skip += skip_connection
+
+        x = skip / self._res_d
+        x = self.skip_projection(x)
         x = self.output_projection(x)
         return x
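End to end, the a38 forward now normalizes the audio rank itself (1-D, 2-D, or 3-D input) and only upsamples the spectrogram in the conditional case. A hedged smoke-test sketch; the field values used here (80 mels, 256-sample hop) come from the standard DiffWave recipe, not from this diff:

import torch
from lt_tensor.model_zoo.audio_models.diffwave import DiffWave, DiffWaveConfig

cfg = DiffWaveConfig(unconditional=False)
model = DiffWave(cfg)

frames = 32
audio = torch.randn(1, frames * 256)   # (B, T); forward unsqueezes it to (B, 1, T)
spec = torch.randn(1, 80, frames)      # (B, n_mels, frames)
step = torch.randint(0, len(cfg.noise_schedule), (1,))
noise_pred = model(audio, step, spec)  # predicted noise, shaped like the input audio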
lt_tensor/model_zoo/convs.py
CHANGED
@@ -1,4 +1,4 @@
-__all__ = ["ConvNets", "
+__all__ = ["ConvNets", "ConvEXT"]
 import math
 from lt_utils.common import *
 import torch.nn.functional as F
@@ -6,6 +6,7 @@ from lt_tensor.torch_commons import *
 from lt_tensor.model_base import Model
 from lt_tensor.misc_utils import log_tensor
 from lt_tensor.model_zoo.fusion import AdaFusion1D, AdaIN1D
+from lt_utils.misc_utils import default


 def spectral_norm_select(module: nn.Module, enabled: bool):
@@ -52,10 +53,7 @@ class ConvNets(Model):
         m.weight.data.normal_(mean, std)


-class
-
-    # TODO: Use this module to replace all that are using normalizations, mostly those in `audio_models`
-
+class ConvEXT(ConvNets):
     def __init__(
         self,
         in_channels: int,
@@ -72,6 +70,10 @@ class Conv1dEXT(ConvNets):
         apply_norm: Optional[Literal["weight", "spectral"]] = None,
         activation_in: nn.Module = nn.Identity(),
         activation_out: nn.Module = nn.Identity(),
+        module_type: Literal["1d", "2d", "3d"] = "1d",
+        transpose: bool = False,
+        weight_init: Optional[Callable[[nn.Module], None]] = None,
+        init_weights: bool = True,
         *args,
         **kwargs,
     ):
@@ -91,23 +93,30 @@ class Conv1dEXT(ConvNets):
             device=device,
             dtype=dtype,
         )
+        match module_type.lower():
+            case "1d":
+                md = nn.Conv1d if not transpose else nn.ConvTranspose1d
+            case "2d":
+                md = nn.Conv2d if not transpose else nn.ConvTranspose2d
+            case "3d":
+                md = nn.Conv3d if not transpose else nn.ConvTranspose3d
+            case _:
+                raise ValueError(
+                    f"module_type {module_type} is not a valid module type! use '1d', '2d' or '3d'"
+                )
+
         if apply_norm is None:
-            self.cnn =
-            self.has_wn = False
+            self.cnn = md(**cnn_kwargs)
         else:
-            self.has_wn = True
             if apply_norm == "spectral":
-                self.cnn = spectral_norm(
+                self.cnn = spectral_norm(md(**cnn_kwargs))
             else:
-                self.cnn = weight_norm(
+                self.cnn = weight_norm(md(**cnn_kwargs))
         self.actv_in = activation_in
         self.actv_out = activation_out
-
+        if init_weights:
+            weight_init = default(weight_init, self.init_weights)
+            self.cnn.apply(weight_init)

     def forward(self, input: Tensor):
         return self.actv_out(self.cnn(self.actv_in(input)))
-
-    def remove_norms(self, name="weight"):
-        if self.has_wn:
-            remove_norm(self.cnn, name)
-            self.has_wn = False
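`Conv1dEXT` becomes `ConvEXT`: `module_type` and `transpose` now pick the concrete torch module, so one wrapper covers plain and transposed 1-D/2-D/3-D convolutions with optional weight/spectral norm on top (the `match` statement also makes Python 3.10 the implicit floor). The dispatch, written out as a plain table in standalone torch rather than via lt_tensor:

import torch
import torch.nn as nn

_CONVS = {
    ("1d", False): nn.Conv1d, ("1d", True): nn.ConvTranspose1d,
    ("2d", False): nn.Conv2d, ("2d", True): nn.ConvTranspose2d,
    ("3d", False): nn.Conv3d, ("3d", True): nn.ConvTranspose3d,
}
md = _CONVS[("2d", True)]  # what module_type="2d", transpose=True resolves to
conv = md(1, 1, kernel_size=(3, 32), stride=(1, 16), padding=(1, 8))
print(conv(torch.randn(1, 1, 80, 100)).shape)  # torch.Size([1, 1, 80, 1600])

Note that the `remove_norms` helper is dropped along with the `has_wn` flag, so callers that stripped weight norm before export need another route.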
lt_tensor/processors/audio.py
CHANGED
@@ -10,6 +10,7 @@ from lt_utils.type_utils import is_file, is_array
 from lt_utils.file_ops import FileScan, get_file_name, path_to_str
 from torchaudio.functional import detect_pitch_frequency
 import torch.nn.functional as F
+from librosa.filters import mel as _mel_filter_bank

 DEFAULT_DEVICE = torch.tensor([0]).device

@@ -25,7 +26,7 @@ class AudioProcessorConfig(ModelConfig):
     f_min: float = 0
     f_max: Optional[float] = None
     center: bool = True
-    mel_scale: Literal["htk" "slaney"] = "htk"
+    mel_scale: Literal["htk", "slaney"] = "htk"
     std: int = 4
     mean: int = -4
     n_iter: int = 32
@@ -33,6 +34,7 @@ class AudioProcessorConfig(ModelConfig):
     normalized: bool = False
     onesided: Optional[bool] = None
     n_stft: int = None
+    mel_default: Literal["torch", "librosa"] = "librosa"

     def __init__(
         self,
@@ -49,6 +51,7 @@ class AudioProcessorConfig(ModelConfig):
         mean: int = -4,
         normalized: bool = False,
         onesided: Optional[bool] = None,
+        mel_default: Literal["torch", "librosa"] = "librosa",
         *args,
         **kwargs,
     ):
@@ -66,6 +69,7 @@ class AudioProcessorConfig(ModelConfig):
             "mean": mean,
             "normalized": normalized,
             "onesided": onesided,
+            "mel_default": mel_default,
         }
         super().__init__(**settings)
         self.post_process()
@@ -88,14 +92,10 @@ def _comp_rms_helper(i: int, audio: Tensor, mel: Optional[Tensor]):


 class AudioProcessor(Model):
-    def __init__(
-        self,
-        config: AudioProcessorConfig = AudioProcessorConfig(),
-        window: Optional[Tensor] = None,
-    ):
+    def __init__(self, config: AudioProcessorConfig = AudioProcessorConfig()):
         super().__init__()
         self.cfg = config
-        self.
+        self._mel_spec_torch = torchaudio.transforms.MelSpectrogram(
             sample_rate=self.cfg.sample_rate,
             n_mels=self.cfg.n_mels,
             n_fft=self.cfg.n_fft,
@@ -107,6 +107,7 @@ class AudioProcessor(Model):
             mel_scale=self.cfg.mel_scale,
             normalized=self.cfg.normalized,
         )
+
         self._mel_rscale = torchaudio.transforms.InverseMelScale(
             n_stft=self.cfg.n_stft,
             n_mels=self.cfg.n_mels,
@@ -115,34 +116,121 @@ class AudioProcessor(Model):
             f_max=self.cfg.f_max,
             mel_scale=self.cfg.mel_scale,
         )
-
+        self.mel_lib_padding = (self.cfg.n_fft - self.cfg.hop_length) // 2
         self.register_buffer(
             "window",
-
+            torch.hann_window(self.cfg.win_length),
         )
+        self.register_buffer(
+            "mel_filter_bank",
+            torch.from_numpy(
+                _mel_filter_bank(
+                    sr=self.cfg.sample_rate,
+                    n_fft=self.cfg.n_fft,
+                    n_mels=self.cfg.n_mels,
+                    fmin=self.cfg.f_min,
+                    fmax=self.cfg.f_max,
+                )
+            ).float(),
+        )
+
+    def spectral_norm(self, x: Tensor, c: int = 1, eps: float = 1e-5):
+        return torch.log(torch.clamp(x, min=eps) * c)
+
+    def spectral_de_norm(self, x: Tensor, c: int = 1):
+        return torch.exp(x) / c
+
+    def log_norm(
+        self,
+        entry: Tensor,
+        eps: float = 1e-5,
+        mean: Optional[Number] = None,
+        std: Optional[Number] = None,
+    ) -> Tensor:
+        mean = default(mean, self.cfg.mean)
+        std = default(std, self.cfg.std)
+        return (torch.log(eps + entry.unsqueeze(0)) - mean) / std

     def compute_mel(
         self,
         wave: Tensor,
-
-
-
-
+        method: Optional[Literal["torch", "librosa"]] = None,
+        apply_norm: bool = False,
+        eps: Optional[float] = None,
+        **kwargs,
+    ) -> Tensor:
+        method = default(method, self.cfg.mel_default)
+        if method == "torch":
+            return self.compute_mel_torch(
+                wave,
+                log_norm=apply_norm,
+                eps=eps,
+                mean=kwargs.get("mean", None),
+                std=kwargs.get("std", None),
+            )
+        return self.compute_mel_librosa(
+            wave,
+            log_norm=apply_norm,
+            eps=eps,
+        )
+
+    def compute_mel_torch(
+        self,
+        wave: Tensor,
+        log_norm: bool = False,
+        eps: Optional[float] = None,
+        mean: Optional[Number] = None,
+        std: Optional[Number] = None,
+        *args,
+        **kwargs,
     ) -> Tensor:
         """Returns: (M, T) or (B, M, T) if batched"""
         try:
-            mel_tensor = self.
-            if not raw_mel_only:
-                mel_tensor = (
-                    torch.log(eps + mel_tensor.unsqueeze(0)) - self.cfg.mean
-                ) / self.cfg.std
-            return mel_tensor.squeeze()
+            mel_tensor = self._mel_spec_torch.forward(wave.to(self.device))  # [M, T]

         except RuntimeError as e:
-
-
-
-
+            mel_tensor = self._mel_spec_torch.forward(wave.to(self.device))  # [M, T]
+        if log_norm:
+            return self.log_norm(mel_tensor, eps, mean, std).squeeze()
+        return mel_tensor.squeeze()
+
+    def compute_mel_librosa(
+        self,
+        wave: Tensor,
+        eps: float = 1e-5,
+        spectral_norm: bool = False,
+        *args,
+        **kwargs,
+    ):
+        if wave.ndim == 1:
+            wave = wave.unsqueeze(0)
+        wave = torch.nn.functional.pad(
+            wave.unsqueeze(1),
+            (self.mel_lib_padding, self.mel_lib_padding),
+            mode="reflect",
+        ).squeeze(1)
+        spec = torch.stft(
+            wave,
+            self.cfg.n_fft,
+            hop_length=self.cfg.hop_length,
+            win_length=self.cfg.win_length,
+            window=self.window,
+            center=self.cfg.center,
+            pad_mode="reflect",
+            normalized=False,
+            onesided=True,
+            return_complex=True,
+        )
+        spec = torch.sqrt(torch.view_as_real(spec).pow(2).sum(-1) + 1e-12)
+        try:
+            results = torch.matmul(self.mel_filter_bank, spec)
+        except RuntimeError:
+            self.mel_filter_bank = self.mel_filter_bank.to(self.device)
+            self.window = self.window.to(self.device)
+            results = torch.matmul(self.mel_filter_bank, spec)
+        if spectral_norm:
+            return self.spectral_norm(results, eps=eps).squeeze()
+        return results.squeeze()

     def compute_inverse_mel(self, melspec: Tensor, *, _recall=False):
         try:
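`compute_mel` is now a dispatcher: `method=None` falls back to `cfg.mel_default`, otherwise it routes to `compute_mel_torch` (torchaudio `MelSpectrogram`) or `compute_mel_librosa` (STFT plus the registered librosa filter bank). A hedged sketch of the new entry points:

import torch
from lt_tensor.processors.audio import AudioProcessor, AudioProcessorConfig

ap = AudioProcessor(AudioProcessorConfig(mel_default="librosa"))
wave = torch.randn(22050)

mel_lib = ap.compute_mel(wave)                    # librosa-style filter-bank path
mel_torch = ap.compute_mel(wave, method="torch")  # torchaudio path
print(mel_lib.shape, mel_torch.shape)             # both roughly (n_mels, frames)

One wrinkle visible in the hunk: the librosa branch is called with `log_norm=`, but `compute_mel_librosa` only exposes `spectral_norm=` and swallows the stray keyword in `**kwargs`, so `apply_norm=True` appears to take effect only on the torch path.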
@@ -382,7 +470,7 @@ class AudioProcessor(Model):
             antialias=antialias,
         )

-    def
+    def istft_spec_phase(
         self,
         spec: Tensor,
         phase: Tensor,
@@ -394,34 +482,91 @@ class AudioProcessor(Model):
         normalized: Optional[bool] = None,
         onesided: Optional[bool] = None,
         return_complex: bool = False,
-        *,
-        _recall: bool = False,
     ):
-
-
-
-
+        """Util for models that needs to reconstruct the audio using inverse stft"""
+        window = (
+            torch.hann_window(win_length, device=spec.device)
+            if win_length is not None and win_length != self.cfg.win_length
+            else self.window.to(spec.device)
+        )
+        return torch.istft(
+            spec * torch.exp(phase * 1j),
+            n_fft=default(n_fft, self.cfg.n_fft),
+            hop_length=default(hop_length, self.cfg.hop_length),
+            win_length=default(win_length, self.cfg.win_length),
+            window=window,
+            center=center,
+            normalized=default(normalized, self.cfg.normalized),
+            onesided=default(onesided, self.cfg.onesided),
+            length=length,
+            return_complex=return_complex,
+        )

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        if not
-
-
-
-
-
+    def istft(
+        self,
+        wave: Tensor,
+        n_fft: Optional[int] = None,
+        hop_length: Optional[int] = None,
+        win_length: Optional[int] = None,
+        length: Optional[int] = None,
+        center: bool = True,
+        normalized: Optional[bool] = None,
+        onesided: Optional[bool] = None,
+        return_complex: bool = False,
+    ):
+        window = (
+            torch.hann_window(win_length, device=wave.device)
+            if win_length is not None and win_length != self.cfg.win_length
+            else self.window.to(wave.device)
+        )
+        if not torch.is_complex(wave):
+            wave = wave * 1j
+        return torch.istft(
+            wave,
+            n_fft=default(n_fft, self.cfg.n_fft),
+            hop_length=default(hop_length, self.cfg.hop_length),
+            win_length=default(win_length, self.cfg.win_length),
+            window=window,
+            center=center,
+            normalized=default(normalized, self.cfg.normalized),
+            onesided=default(onesided, self.cfg.onesided),
+            length=length,
+            return_complex=return_complex,
+        )
+
+    def stft(
+        self,
+        wave: Tensor,
+        center: bool = True,
+        n_fft: Optional[int] = None,
+        hop_length: Optional[int] = None,
+        win_length: Optional[int] = None,
+        normalized: Optional[bool] = None,
+        onesided: Optional[bool] = None,
+        return_complex: bool = True,
+    ):
+
+        window = (
+            torch.hann_window(win_length, device=wave.device)
+            if win_length is not None and win_length != self.cfg.win_length
+            else self.window.to(wave.device)
+        )
+
+        results = torch.stft(
+            input=wave,
+            n_fft=default(n_fft, self.cfg.n_fft),
+            hop_length=default(hop_length, self.cfg.hop_length),
+            win_length=default(win_length, self.cfg.win_length),
+            window=window,
+            center=center,
+            pad_mode="reflect",
+            normalized=default(normalized, self.cfg.normalized),
+            onesided=default(onesided, self.cfg.onesided),
+            return_complex=True,  # always, then if we need a not complex type we use view as real.
+        )
+        if not return_complex:
+            return torch.view_as_real(results)
+        return results

     def istft_norm(
         self,
@@ -435,11 +580,11 @@ class AudioProcessor(Model):
         onesided: Optional[bool] = None,
         return_complex: bool = False,
     ):
-
-
-
-
-
+        window = (
+            torch.hann_window(win_length, device=wave.device)
+            if win_length is not None and win_length != self.cfg.win_length
+            else self.window.to(wave.device)
+        )
         spectrogram = torch.stft(
             input=wave,
             n_fft=default(n_fft, self.cfg.n_fft),
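`istft_spec_phase` rebuilds audio from a magnitude/phase pair as `istft(spec * exp(1j * phase))`, with every STFT parameter defaulting to the config; the new `stft`/`istft` methods are its forward and inverse counterparts. A hedged round trip using only the methods shown above:

import torch
from lt_tensor.processors.audio import AudioProcessor

ap = AudioProcessor()
wave = torch.randn(1, 22050)

spec = ap.stft(wave)                  # complex STFT with config defaults
mag, phase = spec.abs(), spec.angle()
rec = ap.istft_spec_phase(mag, phase, length=wave.shape[-1])
print(rec.shape)                      # torch.Size([1, 22050])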
@@ -473,15 +618,15 @@ class AudioProcessor(Model):
     def load_audio(
         self,
         path: PathLike,
-        top_db: float =
+        top_db: Optional[float] = None,
         normalize: bool = False,
+        mono: bool = True,
         *,
-
-        frame_length: int = 2048,
+        sample_rate: Optional[float] = None,
         hop_length: int = 512,
-
-        offset: float = 0.0,
+        frame_length: int = 2048,
         duration: Optional[float] = None,
+        offset: float = 0.0,
         dtype: Any = np.float32,
         res_type: str = "soxr_hq",
         fix: bool = True,
@@ -491,29 +636,32 @@ class AudioProcessor(Model):
         norm_axis: int = 0,
         norm_threshold: Optional[float] = None,
         norm_fill: Optional[bool] = None,
+        ref: float | Callable[[np.ndarray], Any] = np.max,
     ) -> Tensor:
         is_file(path, True)
+        sample_rate = default(sample_rate, self.cfg.sample_rate)
         wave, sr = librosa.load(
             str(path),
-            sr=
+            sr=sample_rate,
             mono=mono,
             offset=offset,
             duration=duration,
             dtype=dtype,
             res_type=res_type,
         )
-
-            wave,
-
-
-
-
-
+        if top_db is not None:
+            wave, _ = librosa.effects.trim(
+                wave,
+                top_db=top_db,
+                ref=ref,
+                frame_length=frame_length,
+                hop_length=hop_length,
+            )
+        if sr != sample_rate:
             wave = librosa.resample(
                 wave,
                 orig_sr=sr,
-                target_sr=
+                target_sr=sample_rate,
                 res_type=res_type,
                 fix=fix,
                 scale=scale,
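Trimming is now opt-in (`top_db=None`, the new default, skips `librosa.effects.trim`) and the target rate can be overridden per call instead of always coming from the config. A hedged sketch; "speech.wav" is a placeholder path:

from lt_tensor.processors.audio import AudioProcessor

ap = AudioProcessor()
raw = ap.load_audio("speech.wav")                         # no silence trimming by default
trimmed = ap.load_audio("speech.wav", top_db=30.0)        # trim leading/trailing silence
wave16k = ap.load_audio("speech.wav", sample_rate=16000)  # per-call resample target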
@@ -553,10 +701,6 @@ class AudioProcessor(Model):
             maximum,
         )

-    def stft_loss(self, signal: Tensor, ground: Tensor, magnitude: float = 1.0):
-        ground = F.interpolate(ground, signal.shape[-1]).to(signal.device)
-        return F.l1_loss(signal.squeeze(), ground.squeeze()) * magnitude
-
     def forward(
         self,
         *inputs: Union[Tensor, float],
{lt_tensor-0.0.1a36.dist-info → lt_tensor-0.0.1a38.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: lt-tensor
-Version: 0.0.1a36
+Version: 0.0.1a38
 Summary: General utilities for PyTorch and others. Built for general use.
 Home-page: https://github.com/gr1336/lt-tensor/
 Author: gr1336
@@ -18,7 +18,7 @@ Requires-Dist: tokenizers
 Requires-Dist: pyyaml>=6.0.0
 Requires-Dist: numba>0.60.0
 Requires-Dist: lt-utils>=0.0.4
-Requires-Dist: librosa
+Requires-Dist: librosa<1,>=0.10.2.post1
 Requires-Dist: einops
 Requires-Dist: plotly
 Requires-Dist: scipy
{lt_tensor-0.0.1a36.dist-info → lt_tensor-0.0.1a38.dist-info}/RECORD
CHANGED
@@ -1,6 +1,6 @@
-lt_tensor/__init__.py,sha256=
+lt_tensor/__init__.py,sha256=2C0DCdesX13deU-nV8EbiF0poSsHWU0VuZvFcTZhJQk,441
 lt_tensor/config_templates.py,sha256=F9UvL8paAjkSvio890kp8WznpYeI50pYnm9iqQroBxk,2797
-lt_tensor/losses.py,sha256=
+lt_tensor/losses.py,sha256=e-YyKMmI0FwWQ3VLfJLDGSH4_rNpnYj0-htuk4eYboE,9283
 lt_tensor/lr_schedulers.py,sha256=6_vcfaPHrozfH3wvmNEdKSFYl6iTIijYoHL8vuG-45U,7651
 lt_tensor/math_ops.py,sha256=ahX6Z1Mt3X-FhmwSZYZea5mB1B0S8GDuvKPfAm5e_FQ,2646
 lt_tensor/misc_utils.py,sha256=stL6q3M7S2N4FBICFYbgYpdPDrJRlwmr24-iCXMRifM,28933
@@ -11,7 +11,7 @@ lt_tensor/torch_commons.py,sha256=8l0bxmrAzwvyqjivCIVISXlbvKarlg4DdE0BOGSnMuQ,81
 lt_tensor/transform.py,sha256=dZm8T_ov0blHMQu6nGiehsdG1VSB7bZBUVmTkT-PBdc,13257
 lt_tensor/model_zoo/__init__.py,sha256=yPUVchgVhU2nAJ2ocA4HFfG7IMEiBu8qOi8I1KWTTkU,404
 lt_tensor/model_zoo/basic.py,sha256=pI8HyiHK-cmWcEEaVY_EduUJOjZW6HOtXvJd8Rbhq30,15452
-lt_tensor/model_zoo/convs.py,sha256=
+lt_tensor/model_zoo/convs.py,sha256=Ri_8BV2dho-KS57GS8xg6SJOKPkYEmJs4Rl_byofiSE,3995
 lt_tensor/model_zoo/features.py,sha256=DO8dlE0kmPKTNC1Xkv9wKegOOYkQa_rkxM4hhcNwJWA,15655
 lt_tensor/model_zoo/fusion.py,sha256=usC1bcjQRNivDc8xzkIS5T1glm78OLcs2V_tPqfp-eI,5422
 lt_tensor/model_zoo/pos_encoder.py,sha256=3d1EYLinCU9UAy-WuEWeYMGhMqaGknCiQ5qEmhw_UYM,4487
@@ -26,7 +26,7 @@ lt_tensor/model_zoo/activations/snake/__init__.py,sha256=AtOAbJuMinxmKkppITGMzRb
 lt_tensor/model_zoo/audio_models/__init__.py,sha256=WwiP9MekJreMOfKPWLl24VkRJIpLk6hhL8ch0aKgOss,103
 lt_tensor/model_zoo/audio_models/resblocks.py,sha256=u-foHxaFDUICjxSkpyHXljQYQG9zMxVYaOGqLR_nJ-k,7978
 lt_tensor/model_zoo/audio_models/bigvgan/__init__.py,sha256=4EZG8Non75dHoDCizMHbMTvPrKwdUlPYGHc7hkfT_nw,8526
-lt_tensor/model_zoo/audio_models/diffwave/__init__.py,sha256=
+lt_tensor/model_zoo/audio_models/diffwave/__init__.py,sha256=g9tSLjRgl7whafA9aunNna-n41Afdqz13EiaiHOHwo0,8249
 lt_tensor/model_zoo/audio_models/hifigan/__init__.py,sha256=ITSXHg3c0Um1P2HaPaXkQKI7meG5Ne60wTbyyYju3hY,6360
 lt_tensor/model_zoo/audio_models/istft/__init__.py,sha256=blICjLX_z_IFmR3_TCz_dJiSayLYGza9eG6fd9aKyvE,7448
 lt_tensor/model_zoo/losses/__init__.py,sha256=B9RAUxBiOZwooztnij1oLeRwZ7_MjnN3mPoum7saD6s,59
@@ -35,9 +35,9 @@ lt_tensor/model_zoo/losses/CQT/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5
 lt_tensor/model_zoo/losses/CQT/transforms.py,sha256=Vkid0J9dqLnlINfyyUlQf-qB3gOQAgU7W9j7xLOjDFw,13218
 lt_tensor/model_zoo/losses/CQT/utils.py,sha256=twGw6FVD7V5Ksfx_1BUEN3EP1tAS6wo-9LL3VnuHB8c,16751
 lt_tensor/processors/__init__.py,sha256=Pvxhh0KR65zLCgUd53_k5Z0y5JWWcO0ZBXFK9rv0o5w,109
-lt_tensor/processors/audio.py,sha256=
-lt_tensor-0.0.
-lt_tensor-0.0.
-lt_tensor-0.0.
-lt_tensor-0.0.
-lt_tensor-0.0.
+lt_tensor/processors/audio.py,sha256=QaEbzoCxl7zJNv6ELFwX6AO--8NuOGscgqxwNpV8Czw,23599
+lt_tensor-0.0.1a38.dist-info/licenses/LICENSE,sha256=TbiyJWLgNqqgqhfCnrGwFIxy7EqGNrIZZcKhHrefcuU,11354
+lt_tensor-0.0.1a38.dist-info/METADATA,sha256=g87aQm1aw-2dlCEvss9CcQ4iNl1Bi_mlqafGIeR1AdU,1071
+lt_tensor-0.0.1a38.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+lt_tensor-0.0.1a38.dist-info/top_level.txt,sha256=35FuhFeXnUyvHWdbVHGPh0hS8euofafnJ_GJAVSF4Kk,10
+lt_tensor-0.0.1a38.dist-info/RECORD,,
{lt_tensor-0.0.1a36.dist-info → lt_tensor-0.0.1a38.dist-info}/WHEEL
File without changes
{lt_tensor-0.0.1a36.dist-info → lt_tensor-0.0.1a38.dist-info}/licenses/LICENSE
File without changes
{lt_tensor-0.0.1a36.dist-info → lt_tensor-0.0.1a38.dist-info}/top_level.txt
File without changes