lt-tensor 0.0.1a39__tar.gz → 0.0.1a40__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/PKG-INFO +1 -1
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/__init__.py +1 -1
- lt_tensor-0.0.1a40/lt_tensor/model_zoo/audio_models/bemaganv2/__init__.py +205 -0
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/model_zoo/audio_models/bigvgan/__init__.py +14 -39
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/model_zoo/audio_models/diffwave/__init__.py +20 -19
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/model_zoo/audio_models/hifigan/__init__.py +24 -44
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/model_zoo/audio_models/istft/__init__.py +15 -39
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/model_zoo/convs.py +35 -4
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/noise_tools.py +22 -13
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/processors/audio.py +115 -61
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor.egg-info/PKG-INFO +1 -1
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor.egg-info/SOURCES.txt +1 -0
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/setup.py +1 -1
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/LICENSE +0 -0
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/README.md +0 -0
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/config_templates.py +0 -0
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/losses.py +0 -0
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/lr_schedulers.py +0 -0
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/math_ops.py +0 -0
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/misc_utils.py +0 -0
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/model_base.py +0 -0
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/model_zoo/__init__.py +0 -0
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/model_zoo/activations/__init__.py +0 -0
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/model_zoo/activations/alias_free/__init__.py +0 -0
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/model_zoo/activations/alias_free/act.py +0 -0
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/model_zoo/activations/alias_free/filter.py +0 -0
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/model_zoo/activations/alias_free/resample.py +0 -0
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/model_zoo/activations/snake/__init__.py +0 -0
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/model_zoo/audio_models/__init__.py +0 -0
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/model_zoo/audio_models/resblocks.py +0 -0
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/model_zoo/basic.py +0 -0
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/model_zoo/features.py +0 -0
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/model_zoo/fusion.py +0 -0
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/model_zoo/losses/CQT/__init__.py +0 -0
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/model_zoo/losses/CQT/transforms.py +0 -0
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/model_zoo/losses/CQT/utils.py +0 -0
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/model_zoo/losses/__init__.py +0 -0
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/model_zoo/losses/_envelope_disc/__init__.py +0 -0
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/model_zoo/losses/discriminators.py +0 -0
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/model_zoo/pos_encoder.py +0 -0
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/model_zoo/residual.py +0 -0
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/model_zoo/transformer.py +0 -0
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/monotonic_align.py +0 -0
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/processors/__init__.py +0 -0
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/torch_commons.py +0 -0
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/transform.py +0 -0
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor.egg-info/dependency_links.txt +0 -0
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor.egg-info/requires.txt +0 -0
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor.egg-info/top_level.txt +0 -0
- {lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/setup.cfg +0 -0
lt_tensor-0.0.1a40/lt_tensor/model_zoo/audio_models/bemaganv2/__init__.py
ADDED
@@ -0,0 +1,205 @@
+from lt_utils.common import *
+from lt_tensor.torch_commons import *
+from lt_tensor.model_zoo.convs import ConvNets
+from lt_tensor.config_templates import ModelConfig
+from lt_utils.file_ops import is_file, load_json
+from lt_tensor.model_zoo.audio_models.resblocks import ResBlock1, ResBlock2
+from lt_tensor.model_zoo.activations import snake, alias_free
+from lt_tensor.model_zoo.audio_models.resblocks import AMPBlock1, AMPBlock2, get_snake
+
+
+class BemaGANv2Config(ModelConfig):
+    # Training params
+    in_channels: int = 80
+    upsample_rates: List[Union[int, List[int]]] = [8, 8, 2, 2]
+    upsample_kernel_sizes: List[Union[int, List[int]]] = [16, 16, 4, 4]
+    upsample_initial_channel: int = 1536
+    resblock_kernel_sizes: List[Union[int, List[int]]] = [3, 7, 11]
+    resblock_dilation_sizes: List[Union[int, List[int]]] = [
+        [1, 3, 5],
+        [1, 3, 5],
+        [1, 3, 5],
+    ]
+
+    activation: Literal["snake", "snakebeta"] = "snakebeta"
+    resblock_activation: Literal["snake", "snakebeta"] = "snakebeta"
+    resblock: int = 0
+    use_bias_at_final: bool = True
+    use_tanh_at_final: bool = True
+    snake_logscale: bool = True
+
+    def __init__(
+        self,
+        in_channels: int = 80,
+        upsample_rates: List[Union[int, List[int]]] = [8, 8, 2, 2],
+        upsample_kernel_sizes: List[Union[int, List[int]]] = [16, 16, 4, 4],
+        upsample_initial_channel: int = 1536,
+        resblock_kernel_sizes: List[Union[int, List[int]]] = [3, 7, 11],
+        resblock_dilation_sizes: List[Union[int, List[int]]] = [
+            [1, 3, 5],
+            [1, 3, 5],
+            [1, 3, 5],
+        ],
+        activation: Literal["snake", "snakebeta"] = "snakebeta",
+        resblock_activation: Literal["snake", "snakebeta"] = "snakebeta",
+        resblock: Union[int, str] = "1",
+        use_bias_at_final: bool = False,
+        use_tanh_at_final: bool = False,
+        *args,
+        **kwargs,
+    ):
+        settings = {
+            "in_channels": in_channels,
+            "upsample_rates": upsample_rates,
+            "upsample_kernel_sizes": upsample_kernel_sizes,
+            "upsample_initial_channel": upsample_initial_channel,
+            "resblock_kernel_sizes": resblock_kernel_sizes,
+            "resblock_dilation_sizes": resblock_dilation_sizes,
+            "activation": activation,
+            "resblock_activation": resblock_activation,
+            "resblock": resblock,
+            "use_bias_at_final": use_bias_at_final,
+            "use_tanh_at_final": use_tanh_at_final,
+        }
+        super().__init__(**settings)
+
+    def post_process(self):
+        if isinstance(self.resblock, str):
+            self.resblock = 0 if self.resblock == "1" else 1
+
+
+class BemaGANv2Generator(ConvNets):
+
+    def __init__(
+        self, cfg: Union[BemaGANv2Config, Dict[str, object]] = BemaGANv2Config()
+    ):
+        super().__init__()
+        cfg = cfg if isinstance(cfg, BemaGANv2Config) else BemaGANv2Config(**cfg)
+        self.cfg = cfg
+
+        actv = get_snake(self.cfg.activation)
+
+        self.num_kernels = len(cfg.resblock_kernel_sizes)
+        self.num_upsamples = len(cfg.upsample_rates)
+
+        self.conv_pre = weight_norm(
+            nn.Conv1d(cfg.num_mels, cfg.upsample_initial_channel, 7, 1, padding=3)
+        )
+
+        # define which AMPBlock to use. BigVGAN uses AMPBlock1 as default
+        resblock = AMPBlock1 if cfg.resblock == 0 else AMPBlock2
+
+        # transposed conv-based upsamplers. does not apply anti-aliasing
+        self.ups = nn.ModuleList()
+        for i, (u, k) in enumerate(zip(cfg.upsample_rates, cfg.upsample_kernel_sizes)):
+            self.ups.append(
+                nn.ModuleList(
+                    [
+                        weight_norm(
+                            nn.ConvTranspose1d(
+                                cfg.upsample_initial_channel // (2**i),
+                                cfg.upsample_initial_channel // (2 ** (i + 1)),
+                                k,
+                                u,
+                                padding=(k - u) // 2,
+                            )
+                        )
+                    ]
+                )
+            )
+        # residual blocks using anti-aliased multi-periodicity composition modules (AMP)
+        self.resblocks = nn.ModuleList()
+        for i in range(len(self.ups)):
+            ch = cfg.upsample_initial_channel // (2 ** (i + 1))
+            for k, d in zip(cfg.resblock_kernel_sizes, cfg.resblock_dilation_sizes):
+                self.resblocks.append(
+                    resblock(
+                        ch,
+                        k,
+                        d,
+                        snake_logscale=cfg.snake_logscale,
+                        activation=cfg.resblock_activation,
+                    )
+                )
+
+        self.activation_post = actv(ch, alpha_logscale=cfg.snake_logscale)
+        # post conv
+
+        self.conv_post = weight_norm(
+            nn.Conv1d(ch, 1, 7, 1, padding=3, bias=self.cfg.use_bias_at_final)
+        )
+        self._use_tanh = self.cfg.use_tanh_at_final
+
+        # weight initialization
+        for i in range(len(self.ups)):
+            self.ups[i].apply(self.init_weights)
+        self.conv_post.apply(self.init_weights)
+
+    def forward(self, x: Tensor):
+        # pre conv
+        x = self.conv_pre(x)
+
+        for i in range(self.num_upsamples):
+            # upsampling
+            for i_up in range(len(self.ups[i])):
+                x = self.ups[i][i_up](x)
+            # AMP blocks
+            xs = None
+            for j in range(self.num_kernels):
+                if xs is None:
+                    xs = self.resblocks[i * self.num_kernels + j](x)
+                else:
+                    xs += self.resblocks[i * self.num_kernels + j](x)
+            x = xs / self.num_kernels
+
+        # post conv
+        x = self.activation_post(x)
+        x = self.conv_post(x)
+        if self._use_tanh:
+            return x.tanh()
+        return x
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        model_file: PathLike,
+        model_config: Union[
+            BemaGANv2Config, Dict[str, Any], PathLike
+        ] = BemaGANv2Config(),
+        *,
+        remove_norms: bool = False,
+        strict: bool = True,
+        map_location: str = "cpu",
+        weights_only: bool = False,
+        **kwargs,
+    ):
+
+        is_file(model_file, validate=True)
+        model_state_dict = torch.load(
+            model_file,
+            weights_only=weights_only,
+            map_location=map_location,
+        )
+
+        if isinstance(model_config, BemaGANv2Config):
+            h = model_config
+        elif isinstance(model_config, dict):
+            h = BemaGANv2Config(**model_config)
+        elif isinstance(model_config, (str, Path, bytes)):
+            h = BemaGANv2Config(
+                **load_json(model_config, BemaGANv2Config().state_dict())
+            )
+
+        model = cls(h)
+        if remove_norms:
+            model.remove_norms()
+        try:
+            model.load_state_dict(model_state_dict, strict=strict)
+            return model
+        except RuntimeError:
+            print(
+                f"[INFO] the pretrained checkpoint does not contain weight norm. Loading the checkpoint after removing weight norm!"
+            )
+            model.remove_norms()
+            model.load_state_dict(model_state_dict, strict=strict)
+            return model
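Taken together, the new module follows the BigVGAN-style generator API: a config object (or plain dict) in, a waveform out. A minimal usage sketch based only on the signatures above; the mel tensor layout is an assumption, and note that the constructor reads cfg.num_mels, which is not among the config defaults listed here, so the config may need to supply that field upstream:

import torch
from lt_tensor.model_zoo.audio_models.bemaganv2 import (
    BemaGANv2Config,
    BemaGANv2Generator,
)

# Config may be passed as an object or a plain dict (converted internally).
model = BemaGANv2Generator(BemaGANv2Config())

mel = torch.randn(1, 80, 100)  # (batch, n_mels, frames) -- assumed layout
wave = model(mel)              # tanh-squashed when use_tanh_at_final is True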
{lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/model_zoo/audio_models/bigvgan/__init__.py
RENAMED
@@ -2,9 +2,9 @@ from lt_utils.common import *
 from lt_tensor.torch_commons import *
 from lt_tensor.model_zoo.convs import ConvNets
 from lt_tensor.config_templates import ModelConfig
-from lt_tensor.model_zoo.activations import
+from lt_tensor.model_zoo.activations import alias_free
 from lt_tensor.model_zoo.audio_models.resblocks import AMPBlock1, AMPBlock2, get_snake
-from lt_utils.file_ops import load_json, is_file
+from lt_utils.file_ops import load_json, is_file
 
 
 class BigVGANConfig(ModelConfig):
@@ -78,8 +78,9 @@ class BigVGAN(ConvNets):
 
     """
 
-    def __init__(self, cfg: BigVGANConfig):
+    def __init__(self, cfg: Union[BigVGANConfig, Dict[str, object]] = BigVGANConfig()):
         super().__init__()
+        cfg = cfg if isinstance(cfg, BigVGANConfig) else BigVGANConfig(**cfg)
         self.cfg = cfg
         actv = get_snake(self.cfg.activation)
 
@@ -173,46 +174,16 @@ class BigVGAN(ConvNets):
             return x.tanh()
         return x.clamp(min=-1.0, max=1.0)
 
-    def load_weights(
-        self,
-        path,
-        strict=False,
-        assign=False,
-        weights_only=False,
-        mmap=None,
-        raise_if_not_exists=False,
-        **pickle_load_args,
-    ):
-        try:
-            return super().load_weights(
-                path,
-                raise_if_not_exists,
-                strict,
-                assign,
-                weights_only,
-                mmap,
-                **pickle_load_args,
-            )
-        except RuntimeError:
-            self.remove_norms()
-            return super().load_weights(
-                path,
-                raise_if_not_exists,
-                strict,
-                assign,
-                weights_only,
-                mmap,
-                **pickle_load_args,
-            )
-
     @classmethod
     def from_pretrained(
         cls,
         model_file: PathLike,
-        model_config: Union[
+        model_config: Union[
+            BigVGANConfig, Dict[str, Any], Dict[str, Any], PathLike
+        ] = BigVGANConfig(),
         *,
         remove_norms: bool = False,
-        strict: bool =
+        strict: bool = True,
         map_location: str = "cpu",
         weights_only: bool = False,
         **kwargs,
@@ -220,13 +191,17 @@ class BigVGAN(ConvNets):
 
         is_file(model_file, validate=True)
         model_state_dict = torch.load(
-            model_file,
+            model_file,
+            weights_only=weights_only,
+            map_location=map_location,
         )
 
         if isinstance(model_config, BigVGANConfig):
             h = model_config
-
+        elif isinstance(model_config, dict):
             h = BigVGANConfig(**model_config)
+        elif isinstance(model_config, (str, Path, bytes)):
+            h = BigVGANConfig(**load_json(model_config, BigVGANConfig().state_dict()))
 
         model = cls(h)
         if remove_norms:
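The practical effect of the from_pretrained rework is that model_config may now be a BigVGANConfig instance, a plain dict, or a path to a JSON file merged over the defaults via load_json. A hedged sketch of the three call styles (file names are placeholders; the dict key is assumed from the HiFi-GAN-style configs elsewhere in this diff):

from lt_tensor.model_zoo.audio_models.bigvgan import BigVGAN, BigVGANConfig

m1 = BigVGAN.from_pretrained("gen.pt", BigVGANConfig())                    # config object
m2 = BigVGAN.from_pretrained("gen.pt", {"upsample_initial_channel": 512})  # plain dict
m3 = BigVGAN.from_pretrained("gen.pt", "config.json")                      # JSON file path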
{lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/model_zoo/audio_models/diffwave/__init__.py
RENAMED
@@ -177,43 +177,44 @@ class ResidualBlock(Model):
 
 
 class DiffWave(Model):
-    def __init__(self,
+    def __init__(self, cfg: Union[DiffWaveConfig, dict[str, object]] = DiffWaveConfig()):
         super().__init__()
-
-        self.
+        cfg = cfg if isinstance(cfg, DiffWaveConfig) else DiffWaveConfig(**cfg)
+        self.cfg = cfg
+        self.n_hop = self.cfg.hop_samples
         self.input_projection = ConvEXT(
             in_channels=1,
-            out_channels=
+            out_channels=cfg.residual_channels,
             kernel_size=1,
-            apply_norm=self.
+            apply_norm=self.cfg.apply_norm,
             activation_out=nn.LeakyReLU(0.1),
         )
-        self.diffusion_embedding = DiffusionEmbedding(len(
+        self.diffusion_embedding = DiffusionEmbedding(len(cfg.noise_schedule))
         self.spectrogram_upsampler = (
-            SpectrogramUpsampler() if not self.
+            SpectrogramUpsampler() if not self.cfg.unconditional else None
         )
 
         self.residual_layers = nn.ModuleList(
             [
                 ResidualBlock(
-
-
-                    2 ** (i %
-                    uncond=
-                    apply_norm=self.
+                    cfg.n_mels,
+                    cfg.residual_channels,
+                    2 ** (i % cfg.dilation_cycle_length),
+                    uncond=cfg.unconditional,
+                    apply_norm=self.cfg.apply_norm_resblock,
                 )
-                for i in range(
+                for i in range(cfg.residual_layers)
             ]
         )
         self.skip_projection = ConvEXT(
-            in_channels=
-            out_channels=
+            in_channels=cfg.residual_channels,
+            out_channels=cfg.residual_channels,
             kernel_size=1,
-            apply_norm=self.
+            apply_norm=self.cfg.apply_norm,
             activation_out=nn.LeakyReLU(0.1),
         )
         self.output_projection = ConvEXT(
-
+            cfg.residual_channels, 1, 1, apply_norm=self.cfg.apply_norm, init_weights=True,
         )
         self.activation = nn.LeakyReLU(0.1)
         self._res_d = sqrt(len(self.residual_layers))
@@ -224,7 +225,7 @@ class DiffWave(Model):
         diffusion_step: Tensor,
         spectrogram: Optional[Tensor] = None,
     ):
-        if not self.
+        if not self.cfg.unconditional:
             assert spectrogram is not None
         if audio.ndim < 3:
             if audio.ndim == 2:
@@ -234,7 +235,7 @@ class DiffWave(Model):
 
         x = self.input_projection(audio)
         diffusion_step = self.diffusion_embedding(diffusion_step)
-        if not self.
+        if not self.cfg.unconditional:  # use conditional model
             spectrogram = self.spectrogram_upsampler(spectrogram)
 
         skip = torch.zeros_like(x, device=x.device)
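With the previously truncated constructor restored, DiffWave now takes its whole configuration as a single DiffWaveConfig (or dict). A usage sketch; the tensor shapes and diffusion-step range below are assumptions, not taken from the diff:

import torch
from lt_tensor.model_zoo.audio_models.diffwave import DiffWave, DiffWaveConfig

model = DiffWave(DiffWaveConfig())   # a plain dict would also be accepted
audio = torch.randn(1, 1, 16000)     # assumed (batch, 1, samples) layout
step = torch.randint(0, 50, (1,))    # assumed index into cfg.noise_schedule
mel = torch.randn(1, 80, 63)         # required while cfg.unconditional is False
out = model(audio, step, mel)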
{lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/model_zoo/audio_models/hifigan/__init__.py
RENAMED
@@ -5,7 +5,7 @@ from lt_utils.common import *
 from lt_tensor.torch_commons import *
 from lt_tensor.model_zoo.convs import ConvNets
 from lt_tensor.config_templates import ModelConfig
-from lt_utils.file_ops import is_file
+from lt_utils.file_ops import is_file, load_json
 from lt_tensor.model_zoo.audio_models.resblocks import ResBlock1, ResBlock2
 
 
@@ -16,11 +16,15 @@ def get_padding(kernel_size, dilation=1):
 class HifiganConfig(ModelConfig):
     # Training params
     in_channels: int = 80
-    upsample_rates: List[Union[int, List[int]]] = [8,8,2,2]
-    upsample_kernel_sizes: List[Union[int, List[int]]] = [16,16,4,4]
+    upsample_rates: List[Union[int, List[int]]] = [8, 8, 2, 2]
+    upsample_kernel_sizes: List[Union[int, List[int]]] = [16, 16, 4, 4]
     upsample_initial_channel: int = 512
     resblock_kernel_sizes: List[Union[int, List[int]]] = [3, 7, 11]
-    resblock_dilation_sizes: List[Union[int, List[int]]] = [
+    resblock_dilation_sizes: List[Union[int, List[int]]] = [
+        [1, 3, 5],
+        [1, 3, 5],
+        [1, 3, 5],
+    ]
 
     activation: nn.Module = nn.LeakyReLU(0.1)
     resblock_activation: nn.Module = nn.LeakyReLU(0.1)
@@ -29,10 +33,10 @@ class HifiganConfig(ModelConfig):
     def __init__(
         self,
         in_channels: int = 80,
-        upsample_rates: List[Union[int, List[int]]] = [8,8,2,2],
-        upsample_kernel_sizes: List[Union[int, List[int]]] = [16,16,4,4],
+        upsample_rates: List[Union[int, List[int]]] = [8, 8, 2, 2],
+        upsample_kernel_sizes: List[Union[int, List[int]]] = [16, 16, 4, 4],
         upsample_initial_channel: int = 512,
-        resblock_kernel_sizes: List[Union[int, List[int]]] = [3,7,11],
+        resblock_kernel_sizes: List[Union[int, List[int]]] = [3, 7, 11],
         resblock_dilation_sizes: List[Union[int, List[int]]] = [
             [1, 3, 5],
             [1, 3, 5],
@@ -63,9 +67,11 @@ class HifiganConfig(ModelConfig):
 
 
 class HifiganGenerator(ConvNets):
-    def __init__(self, cfg: HifiganConfig = HifiganConfig()):
+    def __init__(self, cfg: Union[HifiganConfig, Dict[str, object]] = HifiganConfig()):
         super().__init__()
+        cfg = cfg if isinstance(cfg, HifiganConfig) else HifiganConfig(**cfg)
         self.cfg = cfg
+
         self.num_kernels = len(cfg.resblock_kernel_sizes)
         self.num_upsamples = len(cfg.upsample_rates)
         self.conv_pre = weight_norm(
@@ -115,46 +121,16 @@ class HifiganGenerator(ConvNets):
         x = self.conv_post(self.activation(x))
         return x.tanh()
 
-    def load_weights(
-        self,
-        path,
-        strict=False,
-        assign=False,
-        weights_only=False,
-        mmap=None,
-        raise_if_not_exists=False,
-        **pickle_load_args,
-    ):
-        try:
-            return super().load_weights(
-                path,
-                raise_if_not_exists,
-                strict,
-                assign,
-                weights_only,
-                mmap,
-                **pickle_load_args,
-            )
-        except RuntimeError:
-            self.remove_norms()
-            return super().load_weights(
-                path,
-                raise_if_not_exists,
-                strict,
-                assign,
-                weights_only,
-                mmap,
-                **pickle_load_args,
-            )
-
     @classmethod
     def from_pretrained(
         cls,
         model_file: PathLike,
-        model_config: Union[
+        model_config: Union[
+            HifiganConfig, Dict[str, Any], Dict[str, Any], PathLike
+        ] = HifiganConfig(),
         *,
         remove_norms: bool = False,
-        strict: bool =
+        strict: bool = True,
         map_location: str = "cpu",
         weights_only: bool = False,
         **kwargs,
@@ -162,13 +138,17 @@ class HifiganGenerator(ConvNets):
 
         is_file(model_file, validate=True)
         model_state_dict = torch.load(
-            model_file,
+            model_file,
+            weights_only=weights_only,
+            map_location=map_location,
         )
 
         if isinstance(model_config, HifiganConfig):
             h = model_config
-
+        elif isinstance(model_config, dict):
             h = HifiganConfig(**model_config)
+        elif isinstance(model_config, (str, Path, bytes)):
+            h = HifiganConfig(**load_json(model_config, HifiganConfig().state_dict()))
 
         model = cls(h)
         if remove_norms:
{lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/model_zoo/audio_models/istft/__init__.py
RENAMED
@@ -3,7 +3,7 @@ from lt_utils.common import *
 from lt_tensor.torch_commons import *
 from lt_tensor.model_zoo.convs import ConvNets
 from lt_tensor.config_templates import ModelConfig
-from lt_utils.file_ops import is_file
+from lt_utils.file_ops import is_file, load_json
 from lt_tensor.model_zoo.audio_models.resblocks import ResBlock1, ResBlock2
 
 
@@ -67,8 +67,11 @@ class iSTFTNetConfig(ModelConfig):
 
 
 class iSTFTNetGenerator(ConvNets):
-    def __init__(
+    def __init__(
+        self, cfg: Union[iSTFTNetConfig, Dict[str, object]] = iSTFTNetConfig()
+    ):
         super().__init__()
+        cfg = cfg if isinstance(cfg, iSTFTNetConfig) else iSTFTNetConfig(**cfg)
         self.cfg = cfg
         self.num_kernels = len(cfg.resblock_kernel_sizes)
         self.num_upsamples = len(cfg.upsample_rates)
@@ -146,46 +149,16 @@ class iSTFTNetGenerator(ConvNets):
 
         return spec, phase
 
-    def load_weights(
-        self,
-        path,
-        strict=False,
-        assign=False,
-        weights_only=False,
-        mmap=None,
-        raise_if_not_exists=False,
-        **pickle_load_args,
-    ):
-        try:
-            return super().load_weights(
-                path,
-                raise_if_not_exists,
-                strict,
-                assign,
-                weights_only,
-                mmap,
-                **pickle_load_args,
-            )
-        except RuntimeError:
-            self.remove_norms()
-            return super().load_weights(
-                path,
-                raise_if_not_exists,
-                strict,
-                assign,
-                weights_only,
-                mmap,
-                **pickle_load_args,
-            )
-
     @classmethod
     def from_pretrained(
         cls,
         model_file: PathLike,
-        model_config: Union[
+        model_config: Union[
+            iSTFTNetConfig, Dict[str, Any], Dict[str, Any], PathLike
+        ] = iSTFTNetConfig(),
         *,
         remove_norms: bool = False,
-        strict: bool =
+        strict: bool = True,
         map_location: str = "cpu",
         weights_only: bool = False,
         **kwargs,
@@ -193,14 +166,17 @@ class iSTFTNetGenerator(ConvNets):
 
         is_file(model_file, validate=True)
         model_state_dict = torch.load(
-            model_file,
+            model_file,
+            weights_only=weights_only,
+            map_location=map_location,
        )
 
         if isinstance(model_config, iSTFTNetConfig):
             h = model_config
-
+        elif isinstance(model_config, dict):
             h = iSTFTNetConfig(**model_config)
-
+        elif isinstance(model_config, (str, Path, bytes)):
+            h = iSTFTNetConfig(**load_json(model_config, iSTFTNetConfig().state_dict()))
         model = cls(h)
         if remove_norms:
             model.remove_norms()
{lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/model_zoo/convs.py
RENAMED
@@ -1,11 +1,7 @@
 __all__ = ["ConvNets", "ConvEXT"]
-import math
 from lt_utils.common import *
-import torch.nn.functional as F
 from lt_tensor.torch_commons import *
 from lt_tensor.model_base import Model
-from lt_tensor.misc_utils import log_tensor
-from lt_tensor.model_zoo.fusion import AdaFusion1D, AdaIN1D
 from lt_utils.misc_utils import default
 
 
@@ -52,6 +48,41 @@ class ConvNets(Model):
         if "Conv" in m.__class__.__name__:
             m.weight.data.normal_(mean, std)
 
+    def load_weights(
+        self,
+        path,
+        strict=False,
+        assign=False,
+        weights_only=False,
+        mmap=None,
+        raise_if_not_exists=False,
+        **pickle_load_args,
+    ):
+        try:
+            return super().load_weights(
+                path,
+                raise_if_not_exists,
+                strict,
+                assign,
+                weights_only,
+                mmap,
+                **pickle_load_args,
+            )
+        except RuntimeError as e:
+            try:
+                self.remove_norms()
+                return super().load_weights(
+                    path,
+                    raise_if_not_exists,
+                    strict,
+                    assign,
+                    weights_only,
+                    mmap,
+                    **pickle_load_args,
+                )
+            except:
+                raise e
+
 
 class ConvEXT(ConvNets):
     def __init__(
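Hoisting load_weights onto ConvNets deduplicates the three per-generator copies deleted elsewhere in this diff: every subclass now inherits one fallback path that calls remove_norms() and retries when the first load raises RuntimeError (typically a weight-norm key mismatch), re-raising the original error if the retry also fails. A sketch of the resulting call-site behavior (the checkpoint name is a placeholder):

from lt_tensor.model_zoo.audio_models.hifigan import HifiganGenerator

model = HifiganGenerator()
# Loads whether or not the checkpoint was saved with weight-norm keys:
# a key mismatch triggers remove_norms() and a second attempt.
model.load_weights("generator.pt", strict=False)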
{lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/noise_tools.py
RENAMED
@@ -13,6 +13,7 @@ __all__ = [
 ]
 
 from lt_utils.common import *
+from lt_tensor.model_base import Model
 import torch.nn.functional as F
 from lt_tensor.torch_commons import *
 import math
@@ -20,17 +21,17 @@ import random
 from lt_tensor.misc_utils import set_seed
 
 
-def add_gaussian_noise(x: Tensor, noise_level=0.025):
+def add_gaussian_noise(x: Tensor, noise_level: float = 0.025) -> Tensor:
     noise = torch.randn_like(x) * noise_level
     return x + noise
 
 
-def add_uniform_noise(x: Tensor, noise_level=0.025):
+def add_uniform_noise(x: Tensor, noise_level: float = 0.025) -> Tensor:
     noise = (torch.rand_like(x) - 0.5) * 2 * noise_level
     return x + noise
 
 
-def add_linear_noise(x, noise_level=0.05):
+def add_linear_noise(x, noise_level=0.05) -> Tensor:
     T = x.shape[-1]
     ramp = torch.linspace(0, noise_level, T, device=x.device)
     for _ in range(x.dim() - 1):
@@ -38,7 +39,7 @@ def add_linear_noise(x, noise_level=0.05):
     return x + ramp.expand_as(x)
 
 
-def add_impulse_noise(x: Tensor, noise_level=0.025):
+def add_impulse_noise(x: Tensor, noise_level: float = 0.025) -> Tensor:
     # For image inputs
     probs = torch.rand_like(x)
     x_clone = x.detach().clone()
@@ -47,7 +48,7 @@ def add_impulse_noise(x: Tensor, noise_level=0.025):
     return x_clone
 
 
-def add_pink_noise(x: Tensor, noise_level=0.05):
+def add_pink_noise(x: Tensor, noise_level: float = 0.05) -> Tensor:
     # pink noise: divide freq spectrum by sqrt(f)
     if x.ndim == 3:
         x = x.view(-1, x.shape[-1])  # flatten to 2D [B*M, T]
@@ -66,12 +67,12 @@ def add_pink_noise(x: Tensor, noise_level=0.05):
     return x + pink_noised * noise_level
 
 
-def add_clipped_gaussian_noise(x, noise_level=0.025):
+def add_clipped_gaussian_noise(x: Tensor, noise_level: float = 0.025) -> Tensor:
     noise = torch.randn_like(x) * noise_level
     return torch.clamp(x + noise, 0.0, 1.0)
 
 
-def add_multiplicative_noise(x, noise_level=0.025):
+def add_multiplicative_noise(x: Tensor, noise_level: float = 0.025) -> Tensor:
     noise = 1 + torch.randn_like(x) * noise_level
     return x * noise
 
@@ -109,7 +110,15 @@ _NOISE_DIM_SUPPORT = {
 
 def apply_noise(
     x: Tensor,
-    noise_type:
+    noise_type: Literal[
+        "gaussian",
+        "uniform",
+        "linear",
+        "impulse",
+        "pink",
+        "clipped_gaussian",
+        "multiplicative",
+    ] = "gaussian",
     noise_level: float = 0.01,
     seed: Optional[int] = None,
     on_error: Literal["raise", "try_others", "return_unchanged"] = "raise",
@@ -229,11 +238,11 @@ class NoiseSchedulerA(nn.Module):
         return collected, noise_history
 
 
-class NoiseSchedulerB(
-    def __init__(self, timesteps: int =
+class NoiseSchedulerB(Model):
+    def __init__(self, timesteps: int = 50, l_min: float = 0.0005, l_max: float = 0.05):
         super().__init__()
 
-        betas = torch.linspace(
+        betas = torch.linspace(l_min, l_max, timesteps)
         alphas = 1.0 - betas
         alpha_cumprod = torch.cumprod(alphas, dim=0)
 
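The restored constructor is a standard linear (DDPM-style) beta schedule. A standalone sketch of the same buffer computation, independent of the class:

import torch

timesteps, l_min, l_max = 50, 0.0005, 0.05
betas = torch.linspace(l_min, l_max, timesteps)  # per-step noise variance
alphas = 1.0 - betas
alpha_cumprod = torch.cumprod(alphas, dim=0)     # lets q(x_t | x_0) be sampled in one shot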
@@ -272,7 +281,7 @@ class NoiseSchedulerB(nn.Module):
         self, x_0: Tensor, t: int, noise: Optional[Union[Tensor, float]] = None
     ) -> Tensor:
         assert (
-
+            0 <= t < self.timesteps
         ), f"Time step t={t} is out of bounds for scheduler with {self.timesteps} steps."
 
         if noise is None:
@@ -286,7 +295,7 @@ class NoiseSchedulerB(nn.Module):
         return alpha_term + noise_term
 
 
-class NoiseSchedulerC(
+class NoiseSchedulerC(Model):
     def __init__(self, timesteps: int = 512):
         super().__init__()
 
{lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor/processors/audio.py
RENAMED
@@ -92,9 +92,17 @@ def _comp_rms_helper(i: int, audio: Tensor, mel: Optional[Tensor]):
 
 
 class AudioProcessor(Model):
-    def __init__(
+    def __init__(
+        self,
+        config: Union[AudioProcessorConfig, Dict[str, Any]] = AudioProcessorConfig(),
+    ):
         super().__init__()
-
+        assert isinstance(config, (AudioProcessorConfig, dict))
+        self.cfg = (
+            config
+            if isinstance(config, AudioProcessorConfig)
+            else AudioProcessorConfig(**config)
+        )
         self._mel_spec_torch = torchaudio.transforms.MelSpectrogram(
             sample_rate=self.cfg.sample_rate,
             n_mels=self.cfg.n_mels,
@@ -108,14 +116,6 @@ class AudioProcessor(Model):
             normalized=self.cfg.normalized,
         )
 
-        self._mel_rscale = torchaudio.transforms.InverseMelScale(
-            n_stft=self.cfg.n_stft,
-            n_mels=self.cfg.n_mels,
-            sample_rate=self.cfg.sample_rate,
-            f_min=self.cfg.f_min,
-            f_max=self.cfg.f_max,
-            mel_scale=self.cfg.mel_scale,
-        )
         self.mel_lib_padding = (self.cfg.n_fft - self.cfg.hop_length) // 2
         self.register_buffer(
             "window",
@@ -134,10 +134,10 @@ class AudioProcessor(Model):
             ).float(),
         )
 
-    def spectral_norm(self, x: Tensor, c: int = 1, eps: float = 1e-5):
+    def spectral_norm(self, x: Tensor, c: int = 1, eps: float = 1e-5) -> Tensor:
         return torch.log(torch.clamp(x, min=eps) * c)
 
-    def spectral_de_norm(self, x: Tensor, c: int = 1):
+    def spectral_de_norm(self, x: Tensor, c: int = 1) -> Tensor:
         return torch.exp(x) / c
 
     def log_norm(
@@ -201,7 +201,7 @@ class AudioProcessor(Model):
         spectral_norm: bool = False,
         *args,
         **kwargs,
-    ):
+    ) -> Tensor:
         if wave.ndim == 1:
             wave = wave.unsqueeze(0)
         wave = torch.nn.functional.pad(
@@ -232,15 +232,6 @@ class AudioProcessor(Model):
             return self.spectral_norm(results, eps=eps).squeeze()
         return results.squeeze()
 
-    def compute_inverse_mel(self, melspec: Tensor, *, _recall=False):
-        try:
-            return self._mel_rscale.forward(melspec.to(self.device)).squeeze()
-        except RuntimeError as e:
-            if not _recall:
-                self._mel_rscale.to(self.device)
-                return self.compute_inverse_mel(melspec, _recall=True)
-            raise e
-
     def compute_rms(
         self,
         audio: Optional[Union[Tensor, np.ndarray]] = None,
@@ -248,7 +239,7 @@ class AudioProcessor(Model):
         frame_length: Optional[int] = None,
         hop_length: Optional[int] = None,
         center: Optional[int] = None,
-    ):
+    ) -> Tensor:
         assert any([audio is not None, mel is not None])
         rms_kwargs = dict(
             frame_length=default(frame_length, self.cfg.n_fft),
@@ -297,7 +288,7 @@ class AudioProcessor(Model):
         audio: torch.Tensor,
         sample_rate: Optional[int] = None,
         n_steps: float = 2.0,
-    ):
+    ) -> Tensor:
         """
         Shifts the pitch of an audio tensor by `n_steps` semitones.
 
@@ -327,24 +318,19 @@ class AudioProcessor(Model):
             device=src_device, dtype=src_dtype
         )
 
-    @staticmethod
-    def calc_pitch_fmin(sr: int, frame_length: float):
-        """For pitch f_min"""
-        return (sr / (frame_length - 1)) * 2
-
     def compute_pitch(
         self,
         audio: Tensor,
         *,
         pad_mode: str = "constant",
         trough_threshold: float = 0.1,
-        fmin:
-        fmax:
+        fmin: float = librosa.note_to_hz("C2"),
+        fmax: float = librosa.note_to_hz("C7"),
         sr: Optional[float] = None,
         frame_length: Optional[int] = None,
        hop_length: Optional[int] = None,
         center: Optional[bool] = None,
-    ):
+    ) -> Tensor:
         default_dtype = audio.dtype
         default_device = audio.device
         if audio.ndim > 1:
@@ -353,10 +339,7 @@ class AudioProcessor(Model):
             B = 1
         sr = default(sr, self.cfg.sample_rate)
         frame_length = default(frame_length, self.cfg.n_fft)
-
-            default(fmin, self.cfg.default_f_min), self.calc_pitch_fmin(sr, frame_length)
-        )
-        fmax = min(max(default(fmax, self.cfg.default_f_max), fmin + 1), sr // 2)
+        fmax = min(max(fmax, fmin + 1), sr // 2)
         hop_length = default(hop_length, self.cfg.hop_length)
         center = default(center, self.cfg.center)
         yn_kwargs = dict(
@@ -391,8 +374,8 @@ class AudioProcessor(Model):
         sr: Optional[float] = None,
         win_length: Optional[Number] = None,
         frame_length: Optional[Number] = None,
-    ):
-        sr = default(sr, self.sample_rate)
+    ) -> Tensor:
+        sr = default(sr, self.cfg.sample_rate)
         win_length = default(win_length, self.cfg.win_length)
         frame_length = default(frame_length, self.cfg.n_fft)
         fmin = default(fmin, self.calc_pitch_fmin(sr, frame_length))
@@ -411,7 +394,7 @@ class AudioProcessor(Model):
         array: np.ndarray,
         device: Optional[torch.device] = None,
         dtype: Optional[torch.dtype] = None,
-    ):
+    ) -> Tensor:
         converted = torch.from_numpy(array)
         if device is None:
             device = self.device
@@ -422,13 +405,13 @@ class AudioProcessor(Model):
         arrays: List[np.ndarray],
         device: Optional[torch.device] = None,
         dtype: Optional[torch.dtype] = None,
-    ):
+    ) -> Tensor:
         stacked = torch.stack([torch.from_numpy(x) for x in arrays])
         if device is None:
             device = self.device
         return stacked.to(device=device, dtype=dtype)
 
-    def to_numpy_safe(self, tensor: Union[Tensor, np.ndarray]):
+    def to_numpy_safe(self, tensor: Union[Tensor, np.ndarray]) -> np.ndarray:
         if isinstance(tensor, np.ndarray):
             return tensor
         return tensor.detach().to(DEFAULT_DEVICE).numpy(force=True)
@@ -450,7 +433,7 @@ class AudioProcessor(Model):
         scale_factor: Optional[list[float]] = None,
         recompute_scale_factor: Optional[bool] = None,
         antialias: bool = False,
-    ):
+    ) -> Tensor:
         """
         The modes available for upsampling are: `nearest`, `linear` (3D-only),
         `bilinear`, `bicubic` (4D-only), `trilinear` (5D-only)
@@ -482,7 +465,7 @@ class AudioProcessor(Model):
         normalized: Optional[bool] = None,
         onesided: Optional[bool] = None,
         return_complex: bool = False,
-    ):
+    ) -> Tensor:
         """Util for models that needs to reconstruct the audio using inverse stft"""
         window = (
             torch.hann_window(win_length, device=spec.device)
@@ -513,7 +496,7 @@ class AudioProcessor(Model):
         normalized: Optional[bool] = None,
         onesided: Optional[bool] = None,
         return_complex: bool = False,
-    ):
+    ) -> Tensor:
         window = (
             torch.hann_window(win_length, device=wave.device)
             if win_length is not None and win_length != self.cfg.win_length
@@ -544,7 +527,7 @@ class AudioProcessor(Model):
         normalized: Optional[bool] = None,
         onesided: Optional[bool] = None,
         return_complex: bool = True,
-    ):
+    ) -> Tensor:
 
         window = (
             torch.hann_window(win_length, device=wave.device)
@@ -579,7 +562,7 @@ class AudioProcessor(Model):
         normalized: Optional[bool] = None,
         onesided: Optional[bool] = None,
         return_complex: bool = False,
-    ):
+    ) -> Tensor:
         window = (
             torch.hann_window(win_length, device=wave.device)
             if win_length is not None and win_length != self.cfg.win_length
@@ -619,12 +602,11 @@ class AudioProcessor(Model):
         self,
         path: PathLike,
         top_db: Optional[float] = None,
-        normalize: bool = False,
         mono: bool = True,
+        istft_norm: bool = True,
+        lib_norm: bool = False,
         *,
         sample_rate: Optional[float] = None,
-        hop_length: int = 512,
-        frame_length: int = 2048,
         duration: Optional[float] = None,
         offset: float = 0.0,
         dtype: Any = np.float32,
@@ -649,14 +631,6 @@ class AudioProcessor(Model):
             dtype=dtype,
             res_type=res_type,
         )
-        if top_db is not None:
-            wave, _ = librosa.effects.trim(
-                wave,
-                top_db=top_db,
-                ref=ref,
-                frame_length=frame_length,
-                hop_length=hop_length,
-            )
         if sr != sample_rate:
             wave = librosa.resample(
                 wave,
@@ -667,7 +641,9 @@ class AudioProcessor(Model):
                 scale=scale,
                 axis=axis,
             )
-        if
+        if top_db is not None:
+            wave, _ = librosa.effects.trim(wave, top_db=top_db)
+        if lib_norm:
             wave = librosa.util.normalize(
                 wave,
                 norm=norm,
@@ -675,6 +651,9 @@ class AudioProcessor(Model):
                 threshold=norm_threshold,
                 fill=norm_fill,
             )
+        results = torch.from_numpy(wave).float().unsqueeze(0).to(self.device)
+        if istft_norm:
+            results = self.istft_norm(results)
         return torch.from_numpy(wave).float().unsqueeze(0).to(self.device)
 
     def find_audios(
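The load_audio rework reorders the pipeline (resample, then trim, then normalize) and splits the old normalize flag in two: lib_norm for librosa peak normalization and istft_norm for the processor's own STFT round-trip normalization. A call sketch with a placeholder path:

from lt_tensor.processors.audio import AudioProcessor

ap = AudioProcessor()
wave = ap.load_audio("speech.wav", top_db=30.0, lib_norm=True, istft_norm=False)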
@@ -701,9 +680,84 @@ class AudioProcessor(Model):
             maximum,
         )
 
+    def audio_to_half(self, audio: Tensor):
+        audio = self.to_numpy_safe(audio)
+        data: np.ndarray = audio / np.abs(audio).max()
+        data = (data * 32767.0).astype(np.int16)
+        return self.from_numpy(data, dtype=torch.float16)
+
     def forward(
         self,
-
-
+        x: Union[str, Path, Tensor],
+        *,
+        spectral_norm: bool = False,
+        add_batch_to_all: bool = False,
+        wave_batch_dim: bool = False,
+        mel_batch_dim: bool = False,
+        pitch_batch_dim: bool = False,
+        rms_batch_dim: bool = False,
+        spec_phase_batch_dim: bool = False,
     ):
-
+        results = {
+            "wave": None,
+            "mel": None,
+            "pitch": None,
+            "rms": None,
+            "spec": None,
+            "phase": None,
+        }
+        results["wave"] = (
+            x.squeeze()
+            if isinstance(x, Tensor)
+            else self.load_audio(x, istft_norm=True).squeeze()
+        )
+        results["mel"] = self.compute_mel_librosa(
+            wave=(
+                results["wave"]
+                if results["wave"].ndim == 3
+                else results["wave"].unsqueeze(0)
+            ),
+            spectral_norm=spectral_norm,
+        ).squeeze()
+        try:
+            results["pitch"] = self.compute_pitch(results["wave"]).squeeze()
+        except Exception as e:
+            results["pitch"] = e
+        try:
+            results["rms"] = self.compute_rms(results["wave"], results["mel"]).squeeze()
+        except Exception as e:
+            results["rms"] = e
+        try:
+            sp_ph = self.stft(results["wave"], return_complex=False)
+            spec, phase = sp_ph.split(1, -1)
+            results["spec"] = spec.squeeze()
+            results["phase"] = phase.squeeze()
+        except Exception as e:
+            results["spec"] = e
+            results["phase"] = e
+
+        if (add_batch_to_all or wave_batch_dim) and results["wave"].ndim == 1:
+            results["wave"] = results["wave"].unsqueeze(0)
+        if (add_batch_to_all or mel_batch_dim) and results["mel"].ndim == 2:
+            results["mel"] = results["mel"].unsqueeze(0)
+        if (
+            isinstance(results["rms"], Tensor)
+            and (add_batch_to_all or rms_batch_dim)
+            and results["rms"].ndim == 1
+        ):
+            results["rms"] = results["rms"].unsqueeze(0)
+        if (
+            isinstance(results["pitch"], Tensor)
+            and (add_batch_to_all or pitch_batch_dim)
+            and results["pitch"].ndim == 1
+        ):
+            results["pitch"] = results["pitch"].unsqueeze(0)
+        if (
+            isinstance(results["spec"], Tensor)
+            and (add_batch_to_all or spec_phase_batch_dim)
+            and results["spec"].ndim == 2
+        ):
+            results["spec"] = results["spec"].unsqueeze(0)
+            results["phase"] = results["phase"].unsqueeze(0)
+
+        return results
{lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/lt_tensor.egg-info/SOURCES.txt
RENAMED
@@ -33,6 +33,7 @@ lt_tensor/model_zoo/activations/alias_free/resample.py
 lt_tensor/model_zoo/activations/snake/__init__.py
 lt_tensor/model_zoo/audio_models/__init__.py
 lt_tensor/model_zoo/audio_models/resblocks.py
+lt_tensor/model_zoo/audio_models/bemaganv2/__init__.py
 lt_tensor/model_zoo/audio_models/bigvgan/__init__.py
 lt_tensor/model_zoo/audio_models/diffwave/__init__.py
 lt_tensor/model_zoo/audio_models/hifigan/__init__.py
{lt_tensor-0.0.1a39 → lt_tensor-0.0.1a40}/setup.py
RENAMED
@@ -4,7 +4,7 @@ with open("README.md", "r", encoding="utf-8") as f:
     long_description = f.read()
 
 setup(
-    version="0.0.1a39",
+    version="0.0.1a40",
     name="lt-tensor",
     description="General utilities for PyTorch and others. Built for general use.",
     long_description=long_description,