lt-tensor 0.0.1a37__tar.gz → 0.0.1a39__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/PKG-INFO +1 -1
  2. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/__init__.py +1 -1
  3. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/losses.py +10 -4
  4. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/model_zoo/audio_models/diffwave/__init__.py +68 -81
  5. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/model_zoo/convs.py +25 -16
  6. lt_tensor-0.0.1a39/lt_tensor/model_zoo/losses/_envelope_disc/__init__.py +116 -0
  7. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/model_zoo/losses/discriminators.py +34 -64
  8. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/processors/audio.py +4 -2
  9. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor.egg-info/PKG-INFO +1 -1
  10. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor.egg-info/SOURCES.txt +1 -0
  11. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/setup.py +1 -1
  12. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/LICENSE +0 -0
  13. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/README.md +0 -0
  14. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/config_templates.py +0 -0
  15. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/lr_schedulers.py +0 -0
  16. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/math_ops.py +0 -0
  17. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/misc_utils.py +0 -0
  18. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/model_base.py +0 -0
  19. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/model_zoo/__init__.py +0 -0
  20. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/model_zoo/activations/__init__.py +0 -0
  21. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/model_zoo/activations/alias_free/__init__.py +0 -0
  22. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/model_zoo/activations/alias_free/act.py +0 -0
  23. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/model_zoo/activations/alias_free/filter.py +0 -0
  24. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/model_zoo/activations/alias_free/resample.py +0 -0
  25. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/model_zoo/activations/snake/__init__.py +0 -0
  26. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/model_zoo/audio_models/__init__.py +0 -0
  27. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/model_zoo/audio_models/bigvgan/__init__.py +0 -0
  28. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/model_zoo/audio_models/hifigan/__init__.py +0 -0
  29. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/model_zoo/audio_models/istft/__init__.py +0 -0
  30. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/model_zoo/audio_models/resblocks.py +0 -0
  31. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/model_zoo/basic.py +0 -0
  32. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/model_zoo/features.py +0 -0
  33. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/model_zoo/fusion.py +0 -0
  34. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/model_zoo/losses/CQT/__init__.py +0 -0
  35. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/model_zoo/losses/CQT/transforms.py +0 -0
  36. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/model_zoo/losses/CQT/utils.py +0 -0
  37. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/model_zoo/losses/__init__.py +0 -0
  38. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/model_zoo/pos_encoder.py +0 -0
  39. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/model_zoo/residual.py +0 -0
  40. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/model_zoo/transformer.py +0 -0
  41. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/monotonic_align.py +0 -0
  42. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/noise_tools.py +0 -0
  43. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/processors/__init__.py +0 -0
  44. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/torch_commons.py +0 -0
  45. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor/transform.py +0 -0
  46. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor.egg-info/dependency_links.txt +0 -0
  47. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor.egg-info/requires.txt +0 -0
  48. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/lt_tensor.egg-info/top_level.txt +0 -0
  49. {lt_tensor-0.0.1a37 → lt_tensor-0.0.1a39}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: lt-tensor
3
- Version: 0.0.1a37
3
+ Version: 0.0.1a39
4
4
  Summary: General utilities for PyTorch and others. Built for general use.
5
5
  Home-page: https://github.com/gr1336/lt-tensor/
6
6
  Author: gr1336
@@ -1,4 +1,4 @@
1
- __version__ = "0.0.1a37"
1
+ __version__ = "0.0.1a39"
2
2
 
3
3
  from . import (
4
4
  lr_schedulers,
@@ -133,7 +133,7 @@ class MultiMelScaleLoss(Model):
133
133
  loss_mel_fn: Callable[[Tensor, Tensor], Tensor] = nn.L1Loss(),
134
134
  loss_pitch_fn: Callable[[Tensor, Tensor], Tensor] = nn.L1Loss(),
135
135
  loss_rms_fn: Callable[[Tensor, Tensor], Tensor] = nn.L1Loss(),
136
- center: bool = True,
136
+ center: bool = False,
137
137
  power: float = 1.0,
138
138
  normalized: bool = False,
139
139
  pad_mode: str = "reflect",
@@ -149,6 +149,7 @@ class MultiMelScaleLoss(Model):
149
149
  lambda_rms: float = 1.0,
150
150
  lambda_pitch: float = 1.0,
151
151
  weight: float = 1.0,
152
+ mel: Literal["librosa", "torch"] = "torch",
152
153
  ):
153
154
  super().__init__()
154
155
  assert (
@@ -188,6 +189,7 @@ class MultiMelScaleLoss(Model):
188
189
  onesided,
189
190
  std,
190
191
  mean,
192
+ mel,
191
193
  )
192
194
 
193
195
  def _setup_mels(
@@ -206,6 +208,7 @@ class MultiMelScaleLoss(Model):
206
208
  onesided: Optional[bool],
207
209
  std: int,
208
210
  mean: int,
211
+ mel: str,
209
212
  ):
210
213
  assert (
211
214
  len(n_mels)
@@ -224,6 +227,7 @@ class MultiMelScaleLoss(Model):
224
227
  pad_mode=pad_mode,
225
228
  std=std,
226
229
  mean=mean,
230
+ mel_default=mel,
227
231
  )
228
232
  self.mel_spectrograms: List[AudioProcessor] = nn.ModuleList(
229
233
  [
@@ -247,12 +251,14 @@ class MultiMelScaleLoss(Model):
247
251
  def forward(
248
252
  self, input_wave: torch.Tensor, target_wave: torch.Tensor
249
253
  ) -> torch.Tensor:
250
- assert self.use_istft_norm or input_wave.shape[-1] == target_wave.shape[-1]
254
+ assert self.use_istft_norm or input_wave.shape[-1] == target_wave.shape[-1], (
255
+ f"Size mismatch! input_wave {input_wave.shape[-1]} must match target_wave: {target_wave.shape[-1]}. "
256
+ "Alternatively 'use_istft_norm' can be set to Trie with will automatically force the audio to that size."
257
+ )
251
258
  target_wave = target_wave.to(input_wave.device)
252
259
  losses = 0.0
253
260
  for M in self.mel_spectrograms:
254
- # Apply normalization if requested
255
- if self.use_istft_norm:
261
+ if self.use_istft_norm and input_proc.shape[-1] != target_proc.shape[-1]:
256
262
  input_proc = M.istft_norm(input_wave, length=target_wave.shape[-1])
257
263
  target_proc = M.istft_norm(target_wave, length=target_wave.shape[-1])
258
264
  else:
@@ -1,14 +1,15 @@
1
- __all__ = ["DiffWave", "DiffWaveConfig", "SpectrogramUpsample", "DiffusionEmbedding"]
1
+ __all__ = ["DiffWave", "DiffWaveConfig", "SpectrogramUpsampler", "DiffusionEmbedding"]
2
2
 
3
3
  import numpy as np
4
4
  from lt_tensor.torch_commons import *
5
5
  from torch.nn import functional as F
6
6
  from lt_tensor.config_templates import ModelConfig
7
7
  from lt_tensor.torch_commons import *
8
- from lt_tensor.model_zoo.convs import ConvNets, Conv1dEXT
8
+ from lt_tensor.model_zoo.convs import ConvNets, ConvEXT
9
9
  from lt_tensor.model_base import Model
10
10
  from math import sqrt
11
11
  from lt_utils.common import *
12
+ from lt_tensor.misc_utils import log_tensor
12
13
 
13
14
 
14
15
  class DiffWaveConfig(ModelConfig):
@@ -21,12 +22,8 @@ class DiffWaveConfig(ModelConfig):
21
22
  unconditional = False
22
23
  apply_norm: Optional[Literal["weight", "spectral"]] = None
23
24
  apply_norm_resblock: Optional[Literal["weight", "spectral"]] = None
24
- noise_schedule: list[int] = np.linspace(1e-4, 0.05, 50).tolist()
25
+ noise_schedule: list[int] = np.linspace(1e-4, 0.05, 25).tolist()
25
26
  # settings for auto-fixes
26
- interpolate = False
27
- interpolation_mode: Literal[
28
- "nearest", "linear", "bilinear", "bicubic", "trilinear", "area", "nearest-exact"
29
- ] = "nearest"
30
27
 
31
28
  def __init__(
32
29
  self,
@@ -37,16 +34,6 @@ class DiffWaveConfig(ModelConfig):
37
34
  dilation_cycle_length=10,
38
35
  unconditional=False,
39
36
  noise_schedule: list[int] = np.linspace(1e-4, 0.05, 50).tolist(),
40
- interpolate_cond=False,
41
- interpolation_mode: Literal[
42
- "nearest",
43
- "linear",
44
- "bilinear",
45
- "bicubic",
46
- "trilinear",
47
- "area",
48
- "nearest-exact",
49
- ] = "nearest",
50
37
  apply_norm: Optional[Literal["weight", "spectral"]] = None,
51
38
  apply_norm_resblock: Optional[Literal["weight", "spectral"]] = None,
52
39
  ):
@@ -58,8 +45,6 @@ class DiffWaveConfig(ModelConfig):
58
45
  "residual_channels": residual_channels,
59
46
  "unconditional": unconditional,
60
47
  "noise_schedule": noise_schedule,
61
- "interpolate": interpolate_cond,
62
- "interpolation_mode": interpolation_mode,
63
48
  "apply_norm": apply_norm,
64
49
  "apply_norm_resblock": apply_norm_resblock,
65
50
  }
@@ -102,19 +87,34 @@ class DiffusionEmbedding(Model):
102
87
  return table
103
88
 
104
89
 
105
- class SpectrogramUpsample(Model):
90
+ class SpectrogramUpsampler(Model):
106
91
  def __init__(self):
107
92
  super().__init__()
108
- self.conv1 = nn.ConvTranspose2d(1, 1, [3, 32], stride=[1, 16], padding=[1, 8])
109
- self.conv2 = nn.ConvTranspose2d(1, 1, [3, 32], stride=[1, 16], padding=[1, 8])
110
- self.activation = nn.LeakyReLU(0.4)
93
+ self.conv_net = nn.Sequential(
94
+ ConvEXT(
95
+ 1,
96
+ 1,
97
+ [3, 32],
98
+ stride=[1, 16],
99
+ padding=[1, 8],
100
+ module_type="2d",
101
+ transpose=True,
102
+ ),
103
+ nn.LeakyReLU(0.1),
104
+ ConvEXT(
105
+ 1,
106
+ 1,
107
+ [3, 32],
108
+ stride=[1, 16],
109
+ padding=[1, 8],
110
+ module_type="2d",
111
+ transpose=True,
112
+ ),
113
+ nn.LeakyReLU(0.1),
114
+ )
111
115
 
112
- def forward(self, x):
113
- x = torch.unsqueeze(x, 1)
114
- x = self.activation(self.conv1(x))
115
- x = self.activation(self.conv2(x))
116
- x = torch.squeeze(x, 1)
117
- return x
116
+ def forward(self, x: Tensor):
117
+ return self.conv_net(x.unsqueeze(0)).squeeze(1)
118
118
 
119
119
 
120
120
  class ResidualBlock(Model):
@@ -133,7 +133,7 @@ class ResidualBlock(Model):
133
133
  :param uncond: disable spectrogram conditional
134
134
  """
135
135
  super().__init__()
136
- self.dilated_conv = Conv1dEXT(
136
+ self.dilated_conv = ConvEXT(
137
137
  residual_channels,
138
138
  2 * residual_channels,
139
139
  3,
@@ -142,18 +142,18 @@ class ResidualBlock(Model):
142
142
  apply_norm=apply_norm,
143
143
  )
144
144
  self.diffusion_projection = nn.Linear(512, residual_channels)
145
- if not uncond: # conditional model
146
- self.conditioner_projection = Conv1dEXT(
145
+ self.uncoditional = uncond
146
+ self.conditioner_projection = None
147
+ if not uncond:
148
+ self.conditioner_projection = ConvEXT(
147
149
  n_mels,
148
150
  2 * residual_channels,
149
151
  1,
150
152
  apply_norm=apply_norm,
151
153
  )
152
- else: # unconditional model
153
- self.conditioner_projection = None
154
154
 
155
- self.output_projection = Conv1dEXT(
156
- residual_channels, 2 * residual_channels, 1, apply_norm == apply_norm
155
+ self.output_projection = ConvEXT(
156
+ residual_channels, 2 * residual_channels, 1, apply_norm=apply_norm
157
157
  )
158
158
 
159
159
  def forward(
@@ -164,20 +164,15 @@ class ResidualBlock(Model):
164
164
  ):
165
165
 
166
166
  diffusion_step = self.diffusion_projection(diffusion_step).unsqueeze(-1)
167
- y = x + diffusion_step
168
- if (
169
- conditioner is None or self.conditioner_projection is None
170
- ): # using a unconditional model
171
- y = self.dilated_conv(y)
172
- else:
173
- conditioner = self.conditioner_projection(conditioner)
174
- y = self.dilated_conv(y) + conditioner
175
-
176
- gate, filter = torch.chunk(y, 2, dim=1)
177
- y = torch.sigmoid(gate) * torch.tanh(filter)
167
+ y = (x + diffusion_step).squeeze(1)
168
+ y = self.dilated_conv(y)
169
+ if not self.uncoditional and conditioner is not None:
170
+ y = y + self.conditioner_projection(conditioner)
178
171
 
172
+ gate, _filter = y.chunk(2, dim=1)
173
+ y = gate.sigmoid() * _filter.tanh()
179
174
  y = self.output_projection(y)
180
- residual, skip = torch.chunk(y, 2, dim=1)
175
+ residual, skip = y.chunk(2, dim=1)
181
176
  return (x + residual) / sqrt(2.0), skip
182
177
 
183
178
 
@@ -186,19 +181,17 @@ class DiffWave(Model):
186
181
  super().__init__()
187
182
  self.params = params
188
183
  self.n_hop = self.params.hop_samples
189
- self.interpolate = self.params.interpolate
190
- self.interpolate_mode = self.params.interpolation_mode
191
- self.input_projection = Conv1dEXT(
184
+ self.input_projection = ConvEXT(
192
185
  in_channels=1,
193
186
  out_channels=params.residual_channels,
194
187
  kernel_size=1,
195
188
  apply_norm=self.params.apply_norm,
189
+ activation_out=nn.LeakyReLU(0.1),
196
190
  )
197
191
  self.diffusion_embedding = DiffusionEmbedding(len(params.noise_schedule))
198
- if self.params.unconditional: # use unconditional model
199
- self.spectrogram_upsample = None
200
- else:
201
- self.spectrogram_upsample = SpectrogramUpsample()
192
+ self.spectrogram_upsampler = (
193
+ SpectrogramUpsampler() if not self.params.unconditional else None
194
+ )
202
195
 
203
196
  self.residual_layers = nn.ModuleList(
204
197
  [
@@ -212,18 +205,18 @@ class DiffWave(Model):
212
205
  for i in range(params.residual_layers)
213
206
  ]
214
207
  )
215
- self.skip_projection = Conv1dEXT(
208
+ self.skip_projection = ConvEXT(
216
209
  in_channels=params.residual_channels,
217
210
  out_channels=params.residual_channels,
218
211
  kernel_size=1,
219
212
  apply_norm=self.params.apply_norm,
213
+ activation_out=nn.LeakyReLU(0.1),
220
214
  )
221
- self.output_projection = Conv1dEXT(
222
- params.residual_channels, 1, 1, apply_norm=self.params.apply_norm
215
+ self.output_projection = ConvEXT(
216
+ params.residual_channels, 1, 1, apply_norm=self.params.apply_norm, init_weights=True,
223
217
  )
224
218
  self.activation = nn.LeakyReLU(0.1)
225
- self.r_sqrt = sqrt(len(self.residual_layers))
226
- nn.init.zeros_(self.output_projection.weight)
219
+ self._res_d = sqrt(len(self.residual_layers))
227
220
 
228
221
  def forward(
229
222
  self,
@@ -231,31 +224,25 @@ class DiffWave(Model):
231
224
  diffusion_step: Tensor,
232
225
  spectrogram: Optional[Tensor] = None,
233
226
  ):
234
- T = x.shape[-1]
235
- if x.ndim == 2:
236
- x = audio.unsqueeze(1)
237
- x = self.activation(self.input_projection(x))
227
+ if not self.params.unconditional:
228
+ assert spectrogram is not None
229
+ if audio.ndim < 3:
230
+ if audio.ndim == 2:
231
+ audio = audio.unsqueeze(1)
232
+ else:
233
+ audio = audio.unsqueeze(0).unsqueeze(0)
238
234
 
235
+ x = self.input_projection(audio)
239
236
  diffusion_step = self.diffusion_embedding(diffusion_step)
240
- if spectrogram is not None and self.spectrogram_upsample is not None:
241
- if self.auto_interpolate:
242
- # a little heavy, but helps a lot to fix mismatched shapes,
243
- # not always recommended due to data loss
244
- spectrogram = F.interpolate(
245
- input=spectrogram,
246
- size=int(T * self.n_hop),
247
- mode=self.interpolate_mode,
248
- )
249
- spectrogram = self.spectrogram_upsample(spectrogram)
237
+ if not self.params.unconditional: # use conditional model
238
+ spectrogram = self.spectrogram_upsampler(spectrogram)
250
239
 
251
- skip = None
240
+ skip = torch.zeros_like(x, device=x.device)
252
241
  for i, layer in enumerate(self.residual_layers):
253
242
  x, skip_connection = layer(x, diffusion_step, spectrogram)
254
- if i == 0:
255
- skip = skip_connection
256
- else:
257
- skip = skip_connection + skip
258
- x = skip / self.r_sqrt
259
- x = self.activation(self.skip_projection(x))
243
+ skip += skip_connection
244
+
245
+ x = skip / self._res_d
246
+ x = self.skip_projection(x)
260
247
  x = self.output_projection(x)
261
248
  return x
@@ -1,4 +1,4 @@
1
- __all__ = ["ConvNets", "Conv1dEXT"]
1
+ __all__ = ["ConvNets", "ConvEXT"]
2
2
  import math
3
3
  from lt_utils.common import *
4
4
  import torch.nn.functional as F
@@ -6,6 +6,7 @@ from lt_tensor.torch_commons import *
6
6
  from lt_tensor.model_base import Model
7
7
  from lt_tensor.misc_utils import log_tensor
8
8
  from lt_tensor.model_zoo.fusion import AdaFusion1D, AdaIN1D
9
+ from lt_utils.misc_utils import default
9
10
 
10
11
 
11
12
  def spectral_norm_select(module: nn.Module, enabled: bool):
@@ -52,10 +53,7 @@ class ConvNets(Model):
52
53
  m.weight.data.normal_(mean, std)
53
54
 
54
55
 
55
- class Conv1dEXT(ConvNets):
56
-
57
- # TODO: Use this module to replace all that are using normalizations, mostly those in `audio_models`
58
-
56
+ class ConvEXT(ConvNets):
59
57
  def __init__(
60
58
  self,
61
59
  in_channels: int,
@@ -72,6 +70,10 @@ class Conv1dEXT(ConvNets):
72
70
  apply_norm: Optional[Literal["weight", "spectral"]] = None,
73
71
  activation_in: nn.Module = nn.Identity(),
74
72
  activation_out: nn.Module = nn.Identity(),
73
+ module_type: Literal["1d", "2d", "3d"] = "1d",
74
+ transpose: bool = False,
75
+ weight_init: Optional[Callable[[nn.Module], None]] = None,
76
+ init_weights: bool = True,
75
77
  *args,
76
78
  **kwargs,
77
79
  ):
@@ -91,23 +93,30 @@ class Conv1dEXT(ConvNets):
91
93
  device=device,
92
94
  dtype=dtype,
93
95
  )
96
+ match module_type.lower():
97
+ case "1d":
98
+ md = nn.Conv1d if not transpose else nn.ConvTranspose1d
99
+ case "2d":
100
+ md = nn.Conv2d if not transpose else nn.ConvTranspose2d
101
+ case "3d":
102
+ md = nn.Conv3d if not transpose else nn.ConvTranspose3d
103
+ case _:
104
+ raise ValueError(
105
+ f"module_type {module_type} is not a valid module type! use '1d', '2d' or '3d'"
106
+ )
107
+
94
108
  if apply_norm is None:
95
- self.cnn = nn.Conv1d(**cnn_kwargs)
96
- self.has_wn = False
109
+ self.cnn = md(**cnn_kwargs)
97
110
  else:
98
- self.has_wn = True
99
111
  if apply_norm == "spectral":
100
- self.cnn = spectral_norm(nn.Conv1d(**cnn_kwargs))
112
+ self.cnn = spectral_norm(md(**cnn_kwargs))
101
113
  else:
102
- self.cnn = weight_norm(nn.Conv1d(**cnn_kwargs))
114
+ self.cnn = weight_norm(md(**cnn_kwargs))
103
115
  self.actv_in = activation_in
104
116
  self.actv_out = activation_out
105
- self.cnn.apply(self.init_weights)
117
+ if init_weights:
118
+ weight_init = default(weight_init, self.init_weights)
119
+ self.cnn.apply(weight_init)
106
120
 
107
121
  def forward(self, input: Tensor):
108
122
  return self.actv_out(self.cnn(self.actv_in(input)))
109
-
110
- def remove_norms(self, name="weight"):
111
- if self.has_wn:
112
- remove_norm(self.cnn, name)
113
- self.has_wn = False
@@ -0,0 +1,116 @@
1
+ """ Modified from: https://github.com/dinhoitt/BemaGANv2/blob/9560ae9df153c956f259c261c57c4f84f89e3d72/envelope.py
2
+ MIT License
3
+
4
+ Copyright (c) 2025 Taseoo Park
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be included in all
14
+ copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ SOFTWARE.
23
+ """
24
+ from lt_utils.common import *
25
+ from lt_tensor.torch_commons import *
26
+ from lt_tensor.model_base import Model
27
+
28
+
29
+ class Envelope(Model):
30
+ def __init__(self, max_freq: int, sample_rate: Number = 24000, cut_off: int = 0):
31
+ super().__init__()
32
+ self.sr = sample_rate
33
+ self.max_freq = max_freq
34
+ self.setup_low_pass_fn(max_freq, cut_off)
35
+
36
+ def forward(self, x: torch.Tensor):
37
+ if not self.max_freq:
38
+ return x
39
+ return self.lp_fn(x)
40
+
41
+ def _ft_signal(self, signal: torch.Tensor):
42
+ filtered_signal = self.butterwort_lowpass_filter(signal)
43
+ return torch.abs(self.hilbert(filtered_signal))
44
+
45
+ def setup_low_pass_fn(self, max_freq: int, cutoff: int = 0):
46
+ self.max_freq = int(max_freq)
47
+ cutoff = self.max_freq if cutoff == 0 else cutoff
48
+ self.lp_fn = self.hilbert if self.max_freq in [-1, 1] else self._ft_signal
49
+ self.setup_butterwort_lowpass_coefficients(cutoff)
50
+
51
+ def hilbert(self, signal: Tensor) -> Tensor:
52
+ """Implementing the Hilbert transform manually"""
53
+ N = signal.shape[2] # Signal length
54
+ FFT_signal = torch.fft.fft(signal, axis=2)
55
+ h = torch.zeros_like(
56
+ signal
57
+ ) # Generate an array with the same shape as the signal
58
+
59
+ if N % 2 == 0:
60
+ h[:, 0, 0] = 1
61
+ h[:, 0, N // 2] = 1
62
+ h[:, 0, 1 : N // 2] = 2
63
+ else:
64
+ h[:, 0, 0] = 1
65
+ h[:, 0, 1 : (N + 1) // 2] = 2
66
+
67
+ out: Tensor = torch.fft.ifft(FFT_signal * h, axis=2)
68
+ if self.max_freq == -1:
69
+ return -out.abs()
70
+ return -out.abs()
71
+
72
+ def butterwort_lowpass_filter(self, signal):
73
+ filtered_signal = torch.zeros_like(signal)
74
+ # Applying the filter to the signal
75
+ for n in range(len(signal)):
76
+ if n < 2:
77
+ filtered_signal[n] = self.lp_coef_a[0] * signal[n]
78
+ else:
79
+ filtered_signal[n] = (
80
+ self.lp_coef_b[0] * signal[n]
81
+ + self.lp_coef_b[1] * signal[n - 1]
82
+ + self.lp_coef_b[2] * signal[n - 2]
83
+ - self.lp_coef_a[1] * filtered_signal[n - 1]
84
+ - self.lp_coef_a[2] * filtered_signal[n - 2]
85
+ )
86
+
87
+ return filtered_signal
88
+
89
+ def setup_butterwort_lowpass_coefficients(self, cutoff: int):
90
+ cutoff = torch.tensor([cutoff], dtype=torch.float64)
91
+ fs = torch.tensor([self.sr], dtype=torch.float64)
92
+
93
+ omega = torch.tan(torch.pi * cutoff / fs)
94
+ # Convert float 2 to tensor
95
+ sqrt2 = torch.tensor(2.0, dtype=torch.float64).sqrt()
96
+
97
+ sq_omega = sqrt2 * omega + omega**2
98
+ # Transfer function coefficients using the bilinear transform
99
+ a = 2 * (omega**2 - 1) / (1 + sq_omega)
100
+ self.register_buffer(
101
+ "lp_coef_a",
102
+ torch.tensor(
103
+ [1.0, a.item(), ((1 - sq_omega) / (1 + sq_omega)).item()],
104
+ dtype=torch.float64,
105
+ device=self.device,
106
+ ),
107
+ )
108
+ b = omega**2 / (1 + sq_omega)
109
+ self.register_buffer(
110
+ "lp_coef_b",
111
+ torch.tensor(
112
+ [b.item(), (2 * b).item(), b.item()],
113
+ dtype=torch.float64,
114
+ device=self.device,
115
+ ),
116
+ )
@@ -7,6 +7,7 @@ from lt_tensor.model_base import Model
7
7
  from lt_tensor.model_zoo.convs import ConvNets
8
8
  from torch.nn import functional as F
9
9
  from torchaudio import transforms as T
10
+ from lt_tensor.model_zoo.losses._envelope_disc import Envelope
10
11
 
11
12
  MULTI_DISC_OUT_TYPE: TypeAlias = Tuple[
12
13
  List[Tensor],
@@ -313,7 +314,7 @@ class DiscriminatorS(ConvNets):
313
314
  return x.flatten(1, -1), fmap
314
315
 
315
316
 
316
- class MultiScaleDiscriminator(ConvNets):
317
+ class MultiScaleDiscriminator(_MultiDiscriminatorT):
317
318
  def __init__(
318
319
  self,
319
320
  discriminator_channel_multi: Number = 1,
@@ -352,102 +353,71 @@ class MultiScaleDiscriminator(ConvNets):
352
353
  return y_d_rs, y_d_gs, fmap_rs, fmap_gs
353
354
 
354
355
 
355
- class EnvelopeExtractor(Model):
356
- """Extracts the amplitude envelope of the audio signal."""
356
+ class DiscriminatorE(ConvNets):
357
+ """Modified from: https://github.com/dinhoitt/BemaGANv2/blob/9560ae9df153c956f259c261c57c4f84f89e3d72/models.py"""
357
358
 
358
- def __init__(self, kernel_size=101):
359
- super().__init__()
360
- # Lowpass filter for smoothing envelope (moving average)
361
- self.kernel_size = kernel_size
362
- self.register_buffer("kernel", torch.ones(1, 1, kernel_size) / kernel_size)
363
-
364
- def forward(self, x: Tensor):
365
- # x: (B, 1, T) -> abs(x)
366
- envelope = torch.abs(x)
367
- # Apply low-pass smoothing (via conv1d)
368
- envelope = F.pad(
369
- envelope, (self.kernel_size // 2, self.kernel_size // 2), mode="reflect"
370
- )
371
- envelope = F.conv1d(envelope, self.kernel)
372
- return envelope
373
-
374
-
375
- class DiscriminatorEnvelope(ConvNets):
376
359
  def __init__(
377
360
  self,
378
- use_spectral_norm=False,
361
+ max_freq: int,
379
362
  discriminator_channel_multi: Number = 1,
380
- kernel_size: int = 101,
363
+ sample_rate: int = 24000,
364
+ use_spectral_norm: bool = False,
381
365
  ):
366
+
382
367
  super().__init__()
383
- norm_f = weight_norm if not use_spectral_norm else spectral_norm
384
- self.extractor = EnvelopeExtractor(kernel_size=kernel_size)
368
+ self.max_freq = max_freq
369
+ norm_f = spectral_norm if use_spectral_norm else weight_norm
385
370
  dsc = lambda x: int(x * discriminator_channel_multi)
386
371
  self.convs = nn.ModuleList(
387
372
  [
388
- norm_f(nn.Conv1d(1, dsc(64), 15, stride=1, padding=7)),
389
- norm_f(
390
- nn.Conv1d(dsc(64), dsc(128), 41, stride=2, groups=4, padding=20)
391
- ),
392
- norm_f(
393
- nn.Conv1d(dsc(128), dsc(256), 41, stride=2, groups=16, padding=20)
394
- ),
395
- norm_f(
396
- nn.Conv1d(dsc(256), dsc(512), 41, stride=4, groups=16, padding=20)
397
- ),
398
- norm_f(
399
- nn.Conv1d(dsc(512), dsc(512), 41, stride=4, groups=16, padding=20)
400
- ),
401
- norm_f(nn.Conv1d(dsc(512), dsc(512), 5, stride=1, padding=2)),
373
+ norm_f(nn.Conv1d(1, dsc(128), 15, 1, padding=7)),
374
+ norm_f(nn.Conv1d(dsc(128), dsc(128), 41, 2, groups=4, padding=20)),
375
+ norm_f(nn.Conv1d(dsc(128), dsc(256), 41, 2, groups=16, padding=20)),
376
+ norm_f(nn.Conv1d(dsc(256), dsc(512), 41, 4, groups=16, padding=20)),
377
+ norm_f(nn.Conv1d(dsc(512), dsc(1024), 41, 4, groups=16, padding=20)),
378
+ norm_f(nn.Conv1d(dsc(1024), dsc(1024), 41, 1, groups=16, padding=20)),
379
+ norm_f(nn.Conv1d(dsc(1024), dsc(1024), 5, 1, padding=2)),
402
380
  ]
403
381
  )
404
- self.conv_post = norm_f(nn.Conv1d(dsc(512), 1, 3, stride=1, padding=1))
382
+ self.conv_post = norm_f(nn.Conv1d(dsc(1024), 1, 3, 1, padding=1))
383
+ self.envelope = Envelope(max_freq=self.max_freq, sample_rate=sample_rate)
405
384
  self.activation = nn.LeakyReLU(0.1)
406
385
 
407
- def forward(self, x):
408
- # Input: raw audio (B, 1, T)
409
- x = self.extractor(x)
386
+ def forward(self, x: Tensor):
410
387
  fmap = []
411
- for layer in self.convs:
412
- x = self.activation(layer(x))
388
+ for l in self.convs:
389
+ x = self.envelope(x)
390
+ x = self.activation(l(x))
413
391
  fmap.append(x)
414
392
  x = self.conv_post(x)
415
393
  fmap.append(x)
416
- return x.flatten(1), fmap
394
+ return x.flatten(start_dim=1, end_dim=-1), fmap
417
395
 
418
396
 
419
397
  class MultiEnvelopeDiscriminator(_MultiDiscriminatorT):
398
+ """Modified from: https://github.com/dinhoitt/BemaGANv2/blob/9560ae9df153c956f259c261c57c4f84f89e3d72/models.py"""
399
+
420
400
  def __init__(
421
401
  self,
422
- use_spectral_norm: bool = False,
423
402
  discriminator_channel_multi: Number = 1,
424
403
  ):
425
404
  super().__init__()
405
+ f_times_values = [-1, 0, 1, 300, 500]
426
406
  self.discriminators = nn.ModuleList(
427
- [
428
- DiscriminatorEnvelope(
429
- use_spectral_norm, discriminator_channel_multi
430
- ), # raw envelope
431
- DiscriminatorEnvelope(use_spectral_norm), # downsampled once
432
- DiscriminatorEnvelope(use_spectral_norm), # downsampled twice
433
- ]
434
- )
435
- self.meanpools = nn.ModuleList(
436
- [nn.AvgPool1d(4, 2, padding=2), nn.AvgPool1d(4, 2, padding=2)]
407
+ [DiscriminatorE(f, discriminator_channel_multi) for f in f_times_values]
437
408
  )
438
409
 
439
410
  def forward(self, y, y_hat):
440
- y_d_rs, y_d_gs = [], []
441
- fmap_rs, fmap_gs = [], []
442
- for i, d in enumerate(self.discriminators):
443
- if i != 0:
444
- y = self.meanpools[i - 1](y)
445
- y_hat = self.meanpools[i - 1](y_hat)
411
+ y_d_rs = []
412
+ y_d_gs = []
413
+ fmap_rs = []
414
+ fmap_gs = []
415
+ for d in self.discriminators:
446
416
  y_d_r, fmap_r = d(y)
447
417
  y_d_g, fmap_g = d(y_hat)
448
418
  y_d_rs.append(y_d_r)
449
- y_d_gs.append(y_d_g)
450
419
  fmap_rs.append(fmap_r)
420
+ y_d_gs.append(y_d_g)
451
421
  fmap_gs.append(fmap_g)
452
422
 
453
423
  return y_d_rs, y_d_gs, fmap_rs, fmap_gs
@@ -77,7 +77,7 @@ class AudioProcessorConfig(ModelConfig):
77
77
  def post_process(self):
78
78
  self.n_stft = self.n_fft // 2 + 1
79
79
  # some functions needs this to be a non-zero or not None value.
80
- self.f_min = max(self.f_min, (self.sample_rate / (self.n_fft - 1)) * 2)
80
+ self.default_f_min = max(self.f_min, (self.sample_rate / (self.n_fft - 1)) * 2)
81
81
  self.default_f_max = min(
82
82
  default(self.f_max, self.sample_rate // 2), self.sample_rate // 2
83
83
  )
@@ -202,6 +202,8 @@ class AudioProcessor(Model):
202
202
  *args,
203
203
  **kwargs,
204
204
  ):
205
+ if wave.ndim == 1:
206
+ wave = wave.unsqueeze(0)
205
207
  wave = torch.nn.functional.pad(
206
208
  wave.unsqueeze(1),
207
209
  (self.mel_lib_padding, self.mel_lib_padding),
@@ -352,7 +354,7 @@ class AudioProcessor(Model):
352
354
  sr = default(sr, self.cfg.sample_rate)
353
355
  frame_length = default(frame_length, self.cfg.n_fft)
354
356
  fmin = max(
355
- default(fmin, self.cfg.f_min), self.calc_pitch_fmin(sr, frame_length)
357
+ default(fmin, self.cfg.default_f_min), self.calc_pitch_fmin(sr, frame_length)
356
358
  )
357
359
  fmax = min(max(default(fmax, self.cfg.default_f_max), fmin + 1), sr // 2)
358
360
  hop_length = default(hop_length, self.cfg.hop_length)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: lt-tensor
3
- Version: 0.0.1a37
3
+ Version: 0.0.1a39
4
4
  Summary: General utilities for PyTorch and others. Built for general use.
5
5
  Home-page: https://github.com/gr1336/lt-tensor/
6
6
  Author: gr1336
@@ -42,5 +42,6 @@ lt_tensor/model_zoo/losses/discriminators.py
42
42
  lt_tensor/model_zoo/losses/CQT/__init__.py
43
43
  lt_tensor/model_zoo/losses/CQT/transforms.py
44
44
  lt_tensor/model_zoo/losses/CQT/utils.py
45
+ lt_tensor/model_zoo/losses/_envelope_disc/__init__.py
45
46
  lt_tensor/processors/__init__.py
46
47
  lt_tensor/processors/audio.py
@@ -4,7 +4,7 @@ with open("README.md", "r", encoding="utf-8") as f:
4
4
  long_description = f.read()
5
5
 
6
6
  setup(
7
- version="0.0.1a37",
7
+ version="0.0.1a39",
8
8
  name="lt-tensor",
9
9
  description="General utilities for PyTorch and others. Built for general use.",
10
10
  long_description=long_description,
File without changes
File without changes
File without changes