lt-tensor 0.0.1a35__py3-none-any.whl → 0.0.1a36__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lt_tensor/__init__.py CHANGED
@@ -1,4 +1,4 @@
- __version__ = "0.0.1a35"
+ __version__ = "0.0.1a36"
  
  from . import (
      lr_schedulers,
@@ -10,9 +10,9 @@ from lt_utils.file_ops import load_json, is_file, is_dir, is_path_valid
  class BigVGANConfig(ModelConfig):
      # Training params
      in_channels: int = 80
-     upsample_rates: List[Union[int, List[int]]] = [8, 8]
-     upsample_kernel_sizes: List[Union[int, List[int]]] = [16, 16]
-     upsample_initial_channel: int = 512
+     upsample_rates: List[Union[int, List[int]]] = [4, 4, 2, 2, 2, 2]
+     upsample_kernel_sizes: List[Union[int, List[int]]] = [8, 8, 4, 4, 4, 4]
+     upsample_initial_channel: int = 1536
      resblock_kernel_sizes: List[Union[int, List[int]]] = [3, 7, 11]
      resblock_dilation_sizes: List[Union[int, List[int]]] = [
          [1, 3, 5],
@@ -30,9 +30,9 @@ class BigVGANConfig(ModelConfig):
      def __init__(
          self,
          in_channels: int = 80,
-         upsample_rates: List[Union[int, List[int]]] = [8, 8],
-         upsample_kernel_sizes: List[Union[int, List[int]]] = [16, 16],
-         upsample_initial_channel: int = 512,
+         upsample_rates: List[Union[int, List[int]]] = [4, 4, 2, 2, 2, 2],
+         upsample_kernel_sizes: List[Union[int, List[int]]] = [8, 8, 4, 4, 4, 4],
+         upsample_initial_channel: int = 1536,
          resblock_kernel_sizes: List[Union[int, List[int]]] = [3, 7, 11],
          resblock_dilation_sizes: List[Union[int, List[int]]] = [
              [1, 3, 5],
@@ -42,8 +42,8 @@ class BigVGANConfig(ModelConfig):
          activation: Literal["snake", "snakebeta"] = "snakebeta",
          resblock_activation: Literal["snake", "snakebeta"] = "snakebeta",
          resblock: Union[int, str] = "1",
-         use_bias_at_final: bool = True,
-         use_tanh_at_final: bool = True,
+         use_bias_at_final: bool = False,
+         use_tanh_at_final: bool = False,
          *args,
          **kwargs,
      ):
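
The new BigVGAN defaults quadruple the total upsampling factor. A minimal sanity check of what the change implies, assuming (as is standard in the HiFi-GAN/BigVGAN vocoder family) that the product of upsample_rates must equal the mel-spectrogram hop length:

    import math

    old_rates = [8, 8]              # 0.0.1a35 default: 64x total upsampling
    new_rates = [4, 4, 2, 2, 2, 2]  # 0.0.1a36 default: 256x total upsampling
    assert math.prod(old_rates) == 64
    assert math.prod(new_rates) == 256  # matches the common hop_length of 256
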
@@ -69,7 +69,7 @@ class BigVGANConfig(ModelConfig):
  
  class BigVGAN(ConvNets):
      """Modified from 'https://github.com/NVIDIA/BigVGAN/blob/main/bigvgan.py' under the MIT license.
- 
+ 
      BigVGAN is a neural vocoder model that applies anti-aliased periodic activations in its residual blocks (resblocks).
      New in BigVGAN-v2: it can optionally use optimized CUDA kernels for AMP (anti-aliased multi-periodicity) blocks.
  
@@ -146,7 +146,7 @@ class BigVGAN(ConvNets):
          self.conv_post.apply(self.init_weights)
  
          # Final tanh activation. Defaults to True for backward compatibility
-         self.use_tanh_at_final = cfg.get("use_tanh_at_final", True)
+         self.use_tanh_at_final = cfg.use_tanh_at_final
  
      def forward(self, x):
          # Pre-conv
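
Note the interaction with the config change above: the final tanh flag is now read as a plain attribute rather than through cfg.get with a True fallback, and the new default is False. A minimal sketch of the resulting behavior flip for configs that never set the flag (the Cfg stand-in below is hypothetical, not the package's config class):

    # Hypothetical dict-backed config that never sets use_tanh_at_final.
    class Cfg(dict):
        use_tanh_at_final = False  # the new 0.0.1a36 default

    cfg = Cfg()
    assert cfg.get("use_tanh_at_final", True) is True  # 0.0.1a35 lookup: tanh applied
    assert cfg.use_tanh_at_final is False              # 0.0.1a36 lookup: no final tanh
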
@@ -16,15 +16,11 @@ def get_padding(kernel_size, dilation=1):
  class HifiganConfig(ModelConfig):
      # Training params
      in_channels: int = 80
-     upsample_rates: List[Union[int, List[int]]] = [8, 8]
-     upsample_kernel_sizes: List[Union[int, List[int]]] = [16, 16]
+     upsample_rates: List[Union[int, List[int]]] = [8,8,2,2]
+     upsample_kernel_sizes: List[Union[int, List[int]]] = [16,16,4,4]
      upsample_initial_channel: int = 512
      resblock_kernel_sizes: List[Union[int, List[int]]] = [3, 7, 11]
-     resblock_dilation_sizes: List[Union[int, List[int]]] = [
-         [1, 3, 5],
-         [1, 3, 5],
-         [1, 3, 5],
-     ]
+     resblock_dilation_sizes: List[Union[int, List[int]]] = [[1,3,5], [1,3,5], [1,3,5]]
  
      activation: nn.Module = nn.LeakyReLU(0.1)
      resblock_activation: nn.Module = nn.LeakyReLU(0.1)
@@ -33,10 +29,10 @@ class HifiganConfig(ModelConfig):
      def __init__(
          self,
          in_channels: int = 80,
-         upsample_rates: List[Union[int, List[int]]] = [8, 8],
-         upsample_kernel_sizes: List[Union[int, List[int]]] = [16, 16],
+         upsample_rates: List[Union[int, List[int]]] = [8,8,2,2],
+         upsample_kernel_sizes: List[Union[int, List[int]]] = [16,16,4,4],
          upsample_initial_channel: int = 512,
-         resblock_kernel_sizes: List[Union[int, List[int]]] = [3, 7, 11],
+         resblock_kernel_sizes: List[Union[int, List[int]]] = [3,7,11],
          resblock_dilation_sizes: List[Union[int, List[int]]] = [
              [1, 3, 5],
              [1, 3, 5],
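
The HiFi-GAN defaults now match the four-stage reference HiFi-GAN V1 recipe. A quick check, assuming the usual constraints that the upsample rates multiply to the hop length and that each transposed-convolution kernel is twice its rate (a common choice to reduce checkerboard artifacts):

    import math

    rates = [8, 8, 2, 2]
    kernels = [16, 16, 4, 4]
    assert math.prod(rates) == 256  # total upsampling factor = hop_length
    assert all(k == 2 * r for k, r in zip(kernels, rates))
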
@@ -0,0 +1,336 @@
+ from lt_utils.common import *
+ from lt_tensor.torch_commons import *
+ import numpy as np
+ from time import time
+ from lt_tensor.model_zoo.losses.CQT.utils import *
+ 
+ 
+ class CQT2010v2(nn.Module):
+     """This class calculates the CQT of the input signal.
+     Input signal should be in either of the following shapes.\n
+     1. ``(len_audio)``\n
+     2. ``(num_audio, len_audio)``\n
+     3. ``(num_audio, 1, len_audio)``
+ 
+     The correct shape will be inferred automatically if the input follows these 3 shapes.
+     Most of the arguments follow the convention from librosa.
+     This class inherits from ``nn.Module``; therefore, the usage is the same as ``nn.Module``.
+ 
+     This algorithm uses the resampling method proposed in [1].
+     Instead of convolving the STFT results with a gigantic CQT kernel covering the full frequency
+     spectrum, we make a small CQT kernel covering only the top octave. Then we keep downsampling the
+     input audio by a factor of 2 and convolving it with the small CQT kernel.
+     Every time the input audio is downsampled, the CQT relative to the downsampled input is equivalent
+     to the next lower octave.
+     The kernel creation process is still the same as in the 1992 algorithm; therefore, we can reuse the
+     code from the 1992 algorithm [2].
+     [1] Schörkhuber, Christian. "CONSTANT-Q TRANSFORM TOOLBOX FOR MUSIC PROCESSING." (2010).
+     [2] Brown, Judith C.C. and Miller Puckette. "An efficient algorithm for the calculation of a
+     constant Q transform." (1992).
+ 
+     The early downsampling factor downsamples the input audio to reduce the CQT kernel size.
+     The results with and without early downsampling are more or less the same, except in the very low
+     frequency region where freq < 40Hz.
+ 
+     Parameters
+     ----------
+     sr : int
+         The sampling rate for the input audio. It is used to calculate the correct ``fmin`` and ``fmax``.
+         Setting the correct sampling rate is very important for calculating the correct frequency.
+ 
+     hop_length : int
+         The hop (or stride) size. Default value is 512.
+ 
+     fmin : float
+         The frequency for the lowest CQT bin. Default is 32.70Hz, which corresponds to the note C0.
+ 
+     n_bins : int
+         The total number of CQT bins. Default is 84. Will be ignored if ``fmax`` is not ``None``.
+ 
+     bins_per_octave : int
+         Number of bins per octave. Default is 12.
+ 
+     norm : bool
+         Normalization for the CQT result.
+ 
+     basis_norm : int
+         Normalization for the CQT kernels. ``1`` means L1 normalization, and ``2`` means L2 normalization.
+         Default is ``1``, which is the same normalization used in librosa.
+ 
+     window : str
+         The windowing function for CQT. It uses ``scipy.signal.get_window``; please refer to the
+         scipy documentation for possible windowing functions. The default value is 'hann'.
+ 
+     pad_mode : str
+         The padding method. Default value is 'reflect'.
+ 
+     trainable : bool
+         Determines whether the CQT kernels are trainable. If ``True``, the gradients for the CQT kernels
+         will also be calculated and the CQT kernels will be updated during model training.
+         Default value is ``False``.
+ 
+     output_format : str
+         Determines the return type.
+         'Magnitude' will return the magnitude of the STFT result, shape = ``(num_samples, freq_bins, time_steps)``;
+         'Complex' will return the STFT result in complex numbers, shape = ``(num_samples, freq_bins, time_steps, 2)``;
+         'Phase' will return the phase of the STFT result, shape = ``(num_samples, freq_bins, time_steps, 2)``.
+         The complex number is stored as ``(real, imag)`` in the last axis. Default value is 'Magnitude'.
+ 
+     verbose : bool
+         If ``True``, it shows layer information. If ``False``, it suppresses all prints.
+ 
+     Returns
+     -------
+     spectrogram : torch.tensor
+         It returns a tensor of spectrograms.
+         shape = ``(num_samples, freq_bins, time_steps)`` if ``output_format='Magnitude'``;
+         shape = ``(num_samples, freq_bins, time_steps, 2)`` if ``output_format='Complex'`` or ``'Phase'``;
+ 
+     Examples
+     --------
+     >>> spec_layer = Spectrogram.CQT2010v2()
+     >>> specs = spec_layer(x)
+     """
+ 
+     # TODO:
+     # need to deal with the filter and other tensors
+ 
+     def __init__(
+         self,
+         sr=22050,
+         hop_length=512,
+         fmin=32.70,
+         fmax=None,
+         n_bins=84,
+         filter_scale=1,
+         bins_per_octave=12,
+         norm=True,
+         basis_norm=1,
+         pad_mode="reflect",
+         earlydownsample=True,
+         trainable=False,
+         output_format="Magnitude",
+         verbose=False,
+     ):
+ 
+         super().__init__()
+ 
+         self.norm = (
+             norm  # Now norm is used to normalize the final CQT result by dividing by n_fft
+         )
+         # basis_norm is for normalizing the basis
+         self.hop_length = hop_length
+         self.pad_mode = pad_mode
+         self.n_bins = n_bins
+         self.earlydownsample = (
+             earlydownsample  # We will activate early downsampling later if possible
+         )
+         self.trainable = trainable
+         self.output_format = output_format
+ 
+         # It will be used to calculate filter_cutoff and create the CQT kernels
+         Q = float(filter_scale) / (2 ** (1 / bins_per_octave) - 1)
+ 
+         # Create the lowpass filter and make it a torch tensor
+         if verbose:
+             print("Creating low pass filter ...", end="\r")
+         start = time()
+         # self.lowpass_filter = torch.tensor(
+         #     create_lowpass_filter(
+         #         band_center=0.50,
+         #         kernelLength=256,
+         #         transitionBandwidth=0.001))
+         lowpass_filter = torch.tensor(
+             create_lowpass_filter(
+                 band_center=0.50, kernelLength=256, transitionBandwidth=0.001
+             )
+         )
+ 
+         # Broadcast the tensor to the shape that fits conv1d
+         self.register_buffer("lowpass_filter", lowpass_filter[None, None, :])
+         if verbose:
+             print(
+                 "Low pass filter created, time used = {:.4f} seconds".format(
+                     time() - start
+                 )
+             )
+ 
+         # Calculate the number of filters required for the kernel
+         # n_octaves determines how many resampling steps the CQT requires
+         n_filters = min(bins_per_octave, n_bins)
+         self.n_octaves = int(np.ceil(float(n_bins) / bins_per_octave))
+         if verbose:
+             print("num_octave = ", self.n_octaves)
+ 
+         # Calculate the lowest frequency bin for the top octave kernel
+         self.fmin_t = fmin * 2 ** (self.n_octaves - 1)
+         remainder = n_bins % bins_per_octave
+         # print("remainder = ", remainder)
+ 
+         if remainder == 0:
+             # Calculate the top bin frequency
+             fmax_t = self.fmin_t * 2 ** ((bins_per_octave - 1) / bins_per_octave)
+         else:
+             # Calculate the top bin frequency
+             fmax_t = self.fmin_t * 2 ** ((remainder - 1) / bins_per_octave)
+ 
+         self.fmin_t = fmax_t / 2 ** (
+             1 - 1 / bins_per_octave
+         )  # Adjusting the top minimum bins
+         if fmax_t > sr / 2:
+             raise ValueError(
+                 "The top bin {}Hz has exceeded the Nyquist frequency, \
+                 please reduce the n_bins".format(
+                     fmax_t
+                 )
+             )
+ 
+         if (
+             self.earlydownsample == True
+         ):  # Do early downsampling if this argument is True
+             if verbose:
+                 print("Creating early downsampling filter ...", end="\r")
+             start = time()
+             (
+                 sr,
+                 self.hop_length,
+                 self.downsample_factor,
+                 early_downsample_filter,
+                 self.earlydownsample,
+             ) = get_early_downsample_params(
+                 sr, hop_length, fmax_t, Q, self.n_octaves, verbose
+             )
+             self.register_buffer("early_downsample_filter", early_downsample_filter)
+ 
+             if verbose:
+                 print(
+                     "Early downsampling filter created, \
+                     time used = {:.4f} seconds".format(
+                         time() - start
+                     )
+                 )
+         else:
+             self.downsample_factor = 1.0
215
+
216
+ # Preparing CQT kernels
217
+ if verbose:
218
+ print("Creating CQT kernels ...", end="\r")
219
+ start = time()
220
+ basis, self.n_fft, lenghts, _ = create_cqt_kernels(
221
+ Q,
222
+ sr,
223
+ self.fmin_t,
224
+ n_filters,
225
+ bins_per_octave,
226
+ norm=basis_norm,
227
+ topbin_check=False,
228
+ )
229
+ # For normalization in the end
230
+ # The freqs returned by create_cqt_kernels cannot be used
231
+ # Since that returns only the top octave bins
232
+ # We need the information for all freq bin
233
+ freqs = fmin * 2.0 ** (np.r_[0:n_bins] / np.double(bins_per_octave))
234
+ self.frequencies = freqs
235
+
236
+ lenghts = np.ceil(Q * sr / freqs)
237
+ lenghts = torch.tensor(lenghts).float()
238
+ self.register_buffer("lenghts", lenghts)
239
+
240
+ self.basis = basis
241
+ # These cqt_kernel is already in the frequency domain
242
+ cqt_kernels_real = torch.tensor(basis.real).unsqueeze(1)
243
+ cqt_kernels_imag = torch.tensor(basis.imag).unsqueeze(1)
244
+
245
+ if trainable:
246
+ cqt_kernels_real = nn.Parameter(cqt_kernels_real, requires_grad=trainable)
247
+ cqt_kernels_imag = nn.Parameter(cqt_kernels_imag, requires_grad=trainable)
248
+ self.register_parameter("cqt_kernels_real", cqt_kernels_real)
249
+ self.register_parameter("cqt_kernels_imag", cqt_kernels_imag)
250
+ else:
251
+ self.register_buffer("cqt_kernels_real", cqt_kernels_real)
252
+ self.register_buffer("cqt_kernels_imag", cqt_kernels_imag)
253
+
254
+ if verbose:
255
+ print(
256
+ "CQT kernels created, time used = {:.4f} seconds".format(time() - start)
257
+ )
258
+ # print("Getting cqt kernel done, n_fft = ",self.n_fft)
259
+
260
+ # If center==True, the STFT window will be put in the middle, and paddings at the beginning
261
+ # and ending are required.
262
+ if self.pad_mode == "constant":
263
+ self.padding = nn.ConstantPad1d(self.n_fft // 2, 0)
264
+ elif self.pad_mode == "reflect":
265
+ self.padding = nn.ReflectionPad1d(self.n_fft // 2)
266
+
+     def forward(self, x, output_format=None, normalization_type="librosa"):
+         """
+         Convert a batch of waveforms to CQT spectrograms.
+ 
+         Parameters
+         ----------
+         x : torch tensor
+             Input signal should be in either of the following shapes.\n
+             1. ``(len_audio)``\n
+             2. ``(num_audio, len_audio)``\n
+             3. ``(num_audio, 1, len_audio)``
+             It will be automatically broadcast to the right shape.
+         """
+         output_format = output_format or self.output_format
+ 
+         x = broadcast_dim(x)
+         if self.earlydownsample:
+             x = downsampling_by_n(
+                 x, self.early_downsample_filter, self.downsample_factor
+             )
+         hop = self.hop_length
+         CQT = get_cqt_complex(
+             x, self.cqt_kernels_real, self.cqt_kernels_imag, hop, self.padding
+         )  # Getting the top octave CQT
+ 
+         x_down = x  # Preparing a new variable for downsampling
+ 
+         for i in range(self.n_octaves - 1):
+             hop = hop // 2
+             x_down = downsampling_by_2(x_down, self.lowpass_filter)
+             CQT1 = get_cqt_complex(
+                 x_down, self.cqt_kernels_real, self.cqt_kernels_imag, hop, self.padding
+             )
+             CQT = torch.cat((CQT1, CQT), 1)
+ 
+         CQT = CQT[:, -self.n_bins :, :]  # Removing unwanted bottom bins
+         # print("downsample_factor = ", self.downsample_factor)
+         # print(CQT.shape)
+         # print(self.lenghts.view(-1, 1).shape)
+ 
+         # Normalizing the output with the downsampling factor; 2**(self.n_octaves-1)
+         # makes it the same magnitude as the 1992 algorithm
+         CQT = CQT * self.downsample_factor
+         # Normalize again to get the same result as librosa
+         if normalization_type == "librosa":
+             CQT = CQT * torch.sqrt(self.lenghts.view(-1, 1, 1))
+         elif normalization_type == "convolutional":
+             pass
+         elif normalization_type == "wrap":
+             CQT *= 2
+         else:
+             raise ValueError(
+                 "The normalization_type %r is not part of our current options."
+                 % normalization_type
+             )
+ 
+         if output_format == "Magnitude":
+             if self.trainable == False:
+                 # Getting the CQT amplitude
+                 return torch.sqrt(CQT.pow(2).sum(-1))
+             else:
+                 return torch.sqrt(CQT.pow(2).sum(-1) + 1e-8)
+ 
+         elif output_format == "Complex":
+             return CQT
+ 
+         elif output_format == "Phase":
+             phase_real = torch.cos(torch.atan2(CQT[:, :, :, 1], CQT[:, :, :, 0]))
+             phase_imag = torch.sin(torch.atan2(CQT[:, :, :, 1], CQT[:, :, :, 0]))
+             return torch.stack((phase_real, phase_imag), -1)
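
For reference, a minimal usage sketch of the new class. The import path is an assumption inferred from the file's own lt_tensor.model_zoo.losses.CQT.utils import and may differ in the released package:

    import torch
    # Assumed import path; the diff does not show where CQT2010v2 is exported.
    from lt_tensor.model_zoo.losses.CQT import CQT2010v2

    cqt = CQT2010v2(sr=22050, hop_length=512, n_bins=84, output_format="Magnitude")
    wave = torch.randn(2, 22050)  # (num_audio, len_audio): two one-second clips
    spec = cqt(wave)              # -> (num_audio, freq_bins, time_steps)
    print(spec.shape)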