lt-tensor 0.0.1a38__py3-none-any.whl → 0.0.1a40__py3-none-any.whl
This diff compares publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
- lt_tensor/__init__.py +1 -1
- lt_tensor/model_zoo/audio_models/bemaganv2/__init__.py +205 -0
- lt_tensor/model_zoo/audio_models/bigvgan/__init__.py +14 -39
- lt_tensor/model_zoo/audio_models/diffwave/__init__.py +20 -19
- lt_tensor/model_zoo/audio_models/hifigan/__init__.py +24 -44
- lt_tensor/model_zoo/audio_models/istft/__init__.py +15 -39
- lt_tensor/model_zoo/convs.py +35 -4
- lt_tensor/model_zoo/losses/_envelope_disc/__init__.py +116 -0
- lt_tensor/model_zoo/losses/discriminators.py +34 -64
- lt_tensor/noise_tools.py +22 -13
- lt_tensor/processors/audio.py +116 -62
- {lt_tensor-0.0.1a38.dist-info → lt_tensor-0.0.1a40.dist-info}/METADATA +1 -1
- {lt_tensor-0.0.1a38.dist-info → lt_tensor-0.0.1a40.dist-info}/RECORD +16 -14
- {lt_tensor-0.0.1a38.dist-info → lt_tensor-0.0.1a40.dist-info}/WHEEL +0 -0
- {lt_tensor-0.0.1a38.dist-info → lt_tensor-0.0.1a40.dist-info}/licenses/LICENSE +0 -0
- {lt_tensor-0.0.1a38.dist-info → lt_tensor-0.0.1a40.dist-info}/top_level.txt +0 -0
lt_tensor/model_zoo/losses/_envelope_disc/__init__.py
ADDED
@@ -0,0 +1,116 @@
+""" Modified from: https://github.com/dinhoitt/BemaGANv2/blob/9560ae9df153c956f259c261c57c4f84f89e3d72/envelope.py
+MIT License
+
+Copyright (c) 2025 Taseoo Park
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+from lt_utils.common import *
+from lt_tensor.torch_commons import *
+from lt_tensor.model_base import Model
+
+
+class Envelope(Model):
+    def __init__(self, max_freq: int, sample_rate: Number = 24000, cut_off: int = 0):
+        super().__init__()
+        self.sr = sample_rate
+        self.max_freq = max_freq
+        self.setup_low_pass_fn(max_freq, cut_off)
+
+    def forward(self, x: torch.Tensor):
+        if not self.max_freq:
+            return x
+        return self.lp_fn(x)
+
+    def _ft_signal(self, signal: torch.Tensor):
+        filtered_signal = self.butterwort_lowpass_filter(signal)
+        return torch.abs(self.hilbert(filtered_signal))
+
+    def setup_low_pass_fn(self, max_freq: int, cutoff: int = 0):
+        self.max_freq = int(max_freq)
+        cutoff = self.max_freq if cutoff == 0 else cutoff
+        self.lp_fn = self.hilbert if self.max_freq in [-1, 1] else self._ft_signal
+        self.setup_butterwort_lowpass_coefficients(cutoff)
+
+    def hilbert(self, signal: Tensor) -> Tensor:
+        """Implementing the Hilbert transform manually"""
+        N = signal.shape[2]  # Signal length
+        FFT_signal = torch.fft.fft(signal, axis=2)
+        h = torch.zeros_like(
+            signal
+        )  # Generate an array with the same shape as the signal
+
+        if N % 2 == 0:
+            h[:, 0, 0] = 1
+            h[:, 0, N // 2] = 1
+            h[:, 0, 1 : N // 2] = 2
+        else:
+            h[:, 0, 0] = 1
+            h[:, 0, 1 : (N + 1) // 2] = 2
+
+        out: Tensor = torch.fft.ifft(FFT_signal * h, axis=2)
+        if self.max_freq == -1:
+            return -out.abs()
+        return -out.abs()
+
+    def butterwort_lowpass_filter(self, signal):
+        filtered_signal = torch.zeros_like(signal)
+        # Applying the filter to the signal
+        for n in range(len(signal)):
+            if n < 2:
+                filtered_signal[n] = self.lp_coef_a[0] * signal[n]
+            else:
+                filtered_signal[n] = (
+                    self.lp_coef_b[0] * signal[n]
+                    + self.lp_coef_b[1] * signal[n - 1]
+                    + self.lp_coef_b[2] * signal[n - 2]
+                    - self.lp_coef_a[1] * filtered_signal[n - 1]
+                    - self.lp_coef_a[2] * filtered_signal[n - 2]
+                )
+
+        return filtered_signal
+
+    def setup_butterwort_lowpass_coefficients(self, cutoff: int):
+        cutoff = torch.tensor([cutoff], dtype=torch.float64)
+        fs = torch.tensor([self.sr], dtype=torch.float64)
+
+        omega = torch.tan(torch.pi * cutoff / fs)
+        # Convert float 2 to tensor
+        sqrt2 = torch.tensor(2.0, dtype=torch.float64).sqrt()
+
+        sq_omega = sqrt2 * omega + omega**2
+        # Transfer function coefficients using the bilinear transform
+        a = 2 * (omega**2 - 1) / (1 + sq_omega)
+        self.register_buffer(
+            "lp_coef_a",
+            torch.tensor(
+                [1.0, a.item(), ((1 - sq_omega) / (1 + sq_omega)).item()],
+                dtype=torch.float64,
+                device=self.device,
+            ),
+        )
+        b = omega**2 / (1 + sq_omega)
+        self.register_buffer(
+            "lp_coef_b",
+            torch.tensor(
+                [b.item(), (2 * b).item(), b.item()],
+                dtype=torch.float64,
+                device=self.device,
+            ),
+        )
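For orientation, a minimal usage sketch of the new Envelope module (not part of the diff; it assumes mono waveforms shaped (B, 1, T), which is the layout hilbert() indexes, and that the lt_tensor/lt_utils dependencies are installed):

import torch
from lt_tensor.model_zoo.losses._envelope_disc import Envelope

wave = torch.randn(2, 1, 24000)                          # hypothetical 1-second mono batch
env_hilbert = Envelope(max_freq=1)                       # max_freq in {-1, 1}: Hilbert envelope only
env_lowpass = Envelope(max_freq=300, sample_rate=24000)  # Butterworth low-pass, then Hilbert envelope
identity = Envelope(max_freq=0)                          # max_freq=0: forward returns the input unchanged

out = env_hilbert(wave)        # same shape as the input
passthrough = identity(wave)   # identical to wave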
lt_tensor/model_zoo/losses/discriminators.py
CHANGED
@@ -7,6 +7,7 @@ from lt_tensor.model_base import Model
 from lt_tensor.model_zoo.convs import ConvNets
 from torch.nn import functional as F
 from torchaudio import transforms as T
+from lt_tensor.model_zoo.losses._envelope_disc import Envelope
 
 MULTI_DISC_OUT_TYPE: TypeAlias = Tuple[
     List[Tensor],
@@ -313,7 +314,7 @@ class DiscriminatorS(ConvNets):
         return x.flatten(1, -1), fmap
 
 
-class MultiScaleDiscriminator(ConvNets):
+class MultiScaleDiscriminator(_MultiDiscriminatorT):
     def __init__(
         self,
         discriminator_channel_multi: Number = 1,
@@ -352,102 +353,71 @@ class MultiScaleDiscriminator(ConvNets):
         return y_d_rs, y_d_gs, fmap_rs, fmap_gs
 
 
-class
-    """
+class DiscriminatorE(ConvNets):
+    """Modified from: https://github.com/dinhoitt/BemaGANv2/blob/9560ae9df153c956f259c261c57c4f84f89e3d72/models.py"""
 
-    def __init__(self, kernel_size=101):
-        super().__init__()
-        # Lowpass filter for smoothing envelope (moving average)
-        self.kernel_size = kernel_size
-        self.register_buffer("kernel", torch.ones(1, 1, kernel_size) / kernel_size)
-
-    def forward(self, x: Tensor):
-        # x: (B, 1, T) -> abs(x)
-        envelope = torch.abs(x)
-        # Apply low-pass smoothing (via conv1d)
-        envelope = F.pad(
-            envelope, (self.kernel_size // 2, self.kernel_size // 2), mode="reflect"
-        )
-        envelope = F.conv1d(envelope, self.kernel)
-        return envelope
-
-
-class DiscriminatorEnvelope(ConvNets):
     def __init__(
         self,
-
+        max_freq: int,
         discriminator_channel_multi: Number = 1,
-
+        sample_rate: int = 24000,
+        use_spectral_norm: bool = False,
     ):
+
         super().__init__()
-
-
+        self.max_freq = max_freq
+        norm_f = spectral_norm if use_spectral_norm else weight_norm
         dsc = lambda x: int(x * discriminator_channel_multi)
         self.convs = nn.ModuleList(
             [
-                norm_f(nn.Conv1d(1, dsc(
-                norm_f(
-
-                ),
-                norm_f(
-
-                ),
-                norm_f(
-                    nn.Conv1d(dsc(256), dsc(512), 41, stride=4, groups=16, padding=20)
-                ),
-                norm_f(
-                    nn.Conv1d(dsc(512), dsc(512), 41, stride=4, groups=16, padding=20)
-                ),
-                norm_f(nn.Conv1d(dsc(512), dsc(512), 5, stride=1, padding=2)),
+                norm_f(nn.Conv1d(1, dsc(128), 15, 1, padding=7)),
+                norm_f(nn.Conv1d(dsc(128), dsc(128), 41, 2, groups=4, padding=20)),
+                norm_f(nn.Conv1d(dsc(128), dsc(256), 41, 2, groups=16, padding=20)),
+                norm_f(nn.Conv1d(dsc(256), dsc(512), 41, 4, groups=16, padding=20)),
+                norm_f(nn.Conv1d(dsc(512), dsc(1024), 41, 4, groups=16, padding=20)),
+                norm_f(nn.Conv1d(dsc(1024), dsc(1024), 41, 1, groups=16, padding=20)),
+                norm_f(nn.Conv1d(dsc(1024), dsc(1024), 5, 1, padding=2)),
             ]
         )
-        self.conv_post = norm_f(nn.Conv1d(dsc(
+        self.conv_post = norm_f(nn.Conv1d(dsc(1024), 1, 3, 1, padding=1))
+        self.envelope = Envelope(max_freq=self.max_freq, sample_rate=sample_rate)
         self.activation = nn.LeakyReLU(0.1)
 
-    def forward(self, x):
-        # Input: raw audio (B, 1, T)
-        x = self.extractor(x)
+    def forward(self, x: Tensor):
         fmap = []
-        for
-        x = self.
+        for l in self.convs:
+            x = self.envelope(x)
+            x = self.activation(l(x))
             fmap.append(x)
         x = self.conv_post(x)
         fmap.append(x)
-        return x.flatten(1), fmap
+        return x.flatten(start_dim=1, end_dim=-1), fmap
 
 
 class MultiEnvelopeDiscriminator(_MultiDiscriminatorT):
+    """Modified from: https://github.com/dinhoitt/BemaGANv2/blob/9560ae9df153c956f259c261c57c4f84f89e3d72/models.py"""
+
     def __init__(
         self,
-        use_spectral_norm: bool = False,
         discriminator_channel_multi: Number = 1,
     ):
         super().__init__()
+        f_times_values = [-1, 0, 1, 300, 500]
         self.discriminators = nn.ModuleList(
-            [
-                DiscriminatorEnvelope(
-                    use_spectral_norm, discriminator_channel_multi
-                ),  # raw envelope
-                DiscriminatorEnvelope(use_spectral_norm),  # downsampled once
-                DiscriminatorEnvelope(use_spectral_norm),  # downsampled twice
-            ]
-        )
-        self.meanpools = nn.ModuleList(
-            [nn.AvgPool1d(4, 2, padding=2), nn.AvgPool1d(4, 2, padding=2)]
+            [DiscriminatorE(f, discriminator_channel_multi) for f in f_times_values]
         )
 
     def forward(self, y, y_hat):
-        y_d_rs
-
-
-
-
-            y_hat = self.meanpools[i - 1](y_hat)
+        y_d_rs = []
+        y_d_gs = []
+        fmap_rs = []
+        fmap_gs = []
+        for d in self.discriminators:
             y_d_r, fmap_r = d(y)
             y_d_g, fmap_g = d(y_hat)
             y_d_rs.append(y_d_r)
-            y_d_gs.append(y_d_g)
             fmap_rs.append(fmap_r)
+            y_d_gs.append(y_d_g)
             fmap_gs.append(fmap_g)
 
         return y_d_rs, y_d_gs, fmap_rs, fmap_gs
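As a usage sketch (not part of the diff), the rewritten MultiEnvelopeDiscriminator now builds five DiscriminatorE instances, one per max_freq in [-1, 0, 1, 300, 500], and its forward pass keeps the usual (real logits, fake logits, real fmaps, fake fmaps) contract; the tensor shapes below are hypothetical:

import torch
from lt_tensor.model_zoo.losses.discriminators import MultiEnvelopeDiscriminator

med = MultiEnvelopeDiscriminator(discriminator_channel_multi=1)
real = torch.randn(2, 1, 8192)   # real waveforms, (B, 1, T)
fake = torch.randn(2, 1, 8192)   # generator output

y_d_rs, y_d_gs, fmap_rs, fmap_gs = med(real, fake)
assert len(y_d_rs) == len(y_d_gs) == 5   # one entry per DiscriminatorE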
lt_tensor/noise_tools.py
CHANGED
@@ -13,6 +13,7 @@ __all__ = [
 ]
 
 from lt_utils.common import *
+from lt_tensor.model_base import Model
 import torch.nn.functional as F
 from lt_tensor.torch_commons import *
 import math
@@ -20,17 +21,17 @@ import random
 from lt_tensor.misc_utils import set_seed
 
 
-def add_gaussian_noise(x: Tensor, noise_level=0.025):
+def add_gaussian_noise(x: Tensor, noise_level: float = 0.025) -> Tensor:
     noise = torch.randn_like(x) * noise_level
     return x + noise
 
 
-def add_uniform_noise(x: Tensor, noise_level=0.025):
+def add_uniform_noise(x: Tensor, noise_level: float = 0.025) -> Tensor:
     noise = (torch.rand_like(x) - 0.5) * 2 * noise_level
     return x + noise
 
 
-def add_linear_noise(x, noise_level=0.05):
+def add_linear_noise(x, noise_level=0.05) -> Tensor:
     T = x.shape[-1]
     ramp = torch.linspace(0, noise_level, T, device=x.device)
     for _ in range(x.dim() - 1):
@@ -38,7 +39,7 @@ def add_linear_noise(x, noise_level=0.05):
     return x + ramp.expand_as(x)
 
 
-def add_impulse_noise(x: Tensor, noise_level=0.025):
+def add_impulse_noise(x: Tensor, noise_level: float = 0.025) -> Tensor:
     # For image inputs
     probs = torch.rand_like(x)
     x_clone = x.detach().clone()
@@ -47,7 +48,7 @@ def add_impulse_noise(x: Tensor, noise_level=0.025):
     return x_clone
 
 
-def add_pink_noise(x: Tensor, noise_level=0.05):
+def add_pink_noise(x: Tensor, noise_level: float = 0.05) -> Tensor:
     # pink noise: divide freq spectrum by sqrt(f)
     if x.ndim == 3:
         x = x.view(-1, x.shape[-1])  # flatten to 2D [B*M, T]
@@ -66,12 +67,12 @@ def add_pink_noise(x: Tensor, noise_level=0.05):
     return x + pink_noised * noise_level
 
 
-def add_clipped_gaussian_noise(x, noise_level=0.025):
+def add_clipped_gaussian_noise(x: Tensor, noise_level: float = 0.025) -> Tensor:
     noise = torch.randn_like(x) * noise_level
     return torch.clamp(x + noise, 0.0, 1.0)
 
 
-def add_multiplicative_noise(x, noise_level=0.025):
+def add_multiplicative_noise(x: Tensor, noise_level: float = 0.025) -> Tensor:
     noise = 1 + torch.randn_like(x) * noise_level
     return x * noise
 
@@ -109,7 +110,15 @@ _NOISE_DIM_SUPPORT = {
 
 def apply_noise(
     x: Tensor,
-    noise_type:
+    noise_type: Literal[
+        "gaussian",
+        "uniform",
+        "linear",
+        "impulse",
+        "pink",
+        "clipped_gaussian",
+        "multiplicative",
+    ] = "gaussian",
     noise_level: float = 0.01,
     seed: Optional[int] = None,
     on_error: Literal["raise", "try_others", "return_unchanged"] = "raise",
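The add_* helpers above keep their behaviour and only gain type hints, while apply_noise now restricts noise_type to the seven literal names. A small, hedged usage sketch (the shapes are hypothetical, and the exact return value of apply_noise is not visible in this diff):

import torch
from lt_tensor.noise_tools import add_gaussian_noise, add_pink_noise, apply_noise

x = torch.randn(4, 1, 1024)                   # hypothetical batch of signals
y1 = add_gaussian_noise(x, noise_level=0.02)  # x plus zero-mean Gaussian noise, std 0.02
y2 = add_pink_noise(x, noise_level=0.05)      # noise shaped by a 1/sqrt(f) spectrum
out = apply_noise(x, noise_type="uniform", noise_level=0.01, seed=0, on_error="raise")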
@@ -229,11 +238,11 @@ class NoiseSchedulerA(nn.Module):
|
|
229
238
|
return collected, noise_history
|
230
239
|
|
231
240
|
|
232
|
-
class NoiseSchedulerB(
|
233
|
-
def __init__(self, timesteps: int =
|
241
|
+
class NoiseSchedulerB(Model):
|
242
|
+
def __init__(self, timesteps: int = 50, l_min: float = 0.0005, l_max: float = 0.05):
|
234
243
|
super().__init__()
|
235
244
|
|
236
|
-
betas = torch.linspace(
|
245
|
+
betas = torch.linspace(l_min, l_max, timesteps)
|
237
246
|
alphas = 1.0 - betas
|
238
247
|
alpha_cumprod = torch.cumprod(alphas, dim=0)
|
239
248
|
|
@@ -272,7 +281,7 @@ class NoiseSchedulerB(nn.Module):
|
|
272
281
|
self, x_0: Tensor, t: int, noise: Optional[Union[Tensor, float]] = None
|
273
282
|
) -> Tensor:
|
274
283
|
assert (
|
275
|
-
|
284
|
+
0 <= t < self.timesteps
|
276
285
|
), f"Time step t={t} is out of bounds for scheduler with {self.timesteps} steps."
|
277
286
|
|
278
287
|
if noise is None:
|
@@ -286,7 +295,7 @@ class NoiseSchedulerB(nn.Module):
|
|
286
295
|
return alpha_term + noise_term
|
287
296
|
|
288
297
|
|
289
|
-
class NoiseSchedulerC(
|
298
|
+
class NoiseSchedulerC(Model):
|
290
299
|
def __init__(self, timesteps: int = 512):
|
291
300
|
super().__init__()
|
292
301
|
|