lt-tensor 0.0.1a15__py3-none-any.whl → 0.0.1a16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lt_tensor/model_zoo/__init__.py +8 -6
- lt_tensor/model_zoo/audio_models/__init__.py +1 -0
- lt_tensor/model_zoo/audio_models/diffwave/__init__.py +3 -0
- lt_tensor/model_zoo/{diffwave → audio_models/diffwave}/model.py +1 -0
- lt_tensor/model_zoo/audio_models/hifigan/__init__.py +393 -0
- lt_tensor/model_zoo/audio_models/istft/__init__.py +409 -0
- lt_tensor/model_zoo/basic.py +139 -0
- {lt_tensor-0.0.1a15.dist-info → lt_tensor-0.0.1a16.dist-info}/METADATA +1 -1
- {lt_tensor-0.0.1a15.dist-info → lt_tensor-0.0.1a16.dist-info}/RECORD +12 -15
- lt_tensor/model_zoo/diffwave/__init__.py +0 -0
- lt_tensor/model_zoo/diffwave/params.py +0 -58
- lt_tensor/model_zoo/discriminator.py +0 -314
- lt_tensor/model_zoo/istft/__init__.py +0 -5
- lt_tensor/model_zoo/istft/generator.py +0 -94
- lt_tensor/model_zoo/istft/sg.py +0 -142
- lt_tensor/model_zoo/istft/trainer.py +0 -627
- {lt_tensor-0.0.1a15.dist-info → lt_tensor-0.0.1a16.dist-info}/WHEEL +0 -0
- {lt_tensor-0.0.1a15.dist-info → lt_tensor-0.0.1a16.dist-info}/licenses/LICENSE +0 -0
- {lt_tensor-0.0.1a15.dist-info → lt_tensor-0.0.1a16.dist-info}/top_level.txt +0 -0
@@ -1,314 +0,0 @@
|
|
1
|
-
from lt_tensor.torch_commons import *
|
2
|
-
import torch.nn.functional as F
|
3
|
-
from lt_tensor.model_base import Model
|
4
|
-
from lt_utils.common import *
|
5
|
-
from einops import rearrange
|
6
|
-
import torchaudio
|
7
|
-
|
8
|
-
|
9
|
-
def get_padding(ks, d):
    """Return the padding that keeps a dilated convolution length-preserving.

    Args:
        ks: Kernel size.
        d: Dilation factor.

    Returns:
        ``(ks * d - d) / 2`` truncated to an ``int``.
    """
    dilated_span = ks * d - d
    return int(dilated_span / 2)
|
11
|
-
|
12
|
-
|
13
|
-
class DiscriminatorP(Model):
    """Period sub-discriminator.

    The 1-D input is folded into a 2-D grid whose last axis has width
    ``period`` and is then passed through a stack of ``(kernel_size, 1)``
    2-D convolutions.

    Args:
        period: Width of the folded axis.
        kernel_size: Kernel height of every convolution in ``convs``.
        stride: Stride (along the folded time axis) of the first four
            convolutions; the fifth always uses stride 1.
        use_spectral_norm: Wrap the convolutions with ``spectral_norm``
            instead of ``weight_norm`` when truthy.
    """

    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super().__init__()
        self.period = period
        norm_f = spectral_norm if use_spectral_norm else weight_norm

        # The padding used to be hard-coded for kernel_size == 5
        # (get_padding(5, 1) and a literal 2). Deriving it from the actual
        # kernel size gives the same value for the default and keeps the
        # layers consistent for any other kernel size.
        pad = (get_padding(kernel_size, 1), 0)

        widths = (1, 32, 128, 512, 1024)
        layers = [
            norm_f(
                nn.Conv2d(
                    c_in,
                    c_out,
                    (kernel_size, 1),
                    (stride, 1),
                    padding=pad,
                )
            )
            for c_in, c_out in zip(widths[:-1], widths[1:])
        ]
        # Final feature layer keeps the channel count and uses stride 1.
        layers.append(
            norm_f(nn.Conv2d(1024, 1024, (kernel_size, 1), 1, padding=pad))
        )
        self.convs = nn.ModuleList(layers)
        self.conv_post = norm_f(nn.Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
        self.activation = nn.LeakyReLU(0.1)

    def forward(self, x):
        """Score ``x`` and collect the intermediate feature maps.

        Args:
            x: 3-D tensor unpacked as ``(b, c, t)``.

        Returns:
            A tuple ``(score, fmap)`` where ``score`` is the ``conv_post``
            output flattened from dim 1 onwards and ``fmap`` is a list with
            the activation after every layer of ``convs`` followed by the raw
            ``conv_post`` output.
        """
        fmap = []

        # 1d to 2d: right-pad (reflect) so that t is a multiple of the period,
        # then fold the time axis into (t // period, period).
        b, c, t = x.shape
        remainder = t % self.period
        if remainder != 0:
            n_pad = self.period - remainder
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for conv in self.convs:
            x = self.activation(conv(x))
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap
|
82
|
-
|
83
|
-
|
84
|
-
class MultiPeriodDiscriminator(Model):
    """Bundle of ``DiscriminatorP`` instances with periods 2, 3, 5, 7 and 11."""

    def __init__(self):
        super().__init__()
        periods = (2, 3, 5, 7, 11)
        self.discriminators = nn.ModuleList(
            [DiscriminatorP(period) for period in periods]
        )

    def forward(self, y, y_hat):
        """Run every sub-discriminator on both inputs.

        Args:
            y: First input, passed unchanged to each sub-discriminator.
            y_hat: Second input, passed unchanged to each sub-discriminator.

        Returns:
            Four lists, one entry per sub-discriminator: scores for ``y``,
            scores for ``y_hat``, feature maps for ``y`` and feature maps for
            ``y_hat``.
        """
        scores_real, scores_gen = [], []
        fmaps_real, fmaps_gen = [], []
        for disc in self.discriminators:
            score_r, maps_r = disc(y)
            score_g, maps_g = disc(y_hat)
            scores_real.append(score_r)
            fmaps_real.append(maps_r)
            scores_gen.append(score_g)
            fmaps_gen.append(maps_g)

        return scores_real, scores_gen, fmaps_real, fmaps_gen
|
111
|
-
|
112
|
-
|
113
|
-
class DiscriminatorS(Model):
    """Scale sub-discriminator built from a stack of grouped 1-D convolutions.

    Args:
        use_spectral_norm: When equal to ``False`` the convolutions are wrapped
            with ``weight_norm``; otherwise with ``spectral_norm``.
    """

    def __init__(self, use_spectral_norm=False):
        super().__init__()
        if use_spectral_norm == False:
            norm_f = weight_norm
        else:
            norm_f = spectral_norm

        # (in_channels, out_channels, kernel, stride, groups, padding)
        layer_specs = (
            (1, 128, 15, 1, 1, 7),
            (128, 128, 41, 2, 4, 20),
            (128, 256, 41, 2, 16, 20),
            (256, 512, 41, 4, 16, 20),
            (512, 1024, 41, 4, 16, 20),
            (1024, 1024, 41, 1, 16, 20),
            (1024, 1024, 5, 1, 1, 2),
        )
        self.convs = nn.ModuleList(
            [
                norm_f(
                    nn.Conv1d(
                        c_in, c_out, kernel, stride, groups=groups, padding=pad
                    )
                )
                for c_in, c_out, kernel, stride, groups, pad in layer_specs
            ]
        )
        self.activation = nn.LeakyReLU(0.1)
        self.conv_post = norm_f(nn.Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        """Score ``x`` and collect the intermediate feature maps.

        Returns:
            ``(score, fmap)``: the ``conv_post`` output flattened from dim 1
            onwards, and a list holding the activation after every layer of
            ``convs`` followed by the raw ``conv_post`` output.
        """
        fmap = []
        for conv in self.convs:
            x = self.activation(conv(x))
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        score = torch.flatten(x, 1, -1)

        return score, fmap
|
142
|
-
|
143
|
-
|
144
|
-
class MultiScaleDiscriminator(Model):
    """Three ``DiscriminatorS`` instances applied at successively pooled scales.

    The first sub-discriminator uses spectral norm and sees the raw inputs.
    Before the second and the third, both inputs are passed through an
    ``AvgPool1d(4, 2, padding=2)``; the pooling is cumulative.
    """

    def __init__(self):
        super().__init__()
        self.discriminators = nn.ModuleList(
            [
                DiscriminatorS(use_spectral_norm=True),
                DiscriminatorS(),
                DiscriminatorS(),
            ]
        )
        self.meanpools = nn.ModuleList(
            [nn.AvgPool1d(4, 2, padding=2), nn.AvgPool1d(4, 2, padding=2)]
        )

    def forward(self, y, y_hat):
        """Run every sub-discriminator on both inputs.

        Returns:
            Four lists, one entry per sub-discriminator: scores for ``y``,
            scores for ``y_hat``, feature maps for ``y`` and feature maps for
            ``y_hat``.
        """
        scores_real, scores_gen = [], []
        fmaps_real, fmaps_gen = [], []

        # The first discriminator has no pooling stage in front of it.
        pools = [None, *self.meanpools]
        for pool, disc in zip(pools, self.discriminators):
            if pool is not None:
                y = pool(y)
                y_hat = pool(y_hat)
            score_r, maps_r = disc(y)
            score_g, maps_g = disc(y_hat)
            scores_real.append(score_r)
            fmaps_real.append(maps_r)
            scores_gen.append(score_g)
            fmaps_gen.append(maps_g)

        return scores_real, scores_gen, fmaps_real, fmaps_gen
|
175
|
-
|
176
|
-
|
177
|
-
class MultiResolutionDiscriminator(Model):
    """Source: https://github.com/gemelo-ai/vocos/blob/main/vocos/discriminators.py"""

    def __init__(
        self,
        fft_sizes: Tuple[int, ...] = (2048, 1024, 512),
        num_embeddings: Optional[int] = None,
    ):
        """Build one ``DiscriminatorR`` per FFT size.

        Args:
            fft_sizes (tuple[int]): Window lengths, one per sub-discriminator.
                Defaults to (2048, 1024, 512).
            num_embeddings (int, optional): Forwarded to every
                ``DiscriminatorR``. None means non-conditional discriminator.
                Defaults to None.
        """
        super().__init__()
        sub_discriminators = []
        for window in fft_sizes:
            sub_discriminators.append(
                DiscriminatorR(window_length=window, num_embeddings=num_embeddings)
            )
        self.discriminators = nn.ModuleList(sub_discriminators)

    def forward(
        self, y: torch.Tensor, y_hat: torch.Tensor, bandwidth_id: torch.Tensor = None
    ) -> Tuple[
        List[torch.Tensor],
        List[torch.Tensor],
        List[List[torch.Tensor]],
        List[List[torch.Tensor]],
    ]:
        """Run every sub-discriminator on both inputs.

        Args:
            y: First input.
            y_hat: Second input.
            bandwidth_id: Passed to each sub-discriminator as
                ``cond_embedding_id``.

        Returns:
            Scores for ``y``, scores for ``y_hat``, feature maps for ``y`` and
            feature maps for ``y_hat``; each list has one entry per
            sub-discriminator.
        """
        scores_real, scores_gen = [], []
        fmaps_real, fmaps_gen = [], []

        for disc in self.discriminators:
            score_r, maps_r = disc(x=y, cond_embedding_id=bandwidth_id)
            score_g, maps_g = disc(x=y_hat, cond_embedding_id=bandwidth_id)
            scores_real.append(score_r)
            fmaps_real.append(maps_r)
            scores_gen.append(score_g)
            fmaps_gen.append(maps_g)

        return scores_real, scores_gen, fmaps_real, fmaps_gen
|
223
|
-
|
224
|
-
|
225
|
-
class DiscriminatorR(Model):
    """Band-split spectrogram sub-discriminator.

    A complex spectrogram of the input is split along the frequency axis into
    ``bands``; every band has its own stack of five 2-D convolutions. The band
    outputs are concatenated and scored by ``conv_post``.

    Args:
        window_length: Used as both ``n_fft`` and ``win_length``.
        num_embeddings: When not None, an embedding table of that size (width
            ``channels``, zero-initialised) is created as ``self.emb``.
        channels: Channel width of every band convolution.
        hop_factor: Hop length as a fraction of ``window_length``.
        bands: ``(low, high)`` fractions of the ``window_length // 2 + 1``
            frequency bins that delimit each band.
    """

    def __init__(
        self,
        window_length: int,
        num_embeddings: Optional[int] = None,
        channels: int = 32,
        hop_factor: float = 0.25,
        bands: Tuple[Tuple[float, float], ...] = (
            (0.0, 0.1),
            (0.1, 0.25),
            (0.25, 0.5),
            (0.5, 0.75),
            (0.75, 1.0),
        ),
    ):
        super().__init__()
        self.window_length = window_length
        self.hop_factor = hop_factor
        self.spec_fn = torchaudio.transforms.Spectrogram(
            n_fft=window_length,
            hop_length=int(window_length * hop_factor),
            win_length=window_length,
            power=None,
        )

        # Turn the fractional band edges into frequency-bin indices.
        n_bins = window_length // 2 + 1
        self.bands = [(int(lo * n_bins), int(hi * n_bins)) for lo, hi in bands]

        def make_stack():
            # One independent conv stack per band: three layers with stride 2
            # on the last axis sit between two stride-1 layers.
            strided = [
                weight_norm(
                    nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))
                )
                for _ in range(3)
            ]
            return nn.ModuleList(
                [
                    weight_norm(
                        nn.Conv2d(2, channels, (3, 9), (1, 1), padding=(1, 4))
                    ),
                    *strided,
                    weight_norm(
                        nn.Conv2d(channels, channels, (3, 3), (1, 1), padding=(1, 1))
                    ),
                ]
            )

        self.band_convs = nn.ModuleList([make_stack() for _ in self.bands])

        if num_embeddings is not None:
            self.emb = torch.nn.Embedding(
                num_embeddings=num_embeddings, embedding_dim=channels
            )
            torch.nn.init.zeros_(self.emb.weight)

        self.conv_post = weight_norm(
            nn.Conv2d(channels, 1, (3, 3), (1, 1), padding=(1, 1))
        )

    def spectrogram(self, x):
        """Return the list of per-band real/imag spectrogram slices of ``x``.

        The ``"b f t c"`` rearrange pattern requires the complex spectrogram
        to be 3-D, i.e. it presumably expects ``x`` as ``(batch, time)``.
        """
        # Remove DC offset
        centered = x - x.mean(dim=-1, keepdims=True)
        # Peak normalize the volume of input audio
        peak = centered.abs().max(dim=-1, keepdim=True)[0]
        normalized = 0.8 * centered / (peak + 1e-9)
        spec = torch.view_as_real(self.spec_fn(normalized))
        spec = rearrange(spec, "b f t c -> b c t f")
        # Split into bands along the (last) frequency axis
        return [spec[..., lo:hi] for lo, hi in self.bands]

    def forward(self, x: torch.Tensor, cond_embedding_id: torch.Tensor = None):
        """Score ``x`` and collect feature maps.

        Args:
            x: Input signal handed to :meth:`spectrogram`.
            cond_embedding_id: Optional index into ``self.emb``; requires the
                module to have been built with ``num_embeddings``.

        Returns:
            ``(score, fmap)``. ``fmap`` holds the activations of every band
            layer except the first one of each stack, followed by the
            ``conv_post`` output.
        """
        fmap = []
        band_outputs = []
        for band, stack in zip(self.spectrogram(x), self.band_convs):
            for idx, layer in enumerate(stack):
                band = torch.nn.functional.leaky_relu(layer(band), 0.1)
                if idx > 0:
                    fmap.append(band)
            band_outputs.append(band)
        x = torch.cat(band_outputs, dim=-1)

        if cond_embedding_id is None:
            h = 0
        else:
            emb = self.emb(cond_embedding_id)
            h = (emb.view(1, -1, 1, 1) * x).sum(dim=1, keepdims=True)

        x = self.conv_post(x)
        fmap.append(x)
        # NOTE: in-place add, so the tensor just stored in fmap is updated too.
        x += h

        return x, fmap
|
@@ -1,94 +0,0 @@
|
|
1
|
-
__all__ = ["iSTFTGenerator"]
|
2
|
-
from lt_utils.common import *
|
3
|
-
from lt_tensor.torch_commons import *
|
4
|
-
from lt_tensor.model_zoo.residual import ConvNets, ResBlocks1D, ResBlock1D, ResBlock1D2
|
5
|
-
|
6
|
-
|
7
|
-
class iSTFTGenerator(ConvNets):
    """Upsampling generator that predicts a spectrogram/phase pair.

    A pre-convolution is followed by one block per upsample rate (transposed
    convolution plus ``ResBlocks1D``); a final convolution produces
    ``n_fft + 2`` channels that :meth:`forward` splits into two halves.

    Args:
        in_channels: Channels of the input features.
        upsample_rates: Stride of each transposed convolution.
        upsample_kernel_sizes: Kernel size of each transposed convolution.
        upsample_initial_channel: Width after ``conv_pre``; halved by every
            upsample block.
        resblock_kernel_sizes: Forwarded to ``ResBlocks1D``.
        resblock_dilation_sizes: Forwarded to ``ResBlocks1D``.
        n_fft: ``conv_post`` emits ``n_fft + 2`` channels.
        activation: Module applied before every upsample layer and before
            ``conv_post``; also handed to ``ResBlocks1D``.
        hop_length: Stored on the instance; not used inside this class.
        residual_cls: Residual block class forwarded to ``ResBlocks1D``.
    """

    def __init__(
        self,
        in_channels: int = 80,
        upsample_rates: List[Union[int, List[int]]] = [8, 8],
        upsample_kernel_sizes: List[Union[int, List[int]]] = [16, 16],
        upsample_initial_channel: int = 512,
        resblock_kernel_sizes: List[Union[int, List[int]]] = [3, 7, 11],
        resblock_dilation_sizes: List[Union[int, List[int]]] = [
            [1, 3, 5],
            [1, 3, 5],
            [1, 3, 5],
        ],
        n_fft: int = 16,
        activation: nn.Module = nn.LeakyReLU(0.1),
        hop_length: int = 256,
        residual_cls: Union[ResBlock1D, ResBlock1D2] = ResBlock1D,
    ):
        # NOTE: the list defaults are shared between calls; they are only read
        # here, never mutated.
        super().__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.hop_length = hop_length
        self.conv_pre = weight_norm(
            nn.Conv1d(in_channels, upsample_initial_channel, 7, 1, padding=3)
        )
        self.blocks = nn.ModuleList()
        self.activation = activation
        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            self.blocks.append(
                self._make_blocks(
                    (i, k, u),
                    upsample_initial_channel,
                    resblock_kernel_sizes,
                    resblock_dilation_sizes,
                    residual_cls,
                )
            )

        # Width after the last upsample block. Derived from the number of
        # blocks actually built rather than from the leaked loop index, so an
        # empty upsample configuration no longer raises NameError.
        ch = upsample_initial_channel // (2 ** len(self.blocks))
        self.post_n_fft = n_fft // 2 + 1
        self.conv_post = weight_norm(nn.Conv1d(ch, n_fft + 2, 7, 1, padding=3))
        self.conv_post.apply(self.init_weights)
        self.reflection_pad = nn.ReflectionPad1d((1, 0))

    def _make_blocks(
        self,
        state: Tuple[int, int, int],
        upsample_initial_channel: int,
        resblock_kernel_sizes: List[Union[int, List[int]]],
        resblock_dilation_sizes: List[Union[int, List[int]]],
        residual: nn.Module,
    ):
        """Build one upsample stage.

        Args:
            state: ``(index, kernel_size, stride)`` of the stage.
            upsample_initial_channel: Width before the first stage.
            resblock_kernel_sizes: Forwarded to ``ResBlocks1D``.
            resblock_dilation_sizes: Forwarded to ``ResBlocks1D``.
            residual: Residual block class forwarded to ``ResBlocks1D``.

        Returns:
            ``nn.ModuleDict`` with keys ``"up"`` and ``"residual"``.
        """
        i, k, u = state
        in_width = upsample_initial_channel // (2**i)
        channels = upsample_initial_channel // (2 ** (i + 1))
        upsample = weight_norm(
            nn.ConvTranspose1d(
                in_width,
                channels,
                k,
                u,
                padding=(k - u) // 2,
            )
        ).apply(self.init_weights)
        return nn.ModuleDict(
            dict(
                up=nn.Sequential(self.activation, upsample),
                residual=ResBlocks1D(
                    channels,
                    resblock_kernel_sizes,
                    resblock_dilation_sizes,
                    self.activation,
                    residual,
                ),
            )
        )

    def forward(self, x):
        """Return ``(spec, phase)`` for the input features ``x``.

        ``spec`` is ``exp`` of the first ``post_n_fft`` output channels and
        ``phase`` is ``sin`` of the remaining ones.
        """
        x = self.conv_pre(x)
        for block in self.blocks:
            x = block["up"](x)
            x = block["residual"](x)

        x = self.conv_post(self.activation(self.reflection_pad(x)))
        spec = torch.exp(x[:, : self.post_n_fft, :])
        phase = torch.sin(x[:, self.post_n_fft :, :])
        return spec, phase
|
lt_tensor/model_zoo/istft/sg.py
DELETED
@@ -1,142 +0,0 @@
|
|
1
|
-
import torch
|
2
|
-
import torch.nn as nn
|
3
|
-
import math
|
4
|
-
from einops import repeat
|
5
|
-
|
6
|
-
|
7
|
-
class SineGen(nn.Module):
    """Sine-wave source generator driven by an F0 contour.

    Args:
        samp_rate: Sampling rate used to convert Hz into per-sample phase.
        upsample_scale: Number of output samples per F0 frame.
        harmonic_num: Number of harmonics above the fundamental.
        sine_amp: Amplitude applied to the sine waves.
        noise_std: Standard deviation of the Gaussian noise.
        voiced_threshold: Frames with ``f0`` above this value count as voiced.
        flag_for_pulse: When true, ``_f02sine`` starts every harmonic at phase
            zero instead of a random phase. ``forward`` does not use it.
    """

    def __init__(
        self,
        samp_rate,
        upsample_scale,
        harmonic_num=0,
        sine_amp=0.1,
        noise_std=0.003,
        voiced_threshold=0,
        flag_for_pulse=False,
    ):
        super().__init__()
        self.sampling_rate = samp_rate
        self.upsample_scale = upsample_scale
        self.harmonic_num = harmonic_num
        self.sine_amp = sine_amp
        self.noise_std = noise_std
        self.voiced_threshold = voiced_threshold
        self.flag_for_pulse = flag_for_pulse
        self.dim = self.harmonic_num + 1  # fundamental + harmonics

    def _harmonic_multipliers(self, device):
        """Return ``[1, 2, ..., dim]`` as a float tensor shaped ``(1, 1, dim)``."""
        return torch.arange(1, self.dim + 1, device=device).float().view(1, 1, -1)

    def _upsample(self, frames):
        """Repeat every step of axis 1 ``upsample_scale`` times in a row."""
        return frames.repeat_interleave(self.upsample_scale, dim=1)

    def _f02uv_b(self, f0):
        """Return a float voiced mask with the same shape as ``f0``."""
        voiced = f0 > self.voiced_threshold
        return voiced.float()

    def _f02uv(self, f0):
        """Return a float voiced mask with a trailing singleton axis added."""
        voiced = f0 > self.voiced_threshold
        return voiced.float().unsqueeze(-1)  # -> (B, T, 1)

    @torch.no_grad()
    def _f02sine(self, f0_values):
        """Turn an F0 contour into sine waves for all harmonics.

        f0_values: (B, T, 1)
        Output: sine waves (B, T * upsample, dim)
        """
        batch, _, _ = f0_values.size()
        device = f0_values.device
        f0_up = self._upsample(f0_values)  # (B, T_up, 1)

        # One column per harmonic.
        f0_harm = f0_up * self._harmonic_multipliers(device)  # (B, T_up, dim)

        # Cycles per sample, wrapped into [0, 1).
        cycles = f0_harm / self.sampling_rate
        cycles = cycles % 1.0

        # Initial phase per harmonic: zero in pulse mode, random otherwise.
        init_shape = (batch, 1, self.dim)
        if self.flag_for_pulse:
            start = torch.zeros(init_shape, device=device)
        else:
            start = torch.rand(init_shape, device=device)
        start = start * 2 * math.pi

        # Integrate the per-sample phase increments.
        increments = cycles * 2 * math.pi
        phase = torch.cumsum(increments, dim=1) + start  # (B, T_up, dim)

        return torch.sin(phase)  # (B, T_up, dim)

    def _forward(self, f0):
        """Alternative generator.

        f0: (B, T, 1)
        returns: sine signal with harmonics; noise is added only where the
        frame is unvoiced.
        """
        waves = self._f02sine(f0)  # (B, T_up, dim)
        mask = self._upsample(self._f02uv_b(f0))  # (B, T_up, 1)

        # voiced sine + unvoiced noise
        voiced_part = self.sine_amp * waves * mask  # (B, T_up, dim)
        noise = torch.randn_like(voiced_part) * self.noise_std
        return voiced_part + noise * (1.0 - mask)  # (B, T_up, dim)

    def forward(self, f0):
        """Generate harmonic sine waves from a frame-level F0 contour.

        Args:
            f0: (B, T) in Hz (before upsampling)
        Returns:
            sine_waves: (B, T_up, dim)
            uv: (B, T_up, 1)
            noise: (B, T_up, 1)
        """
        _, _ = f0.shape  # unpacking rejects anything that is not 2-D
        device = f0.device

        # Voiced mask is taken before upsampling.
        uv = self._f02uv(f0)  # (B, T, 1)

        # Expand f0 to one column per harmonic.
        f0_harm = f0.unsqueeze(-1) * self._harmonic_multipliers(device)  # (B, T, dim)

        # Upsample
        f0_harm_up = self._upsample(f0_harm)  # (B, T_up, dim)
        uv_up = self._upsample(uv)  # (B, T_up, 1)

        # Hz -> cycles/sample -> radians/sample
        rad_per_sample = f0_harm_up / self.sampling_rate
        rad_per_sample = rad_per_sample * 2 * math.pi

        # One random initial phase per (batch item, harmonic).
        B, T_up, D = rad_per_sample.shape
        rand_phase = torch.rand(B, D, device=device) * 2 * math.pi  # (B, D)

        # Integrate the phase increments along time.
        phase = torch.cumsum(rad_per_sample, dim=1) + rand_phase.unsqueeze(1)

        sine_waves = torch.sin(phase) * self.sine_amp  # (B, T_up, D)

        # Noise is generated for the fundamental only. Pulse mode
        # (flag_for_pulse) has no effect in this method.
        noise = torch.randn(B, T_up, 1, device=device) * self.noise_std

        # Replace sine by noise for unvoiced (only on fundamental)
        fundamental = sine_waves[:, :, 0:1]
        sine_waves[:, :, 0:1] = fundamental * uv_up + noise * (1 - uv_up)

        return sine_waves, uv_up, noise
|