lt-tensor 0.0.1a15__py3-none-any.whl → 0.0.1a16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,314 +0,0 @@
- from lt_tensor.torch_commons import *
- import torch.nn.functional as F
- from lt_tensor.model_base import Model
- from lt_utils.common import *
- from einops import rearrange
- import torchaudio
-
-
- def get_padding(ks, d):
-     return int((ks * d - d) / 2)
-
-
- class DiscriminatorP(Model):
-     def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
-         super().__init__()
-         self.period = period
-         norm_f = weight_norm if use_spectral_norm == False else spectral_norm
-         self.convs = nn.ModuleList(
-             [
-                 norm_f(
-                     nn.Conv2d(
-                         1,
-                         32,
-                         (kernel_size, 1),
-                         (stride, 1),
-                         padding=(get_padding(5, 1), 0),
-                     )
-                 ),
-                 norm_f(
-                     nn.Conv2d(
-                         32,
-                         128,
-                         (kernel_size, 1),
-                         (stride, 1),
-                         padding=(get_padding(5, 1), 0),
-                     )
-                 ),
-                 norm_f(
-                     nn.Conv2d(
-                         128,
-                         512,
-                         (kernel_size, 1),
-                         (stride, 1),
-                         padding=(get_padding(5, 1), 0),
-                     )
-                 ),
-                 norm_f(
-                     nn.Conv2d(
-                         512,
-                         1024,
-                         (kernel_size, 1),
-                         (stride, 1),
-                         padding=(get_padding(5, 1), 0),
-                     )
-                 ),
-                 norm_f(nn.Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))),
-             ]
-         )
-         self.conv_post = norm_f(nn.Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
-         self.activation = nn.LeakyReLU(0.1)
-
-     def forward(self, x):
-         fmap = []
-
-         # 1d to 2d
-         b, c, t = x.shape
-         if t % self.period != 0:  # pad first
-             n_pad = self.period - (t % self.period)
-             x = F.pad(x, (0, n_pad), "reflect")
-             t = t + n_pad
-         x = x.view(b, c, t // self.period, self.period)
-
-         for l in self.convs:
-             x = l(x)
-             x = self.activation(x)
-             fmap.append(x)
-         x = self.conv_post(x)
-         fmap.append(x)
-         x = torch.flatten(x, 1, -1)
-
-         return x, fmap
-
-
- class MultiPeriodDiscriminator(Model):
-     def __init__(self):
-         super().__init__()
-         self.discriminators = nn.ModuleList(
-             [
-                 DiscriminatorP(2),
-                 DiscriminatorP(3),
-                 DiscriminatorP(5),
-                 DiscriminatorP(7),
-                 DiscriminatorP(11),
-             ]
-         )
-
-     def forward(self, y, y_hat):
-         y_d_rs = []
-         y_d_gs = []
-         fmap_rs = []
-         fmap_gs = []
-         for i, d in enumerate(self.discriminators):
-             y_d_r, fmap_r = d(y)
-             y_d_g, fmap_g = d(y_hat)
-             y_d_rs.append(y_d_r)
-             fmap_rs.append(fmap_r)
-             y_d_gs.append(y_d_g)
-             fmap_gs.append(fmap_g)
-
-         return y_d_rs, y_d_gs, fmap_rs, fmap_gs
-
-
- class DiscriminatorS(Model):
-     def __init__(self, use_spectral_norm=False):
-         super().__init__()
-         norm_f = weight_norm if use_spectral_norm == False else spectral_norm
-         self.convs = nn.ModuleList(
-             [
-                 norm_f(nn.Conv1d(1, 128, 15, 1, padding=7)),
-                 norm_f(nn.Conv1d(128, 128, 41, 2, groups=4, padding=20)),
-                 norm_f(nn.Conv1d(128, 256, 41, 2, groups=16, padding=20)),
-                 norm_f(nn.Conv1d(256, 512, 41, 4, groups=16, padding=20)),
-                 norm_f(nn.Conv1d(512, 1024, 41, 4, groups=16, padding=20)),
-                 norm_f(nn.Conv1d(1024, 1024, 41, 1, groups=16, padding=20)),
-                 norm_f(nn.Conv1d(1024, 1024, 5, 1, padding=2)),
-             ]
-         )
-         self.activation = nn.LeakyReLU(0.1)
-         self.conv_post = norm_f(nn.Conv1d(1024, 1, 3, 1, padding=1))
-
-     def forward(self, x):
-         fmap = []
-         for l in self.convs:
-             x = l(x)
-             x = self.activation(x)
-             fmap.append(x)
-         x = self.conv_post(x)
-         fmap.append(x)
-         x = torch.flatten(x, 1, -1)
-
-         return x, fmap
-
-
- class MultiScaleDiscriminator(Model):
-     def __init__(self):
-         super().__init__()
-         self.discriminators = nn.ModuleList(
-             [
-                 DiscriminatorS(use_spectral_norm=True),
-                 DiscriminatorS(),
-                 DiscriminatorS(),
-             ]
-         )
-         self.meanpools = nn.ModuleList(
-             [nn.AvgPool1d(4, 2, padding=2), nn.AvgPool1d(4, 2, padding=2)]
-         )
-
-     def forward(self, y, y_hat):
-         y_d_rs = []
-         y_d_gs = []
-         fmap_rs = []
-         fmap_gs = []
-         for i, d in enumerate(self.discriminators):
-             if i != 0:
-                 y = self.meanpools[i - 1](y)
-                 y_hat = self.meanpools[i - 1](y_hat)
-             y_d_r, fmap_r = d(y)
-             y_d_g, fmap_g = d(y_hat)
-             y_d_rs.append(y_d_r)
-             fmap_rs.append(fmap_r)
-             y_d_gs.append(y_d_g)
-             fmap_gs.append(fmap_g)
-
-         return y_d_rs, y_d_gs, fmap_rs, fmap_gs
-
-
- class MultiResolutionDiscriminator(Model):
-     """Source: https://github.com/gemelo-ai/vocos/blob/main/vocos/discriminators.py"""
-
-     def __init__(
-         self,
-         fft_sizes: Tuple[int, ...] = (2048, 1024, 512),
-         num_embeddings: Optional[int] = None,
-     ):
-         """
-
-         Args:
-             fft_sizes (tuple[int]): Tuple of window lengths for FFT. Defaults to (2048, 1024, 512).
-             num_embeddings (int, optional): Number of embeddings. None means non-conditional discriminator.
-                 Defaults to None.
-         """
-
-         super().__init__()
-         self.discriminators = nn.ModuleList(
-             [
-                 DiscriminatorR(window_length=w, num_embeddings=num_embeddings)
-                 for w in fft_sizes
-             ]
-         )
-
-     def forward(
-         self, y: torch.Tensor, y_hat: torch.Tensor, bandwidth_id: torch.Tensor = None
-     ) -> Tuple[
-         List[torch.Tensor],
-         List[torch.Tensor],
-         List[List[torch.Tensor]],
-         List[List[torch.Tensor]],
-     ]:
-         y_d_rs = []
-         y_d_gs = []
-         fmap_rs = []
-         fmap_gs = []
-
-         for d in self.discriminators:
-             y_d_r, fmap_r = d(x=y, cond_embedding_id=bandwidth_id)
-             y_d_g, fmap_g = d(x=y_hat, cond_embedding_id=bandwidth_id)
-             y_d_rs.append(y_d_r)
-             fmap_rs.append(fmap_r)
-             y_d_gs.append(y_d_g)
-             fmap_gs.append(fmap_g)
-
-         return y_d_rs, y_d_gs, fmap_rs, fmap_gs
-
-
- class DiscriminatorR(Model):
-     def __init__(
-         self,
-         window_length: int,
-         num_embeddings: Optional[int] = None,
-         channels: int = 32,
-         hop_factor: float = 0.25,
-         bands: Tuple[Tuple[float, float], ...] = (
-             (0.0, 0.1),
-             (0.1, 0.25),
-             (0.25, 0.5),
-             (0.5, 0.75),
-             (0.75, 1.0),
-         ),
-     ):
-         super().__init__()
-         self.window_length = window_length
-         self.hop_factor = hop_factor
-         self.spec_fn = torchaudio.transforms.Spectrogram(
-             n_fft=window_length,
-             hop_length=int(window_length * hop_factor),
-             win_length=window_length,
-             power=None,
-         )
-         n_fft = window_length // 2 + 1
-         bands = [(int(b[0] * n_fft), int(b[1] * n_fft)) for b in bands]
-         self.bands = bands
-         convs = lambda: nn.ModuleList(
-             [
-                 weight_norm(nn.Conv2d(2, channels, (3, 9), (1, 1), padding=(1, 4))),
-                 weight_norm(
-                     nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))
-                 ),
-                 weight_norm(
-                     nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))
-                 ),
-                 weight_norm(
-                     nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))
-                 ),
-                 weight_norm(
-                     nn.Conv2d(channels, channels, (3, 3), (1, 1), padding=(1, 1))
-                 ),
-             ]
-         )
-         self.band_convs = nn.ModuleList([convs() for _ in range(len(self.bands))])
-
-         if num_embeddings is not None:
-             self.emb = torch.nn.Embedding(
-                 num_embeddings=num_embeddings, embedding_dim=channels
-             )
-             torch.nn.init.zeros_(self.emb.weight)
-
-         self.conv_post = weight_norm(
-             nn.Conv2d(channels, 1, (3, 3), (1, 1), padding=(1, 1))
-         )
-
-     def spectrogram(self, x):
-         # Remove DC offset
-         x = x - x.mean(dim=-1, keepdims=True)
-         # Peak normalize the volume of input audio
-         x = 0.8 * x / (x.abs().max(dim=-1, keepdim=True)[0] + 1e-9)
-         x = self.spec_fn(x)
-         x = torch.view_as_real(x)
-         x = rearrange(x, "b f t c -> b c t f")
-         # Split into bands
-         x_bands = [x[..., b[0] : b[1]] for b in self.bands]
-         return x_bands
-
-     def forward(self, x: torch.Tensor, cond_embedding_id: torch.Tensor = None):
-         x_bands = self.spectrogram(x)
-         fmap = []
-         x = []
-         for band, stack in zip(x_bands, self.band_convs):
-             for i, layer in enumerate(stack):
-                 band = layer(band)
-                 band = torch.nn.functional.leaky_relu(band, 0.1)
-                 if i > 0:
-                     fmap.append(band)
-             x.append(band)
-         x = torch.cat(x, dim=-1)
-         if cond_embedding_id is not None:
-             emb = self.emb(cond_embedding_id)
-             h = (emb.view(1, -1, 1, 1) * x).sum(dim=1, keepdims=True)
-         else:
-             h = 0
-         x = self.conv_post(x)
-         fmap.append(x)
-         x += h
-
-         return x, fmap
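
For orientation: all of the removed discriminators return, for real audio y and generated audio y_hat, a list of per-discriminator logits plus the intermediate feature maps, following the HiFi-GAN/Vocos convention. Below is a minimal sketch of how such outputs are typically consumed in a vocoder training loop; discriminator_loss and feature_matching_loss are illustrative helpers, not lt-tensor APIs:

import torch
import torch.nn.functional as F

def discriminator_loss(real_logits, fake_logits):
    # LSGAN objective: push real outputs toward 1 and fake outputs toward 0,
    # summed over every sub-discriminator (periods 2/3/5/7/11, scales, bands).
    loss = 0.0
    for d_real, d_fake in zip(real_logits, fake_logits):
        loss = loss + torch.mean((1.0 - d_real) ** 2) + torch.mean(d_fake**2)
    return loss

def feature_matching_loss(fmaps_real, fmaps_fake):
    # L1 distance between the feature-map lists for real and generated audio.
    loss = 0.0
    for maps_r, maps_f in zip(fmaps_real, fmaps_fake):
        for r, f in zip(maps_r, maps_f):
            loss = loss + F.l1_loss(f, r.detach())
    return loss

# Typical usage with the classes above, where y and y_hat are (B, 1, T):
# y_d_rs, y_d_gs, fmap_rs, fmap_gs = MultiPeriodDiscriminator()(y, y_hat)
# d_loss = discriminator_loss(y_d_rs, y_d_gs)
# fm_loss = feature_matching_loss(fmap_rs, fmap_gs)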
@@ -1,5 +0,0 @@
- from .generator import iSTFTGenerator
- from . import trainer
-
-
- __all__ = ["iSTFTGenerator", "trainer"]
@@ -1,94 +0,0 @@
- __all__ = ["iSTFTGenerator"]
- from lt_utils.common import *
- from lt_tensor.torch_commons import *
- from lt_tensor.model_zoo.residual import ConvNets, ResBlocks1D, ResBlock1D, ResBlock1D2
-
-
- class iSTFTGenerator(ConvNets):
-     def __init__(
-         self,
-         in_channels: int = 80,
-         upsample_rates: List[Union[int, List[int]]] = [8, 8],
-         upsample_kernel_sizes: List[Union[int, List[int]]] = [16, 16],
-         upsample_initial_channel: int = 512,
-         resblock_kernel_sizes: List[Union[int, List[int]]] = [3, 7, 11],
-         resblock_dilation_sizes: List[Union[int, List[int]]] = [
-             [1, 3, 5],
-             [1, 3, 5],
-             [1, 3, 5],
-         ],
-         n_fft: int = 16,
-         activation: nn.Module = nn.LeakyReLU(0.1),
-         hop_length: int = 256,
-         residual_cls: Union[ResBlock1D, ResBlock1D2] = ResBlock1D
-     ):
-         super().__init__()
-         self.num_kernels = len(resblock_kernel_sizes)
-         self.num_upsamples = len(upsample_rates)
-         self.hop_length = hop_length
-         self.conv_pre = weight_norm(
-             nn.Conv1d(in_channels, upsample_initial_channel, 7, 1, padding=3)
-         )
-         self.blocks = nn.ModuleList()
-         self.activation = activation
-         for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
-             self.blocks.append(
-                 self._make_blocks(
-                     (i, k, u),
-                     upsample_initial_channel,
-                     resblock_kernel_sizes,
-                     resblock_dilation_sizes,
-                     residual_cls
-                 )
-             )
-
-         ch = upsample_initial_channel // (2 ** (i + 1))
-         self.post_n_fft = n_fft // 2 + 1
-         self.conv_post = weight_norm(nn.Conv1d(ch, n_fft + 2, 7, 1, padding=3))
-         self.conv_post.apply(self.init_weights)
-         self.reflection_pad = nn.ReflectionPad1d((1, 0))
-
-     def _make_blocks(
-         self,
-         state: Tuple[int, int, int],
-         upsample_initial_channel: int,
-         resblock_kernel_sizes: List[Union[int, List[int]]],
-         resblock_dilation_sizes: List[int | List[int]],
-         residual: nn.Module
-     ):
-         i, k, u = state
-         channels = upsample_initial_channel // (2 ** (i + 1))
-         return nn.ModuleDict(
-             dict(
-                 up=nn.Sequential(
-                     self.activation,
-                     weight_norm(
-                         nn.ConvTranspose1d(
-                             upsample_initial_channel // (2**i),
-                             channels,
-                             k,
-                             u,
-                             padding=(k - u) // 2,
-                         )
-                     ).apply(self.init_weights),
-                 ),
-                 residual=ResBlocks1D(
-                     channels,
-                     resblock_kernel_sizes,
-                     resblock_dilation_sizes,
-                     self.activation,
-                     residual
-                 ),
-             )
-         )
-
-     def forward(self, x):
-         x = self.conv_pre(x)
-         for block in self.blocks:
-             x = block["up"](x)
-             x = block["residual"](x)
-
-         x = self.conv_post(self.activation(self.reflection_pad(x)))
-         spec = torch.exp(x[:, : self.post_n_fft, :])
-         phase = torch.sin(x[:, self.post_n_fft :, :])
-         return spec, phase
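
Note that the removed generator stops at (spec, phase) rather than a waveform: spec is an exponentiated magnitude and phase a sin-squashed phase map, each carrying n_fft // 2 + 1 bins. In iSTFTNet-style pipelines the pair is usually recombined into a complex spectrogram and inverted outside the model; the sketch below assumes that convention, and the hop of n_fft // 4 is an illustrative choice, not a value read from lt-tensor:

import torch

def istft_synthesis(spec: torch.Tensor, phase: torch.Tensor, n_fft: int = 16) -> torch.Tensor:
    # spec, phase: (B, n_fft // 2 + 1, frames), as returned by iSTFTGenerator.forward.
    window = torch.hann_window(n_fft, device=spec.device)
    complex_spec = torch.polar(spec, phase)  # magnitude * exp(j * phase)
    return torch.istft(
        complex_spec, n_fft=n_fft, hop_length=n_fft // 4, win_length=n_fft, window=window
    )  # -> (B, samples)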
@@ -1,142 +0,0 @@
- import torch
- import torch.nn as nn
- import math
- from einops import repeat
-
-
- class SineGen(nn.Module):
-     def __init__(
-         self,
-         samp_rate,
-         upsample_scale,
-         harmonic_num=0,
-         sine_amp=0.1,
-         noise_std=0.003,
-         voiced_threshold=0,
-         flag_for_pulse=False,
-     ):
-         super().__init__()
-         self.sampling_rate = samp_rate
-         self.upsample_scale = upsample_scale
-         self.harmonic_num = harmonic_num
-         self.sine_amp = sine_amp
-         self.noise_std = noise_std
-         self.voiced_threshold = voiced_threshold
-         self.flag_for_pulse = flag_for_pulse
-         self.dim = self.harmonic_num + 1  # fundamental + harmonics
-
-     def _f02uv_b(self, f0):
-         return (f0 > self.voiced_threshold).float()  # [B, T]
-
-     def _f02uv(self, f0):
-         return (f0 > self.voiced_threshold).float().unsqueeze(-1)  # -> (B, T, 1)
-
-     @torch.no_grad()
-     def _f02sine(self, f0_values):
-         """
-         f0_values: (B, T, 1)
-         Output: sine waves (B, T * upsample, dim)
-         """
-         B, T, _ = f0_values.size()
-         f0_upsampled = repeat(
-             f0_values, "b t d -> b (t r) d", r=self.upsample_scale
-         )  # (B, T_up, 1)
-
-         # Create harmonics
-         harmonics = (
-             torch.arange(1, self.dim + 1, device=f0_values.device)
-             .float()
-             .view(1, 1, -1)
-         )
-         f0_harm = f0_upsampled * harmonics  # (B, T_up, dim)
-
-         # Convert Hz to radians (2πf/sr), then integrate to get phase
-         rad_values = f0_harm / self.sampling_rate  # normalized freq
-         rad_values = rad_values % 1.0  # remove multiples of 2π
-
-         # Random initial phase for each harmonic (except 0th if pulse mode)
-         if self.flag_for_pulse:
-             rand_ini = torch.zeros((B, 1, self.dim), device=f0_values.device)
-         else:
-             rand_ini = torch.rand((B, 1, self.dim), device=f0_values.device)
-
-         rand_ini = rand_ini * 2 * math.pi
-
-         # Compute cumulative phase
-         rad_values = rad_values * 2 * math.pi
-         phase = torch.cumsum(rad_values, dim=1) + rand_ini  # (B, T_up, dim)
-
-         sine_waves = torch.sin(phase)  # (B, T_up, dim)
-         return sine_waves
-
-     def _forward(self, f0):
-         """
-         f0: (B, T, 1)
-         returns: sine signal with harmonics and noise added
-         """
-         sine_waves = self._f02sine(f0)  # (B, T_up, dim)
-         uv = self._f02uv_b(f0)  # (B, T, 1)
-         uv = repeat(uv, "b t d -> b (t r) d", r=self.upsample_scale)  # (B, T_up, 1)
-
-         # voiced sine + unvoiced noise
-         sine_signal = self.sine_amp * sine_waves * uv  # (B, T_up, dim)
-         noise = torch.randn_like(sine_signal) * self.noise_std
-         output = sine_signal + noise * (1.0 - uv)  # noise added only on unvoiced
-
-         return output  # (B, T_up, dim)
-
-     def forward(self, f0):
-         """
-         Args:
-             f0: (B, T) in Hz (before upsampling)
-         Returns:
-             sine_waves: (B, T_up, dim)
-             uv: (B, T_up, 1)
-             noise: (B, T_up, 1)
-         """
-         B, T = f0.shape
-         device = f0.device
-
-         # Get uv mask (before upsampling)
-         uv = self._f02uv(f0)  # (B, T, 1)
-
-         # Expand f0 to include harmonics: (B, T, dim)
-         f0 = f0.unsqueeze(-1)  # (B, T, 1)
-         harmonics = (
-             torch.arange(1, self.dim + 1, device=device).float().view(1, 1, -1)
-         )  # (1, 1, dim)
-         f0_harm = f0 * harmonics  # (B, T, dim)
-
-         # Upsample
-         f0_harm_up = repeat(
-             f0_harm, "b t d -> b (t r) d", r=self.upsample_scale
-         )  # (B, T_up, dim)
-         uv_up = repeat(uv, "b t d -> b (t r) d", r=self.upsample_scale)  # (B, T_up, 1)
-
-         # Convert to radians
-         rad_per_sample = f0_harm_up / self.sampling_rate  # Hz → cycles/sample
-         rad_per_sample = rad_per_sample * 2 * math.pi  # cycles → radians/sample
-
-         # Random phase init for each sample
-         B, T_up, D = rad_per_sample.shape
-         rand_phase = torch.rand(B, D, device=device) * 2 * math.pi  # (B, D)
-
-         # Compute cumulative phase
-         phase = torch.cumsum(rad_per_sample, dim=1) + rand_phase.unsqueeze(
-             1
-         )  # (B, T_up, D)
-
-         # Apply sine
-         sine_waves = torch.sin(phase) * self.sine_amp  # (B, T_up, D)
-
-         # Handle unvoiced: create noise only for fundamental
-         noise = torch.randn(B, T_up, 1, device=device) * self.noise_std
-         if self.flag_for_pulse:
-             # If pulse mode is on, align phase at start of voiced segments
-             # Optional and tricky to implement — may require segmenting uv
-             pass
-
-         # Replace sine by noise for unvoiced (only on fundamental)
-         sine_waves[:, :, 0:1] = sine_waves[:, :, 0:1] * uv_up + noise * (1 - uv_up)
-
-         return sine_waves, uv_up, noise
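
To make the tensor shapes concrete, a short usage sketch of the removed SineGen; the sample rate and upsample scale mirror the generator defaults above but are illustrative, not read from a config:

import torch

# harmonic_num=8 gives dim = 9 channels: the fundamental plus 8 harmonics.
sine_gen = SineGen(samp_rate=22050, upsample_scale=256, harmonic_num=8)
f0 = torch.full((1, 50), 220.0)  # (B, T) frame-level F0 in Hz
f0[:, 25:] = 0.0                 # mark the second half unvoiced
sine, uv, noise = sine_gen(f0)   # (1, 12800, 9), (1, 12800, 1), (1, 12800, 1)
# Voiced frames carry sine harmonics at amplitude sine_amp; on unvoiced frames
# the fundamental channel is replaced by low-level Gaussian noise.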