lt-tensor 0.0.1a11__py3-none-any.whl → 0.0.1a13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,15 +1,55 @@
1
- from ..torch_commons import *
2
- from ..model_base import Model
3
- from .rsd import ResBlocks
4
- from ..misc_utils import log_tensor
5
-
1
+ __all__ = ["iSTFTGenerator", "ResBlocks"]
2
+ import gc
3
+ import math
4
+ import itertools
5
+ from lt_utils.common import *
6
+ from lt_tensor.torch_commons import *
7
+ from lt_tensor.model_base import Model
8
+ from lt_tensor.misc_utils import log_tensor
9
+ from lt_tensor.model_zoo.residual import ResBlock1D, ConvNets, get_weight_norm
10
+ from lt_utils.misc_utils import log_traceback
11
+ from lt_tensor.processors import AudioProcessor
12
+ from lt_utils.type_utils import is_dir, is_pathlike
13
+ from lt_tensor.misc_utils import set_seed, clear_cache
14
+ from lt_tensor.model_zoo.discriminator import MultiPeriodDiscriminator, MultiScaleDiscriminator
6
15
  import torch.nn.functional as F
16
+ from lt_tensor.config_templates import updateDict, ModelConfig
7
17
 
8
18
 
9
- class Generator(Model):
10
- """Based on the adaptation made by from Rishikesh
11
- A Generator for audio processing, can be usd for tother things."""
19
class ResBlocks(ConvNets):
    """Parallel bank of ``ResBlock1D`` branches whose outputs are averaged.

    One ``ResBlock1D`` is built per ``(kernel_size, dilation)`` pair obtained
    by zipping ``resblock_kernel_sizes`` with ``resblock_dilation_sizes``.
    Every branch receives the same input; the branch outputs are summed,
    divided by the number of branches and passed through ``activation``.

    Args:
        channels: Forwarded to every ``ResBlock1D`` as its first argument.
        resblock_kernel_sizes: One entry per branch.
        resblock_dilation_sizes: One entry per branch, paired positionally
            with ``resblock_kernel_sizes``.
        activation: Module handed to every ``ResBlock1D`` and also applied to
            the averaged output.
    """

    def __init__(
        self,
        channels: int,
        # NOTE: the list defaults are shared between calls; they are only
        # read here and must never be mutated.
        resblock_kernel_sizes: List[Union[int, List[int]]] = [3, 7, 11],
        resblock_dilation_sizes: List[Union[int, List[int]]] = [
            [1, 3, 5],
            [1, 3, 5],
            [1, 3, 5],
        ],
        activation: nn.Module = nn.LeakyReLU(0.1),
    ):
        super().__init__()
        # Kept for backward compatibility; `forward` relies on the number of
        # branches that were actually built instead.
        self.num_kernels = len(resblock_kernel_sizes)
        self.rb = nn.ModuleList(
            [
                ResBlock1D(channels, kernel, dilation, activation)
                for kernel, dilation in zip(
                    resblock_kernel_sizes, resblock_dilation_sizes
                )
            ]
        )
        self.activation = activation

        self.rb.apply(self.init_weights)

    def forward(self, x: torch.Tensor):
        """Return ``activation(mean(branch(x) for branch in self.rb))``."""
        total = None
        for block in self.rb:
            branch = block(x)
            # Out-of-place accumulation: never overwrite a branch output that
            # autograd may still need for its backward pass.
            total = branch if total is None else total + branch
        # `zip` truncates to the shorter configuration list, so divide by the
        # real branch count rather than by len(resblock_kernel_sizes).
        return self.activation(total / len(self.rb))
50
+
12
51
 
52
+ class iSTFTGenerator(ConvNets):
13
53
  def __init__(
14
54
  self,
15
55
  in_channels: int = 80,
@@ -24,10 +64,12 @@ class Generator(Model):
24
64
  ],
25
65
  n_fft: int = 16,
26
66
  activation: nn.Module = nn.LeakyReLU(0.1),
67
+ hop_length: int = 256,
27
68
  ):
28
69
  super().__init__()
29
70
  self.num_kernels = len(resblock_kernel_sizes)
30
71
  self.num_upsamples = len(upsample_rates)
72
+ self.hop_length = hop_length
31
73
  self.conv_pre = weight_norm(
32
74
  nn.Conv1d(in_channels, upsample_initial_channel, 7, 1, padding=3)
33
75
  )
@@ -47,7 +89,20 @@ class Generator(Model):
47
89
  self.post_n_fft = n_fft // 2 + 1
48
90
  self.conv_post = weight_norm(nn.Conv1d(ch, n_fft + 2, 7, 1, padding=3))
49
91
  self.conv_post.apply(self.init_weights)
50
- self.reflection_pad = torch.nn.ReflectionPad1d((1, 0))
92
+ self.reflection_pad = nn.ReflectionPad1d((1, 0))
93
+
94
+ self.phase = nn.Sequential(
95
+ nn.LeakyReLU(0.2),
96
+ nn.Conv1d(self.post_n_fft, self.post_n_fft, kernel_size=3, padding=1),
97
+ nn.LeakyReLU(0.2),
98
+ nn.Conv1d(self.post_n_fft, self.post_n_fft, kernel_size=3, padding=1),
99
+ )
100
+ self.spec = nn.Sequential(
101
+ nn.LeakyReLU(0.2),
102
+ nn.Conv1d(self.post_n_fft, self.post_n_fft, kernel_size=3, padding=1),
103
+ nn.LeakyReLU(0.2),
104
+ nn.Conv1d(self.post_n_fft, self.post_n_fft, kernel_size=3, padding=1),
105
+ )
51
106
 
52
107
  def _make_blocks(
53
108
  self,
@@ -70,7 +125,7 @@ class Generator(Model):
70
125
  u,
71
126
  padding=(k - u) // 2,
72
127
  )
73
- ),
128
+ ).apply(self.init_weights),
74
129
  ),
75
130
  residual=ResBlocks(
76
131
  channels,
@@ -89,20 +144,7 @@ class Generator(Model):
89
144
 
90
145
  x = self.reflection_pad(x)
91
146
  x = self.conv_post(x)
92
- spec = torch.exp(x[:, : self.post_n_fft, :])
93
- phase = torch.sin(x[:, self.post_n_fft :, :])
147
+ spec = torch.exp(self.spec(x[:, : self.post_n_fft, :]))
148
+ phase = torch.sin(self.phase(x[:, self.post_n_fft :, :]))
94
149
 
95
150
  return spec, phase
96
-
97
- def remove_weight_norm(self):
98
- for module in self.modules():
99
- try:
100
- remove_weight_norm(module)
101
- except ValueError:
102
- pass # Not normed, skip
103
-
104
- @staticmethod
105
- def init_weights(m, mean=0.0, std=0.01):
106
- classname = m.__class__.__name__
107
- if "Conv" in classname:
108
- m.weight.data.normal_(mean, std)
@@ -0,0 +1,142 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import math
4
+ from einops import repeat
5
+
6
+
7
class SineGen(nn.Module):
    """Generate sine waves (fundamental plus harmonics) from an F0 contour.

    Args:
        samp_rate: Sampling rate used to turn Hz into cycles per sample.
        upsample_scale: Integer number of output samples produced per F0 frame.
        harmonic_num: Number of harmonics generated on top of the fundamental.
        sine_amp: Amplitude applied to the sine waves.
        noise_std: Standard deviation of the Gaussian noise.
        voiced_threshold: Frames with ``f0 > voiced_threshold`` count as voiced.
        flag_for_pulse: When true, ``_f02sine`` starts every harmonic at phase
            zero instead of a random phase. ``forward`` currently does not
            consult this flag (pulse alignment is not implemented there).
    """

    def __init__(
        self,
        samp_rate,
        upsample_scale,
        harmonic_num=0,
        sine_amp=0.1,
        noise_std=0.003,
        voiced_threshold=0,
        flag_for_pulse=False,
    ):
        super().__init__()
        self.sampling_rate = samp_rate
        self.upsample_scale = upsample_scale
        self.harmonic_num = harmonic_num
        self.sine_amp = sine_amp
        self.noise_std = noise_std
        self.voiced_threshold = voiced_threshold
        self.flag_for_pulse = flag_for_pulse
        self.dim = self.harmonic_num + 1  # fundamental + harmonics

    def _upsample(self, x):
        """Repeat each frame of ``x`` (B, T, D) ``upsample_scale`` times along T."""
        # Same result as einops' "b t d -> b (t r) d": copies of a frame are
        # consecutive in the output.
        return torch.repeat_interleave(x, self.upsample_scale, dim=1)

    def _harmonic_multipliers(self, device):
        """Return the multipliers 1..dim as a float tensor of shape (1, 1, dim)."""
        return torch.arange(1, self.dim + 1, device=device).float().view(1, 1, -1)

    def _f02uv_b(self, f0):
        """Voiced mask (1.0 voiced, 0.0 unvoiced) with the same shape as ``f0``."""
        return (f0 > self.voiced_threshold).float()

    def _f02uv(self, f0):
        """Voiced mask with a trailing singleton axis: (B, T) -> (B, T, 1)."""
        return (f0 > self.voiced_threshold).float().unsqueeze(-1)

    @torch.no_grad()
    def _f02sine(self, f0_values):
        """Turn an F0 contour into unit-amplitude sine waves.

        Args:
            f0_values: Tensor of shape (B, T, 1).

        Returns:
            Tensor of shape (B, T * upsample_scale, dim).
        """
        batch, _, _ = f0_values.size()  # also enforces a 3-D input
        device = f0_values.device

        # (B, T_up, 1) * (1, 1, dim) -> (B, T_up, dim)
        f0_harm = self._upsample(f0_values) * self._harmonic_multipliers(device)

        # Hz -> cycles per sample, keeping only the fractional part.
        cycles = (f0_harm / self.sampling_rate) % 1.0

        # Initial phase: zero for every harmonic in pulse mode, random otherwise.
        if self.flag_for_pulse:
            initial = torch.zeros((batch, 1, self.dim), device=device)
        else:
            initial = torch.rand((batch, 1, self.dim), device=device)
        initial = initial * 2 * math.pi

        # Integrate the per-sample phase increment along time.
        phase = torch.cumsum(cycles * 2 * math.pi, dim=1) + initial
        return torch.sin(phase)

    def _forward(self, f0):
        """Sine waves on voiced samples, Gaussian noise on unvoiced samples.

        Args:
            f0: Tensor of shape (B, T, 1).

        Returns:
            Tensor of shape (B, T * upsample_scale, dim).
        """
        sine_waves = self._f02sine(f0)  # (B, T_up, dim)
        uv = self._upsample(self._f02uv_b(f0))  # (B, T_up, 1)

        sine_signal = self.sine_amp * sine_waves * uv
        noise = torch.randn_like(sine_signal) * self.noise_std
        # Noise contributes only where the mask marks the sample as unvoiced.
        return sine_signal + noise * (1.0 - uv)

    def forward(self, f0):
        """Generate harmonic sine waves with noise replacing the unvoiced fundamental.

        Args:
            f0: Tensor of shape (B, T), in Hz, before upsampling.

        Returns:
            Tuple ``(sine_waves, uv, noise)`` with shapes (B, T_up, dim),
            (B, T_up, 1) and (B, T_up, 1), where T_up = T * upsample_scale.

        Raises:
            ValueError: If ``f0`` is not 2-D.
        """
        if f0.dim() != 2:
            raise ValueError(
                f"f0 must have shape (B, T), got {tuple(f0.shape)}"
            )
        device = f0.device

        # Voiced mask, upsampled to the output rate: (B, T_up, 1)
        uv_up = self._upsample(self._f02uv(f0))

        # Fundamental and harmonics, upsampled: (B, T_up, dim)
        f0_harm_up = self._upsample(
            f0.unsqueeze(-1) * self._harmonic_multipliers(device)
        )

        # Hz -> cycles/sample -> radians/sample
        rad_per_sample = f0_harm_up / self.sampling_rate * 2 * math.pi

        # One random initial phase per (batch item, harmonic).
        batch, t_up, dim = rad_per_sample.shape
        rand_phase = torch.rand(batch, dim, device=device) * 2 * math.pi

        phase = torch.cumsum(rad_per_sample, dim=1) + rand_phase.unsqueeze(1)
        sine_waves = torch.sin(phase) * self.sine_amp  # (B, T_up, dim)

        noise = torch.randn(batch, t_up, 1, device=device) * self.noise_std

        # NOTE: `flag_for_pulse` is intentionally not handled here; aligning
        # the phase at the start of voiced segments is not implemented.

        # Only the fundamental (channel 0) is swapped for noise on unvoiced
        # samples; the higher harmonics are left untouched.
        sine_waves[:, :, 0:1] = sine_waves[:, :, 0:1] * uv_up + noise * (1 - uv_up)

        return sine_waves, uv_up, noise