minicpmo-utils 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148)
  1. cosyvoice/__init__.py +17 -0
  2. cosyvoice/bin/average_model.py +93 -0
  3. cosyvoice/bin/export_jit.py +103 -0
  4. cosyvoice/bin/export_onnx.py +120 -0
  5. cosyvoice/bin/inference_deprecated.py +126 -0
  6. cosyvoice/bin/train.py +195 -0
  7. cosyvoice/cli/__init__.py +0 -0
  8. cosyvoice/cli/cosyvoice.py +209 -0
  9. cosyvoice/cli/frontend.py +238 -0
  10. cosyvoice/cli/model.py +386 -0
  11. cosyvoice/dataset/__init__.py +0 -0
  12. cosyvoice/dataset/dataset.py +151 -0
  13. cosyvoice/dataset/processor.py +434 -0
  14. cosyvoice/flow/decoder.py +494 -0
  15. cosyvoice/flow/flow.py +281 -0
  16. cosyvoice/flow/flow_matching.py +227 -0
  17. cosyvoice/flow/length_regulator.py +70 -0
  18. cosyvoice/hifigan/discriminator.py +230 -0
  19. cosyvoice/hifigan/f0_predictor.py +58 -0
  20. cosyvoice/hifigan/generator.py +582 -0
  21. cosyvoice/hifigan/hifigan.py +67 -0
  22. cosyvoice/llm/llm.py +610 -0
  23. cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +58836 -0
  24. cosyvoice/tokenizer/tokenizer.py +279 -0
  25. cosyvoice/transformer/__init__.py +0 -0
  26. cosyvoice/transformer/activation.py +84 -0
  27. cosyvoice/transformer/attention.py +330 -0
  28. cosyvoice/transformer/convolution.py +145 -0
  29. cosyvoice/transformer/decoder.py +396 -0
  30. cosyvoice/transformer/decoder_layer.py +132 -0
  31. cosyvoice/transformer/embedding.py +302 -0
  32. cosyvoice/transformer/encoder.py +474 -0
  33. cosyvoice/transformer/encoder_layer.py +236 -0
  34. cosyvoice/transformer/label_smoothing_loss.py +96 -0
  35. cosyvoice/transformer/positionwise_feed_forward.py +115 -0
  36. cosyvoice/transformer/subsampling.py +383 -0
  37. cosyvoice/transformer/upsample_encoder.py +320 -0
  38. cosyvoice/utils/__init__.py +0 -0
  39. cosyvoice/utils/class_utils.py +83 -0
  40. cosyvoice/utils/common.py +186 -0
  41. cosyvoice/utils/executor.py +176 -0
  42. cosyvoice/utils/file_utils.py +129 -0
  43. cosyvoice/utils/frontend_utils.py +136 -0
  44. cosyvoice/utils/losses.py +57 -0
  45. cosyvoice/utils/mask.py +265 -0
  46. cosyvoice/utils/scheduler.py +738 -0
  47. cosyvoice/utils/train_utils.py +367 -0
  48. cosyvoice/vllm/cosyvoice2.py +103 -0
  49. matcha/__init__.py +0 -0
  50. matcha/app.py +357 -0
  51. matcha/cli.py +418 -0
  52. matcha/hifigan/__init__.py +0 -0
  53. matcha/hifigan/config.py +28 -0
  54. matcha/hifigan/denoiser.py +64 -0
  55. matcha/hifigan/env.py +17 -0
  56. matcha/hifigan/meldataset.py +217 -0
  57. matcha/hifigan/models.py +368 -0
  58. matcha/hifigan/xutils.py +60 -0
  59. matcha/models/__init__.py +0 -0
  60. matcha/models/baselightningmodule.py +209 -0
  61. matcha/models/components/__init__.py +0 -0
  62. matcha/models/components/decoder.py +443 -0
  63. matcha/models/components/flow_matching.py +132 -0
  64. matcha/models/components/text_encoder.py +410 -0
  65. matcha/models/components/transformer.py +316 -0
  66. matcha/models/matcha_tts.py +239 -0
  67. matcha/onnx/__init__.py +0 -0
  68. matcha/onnx/export.py +181 -0
  69. matcha/onnx/infer.py +168 -0
  70. matcha/text/__init__.py +53 -0
  71. matcha/text/cleaners.py +116 -0
  72. matcha/text/numbers.py +71 -0
  73. matcha/text/symbols.py +17 -0
  74. matcha/train.py +122 -0
  75. matcha/utils/__init__.py +5 -0
  76. matcha/utils/audio.py +82 -0
  77. matcha/utils/generate_data_statistics.py +111 -0
  78. matcha/utils/instantiators.py +56 -0
  79. matcha/utils/logging_utils.py +53 -0
  80. matcha/utils/model.py +90 -0
  81. matcha/utils/monotonic_align/__init__.py +22 -0
  82. matcha/utils/monotonic_align/setup.py +7 -0
  83. matcha/utils/pylogger.py +21 -0
  84. matcha/utils/rich_utils.py +101 -0
  85. matcha/utils/utils.py +219 -0
  86. minicpmo/__init__.py +24 -0
  87. minicpmo/utils.py +636 -0
  88. minicpmo/version.py +2 -0
  89. minicpmo_utils-0.1.0.dist-info/METADATA +72 -0
  90. minicpmo_utils-0.1.0.dist-info/RECORD +148 -0
  91. minicpmo_utils-0.1.0.dist-info/WHEEL +5 -0
  92. minicpmo_utils-0.1.0.dist-info/top_level.txt +5 -0
  93. s3tokenizer/__init__.py +153 -0
  94. s3tokenizer/assets/BAC009S0764W0121.wav +0 -0
  95. s3tokenizer/assets/BAC009S0764W0122.wav +0 -0
  96. s3tokenizer/assets/mel_filters.npz +0 -0
  97. s3tokenizer/cli.py +183 -0
  98. s3tokenizer/model.py +546 -0
  99. s3tokenizer/model_v2.py +605 -0
  100. s3tokenizer/utils.py +390 -0
  101. stepaudio2/__init__.py +40 -0
  102. stepaudio2/cosyvoice2/__init__.py +1 -0
  103. stepaudio2/cosyvoice2/flow/__init__.py +0 -0
  104. stepaudio2/cosyvoice2/flow/decoder_dit.py +585 -0
  105. stepaudio2/cosyvoice2/flow/flow.py +230 -0
  106. stepaudio2/cosyvoice2/flow/flow_matching.py +205 -0
  107. stepaudio2/cosyvoice2/transformer/__init__.py +0 -0
  108. stepaudio2/cosyvoice2/transformer/attention.py +328 -0
  109. stepaudio2/cosyvoice2/transformer/embedding.py +119 -0
  110. stepaudio2/cosyvoice2/transformer/encoder_layer.py +163 -0
  111. stepaudio2/cosyvoice2/transformer/positionwise_feed_forward.py +56 -0
  112. stepaudio2/cosyvoice2/transformer/subsampling.py +79 -0
  113. stepaudio2/cosyvoice2/transformer/upsample_encoder_v2.py +483 -0
  114. stepaudio2/cosyvoice2/utils/__init__.py +1 -0
  115. stepaudio2/cosyvoice2/utils/class_utils.py +41 -0
  116. stepaudio2/cosyvoice2/utils/common.py +101 -0
  117. stepaudio2/cosyvoice2/utils/mask.py +49 -0
  118. stepaudio2/flashcosyvoice/__init__.py +0 -0
  119. stepaudio2/flashcosyvoice/cli.py +424 -0
  120. stepaudio2/flashcosyvoice/config.py +80 -0
  121. stepaudio2/flashcosyvoice/cosyvoice2.py +160 -0
  122. stepaudio2/flashcosyvoice/cosyvoice3.py +1 -0
  123. stepaudio2/flashcosyvoice/engine/__init__.py +0 -0
  124. stepaudio2/flashcosyvoice/engine/block_manager.py +114 -0
  125. stepaudio2/flashcosyvoice/engine/llm_engine.py +125 -0
  126. stepaudio2/flashcosyvoice/engine/model_runner.py +310 -0
  127. stepaudio2/flashcosyvoice/engine/scheduler.py +77 -0
  128. stepaudio2/flashcosyvoice/engine/sequence.py +90 -0
  129. stepaudio2/flashcosyvoice/modules/__init__.py +0 -0
  130. stepaudio2/flashcosyvoice/modules/flow.py +198 -0
  131. stepaudio2/flashcosyvoice/modules/flow_components/__init__.py +0 -0
  132. stepaudio2/flashcosyvoice/modules/flow_components/estimator.py +974 -0
  133. stepaudio2/flashcosyvoice/modules/flow_components/upsample_encoder.py +998 -0
  134. stepaudio2/flashcosyvoice/modules/hifigan.py +249 -0
  135. stepaudio2/flashcosyvoice/modules/hifigan_components/__init__.py +0 -0
  136. stepaudio2/flashcosyvoice/modules/hifigan_components/layers.py +433 -0
  137. stepaudio2/flashcosyvoice/modules/qwen2.py +92 -0
  138. stepaudio2/flashcosyvoice/modules/qwen2_components/__init__.py +0 -0
  139. stepaudio2/flashcosyvoice/modules/qwen2_components/layers.py +616 -0
  140. stepaudio2/flashcosyvoice/modules/sampler.py +231 -0
  141. stepaudio2/flashcosyvoice/utils/__init__.py +0 -0
  142. stepaudio2/flashcosyvoice/utils/audio.py +77 -0
  143. stepaudio2/flashcosyvoice/utils/context.py +28 -0
  144. stepaudio2/flashcosyvoice/utils/loader.py +116 -0
  145. stepaudio2/flashcosyvoice/utils/memory.py +19 -0
  146. stepaudio2/stepaudio2.py +204 -0
  147. stepaudio2/token2wav.py +248 -0
  148. stepaudio2/utils.py +91 -0
stepaudio2/flashcosyvoice/modules/hifigan.py
@@ -0,0 +1,249 @@
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Kai Hu)
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """HIFI-GAN"""
+
+ from typing import Dict, List, Tuple
+
+ import numpy as np
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from scipy.signal import get_window
+ from torch.nn import Conv1d, ConvTranspose1d
+ from torch.nn.utils import remove_weight_norm
+
+ try:
+     from torch.nn.utils.parametrizations import weight_norm
+ except ImportError:
+     from torch.nn.utils import weight_norm  # noqa
+
+ from stepaudio2.flashcosyvoice.modules.hifigan_components.layers import (
+     ResBlock, SourceModuleHnNSF, SourceModuleHnNSF2, init_weights)
+
+
+ class ConvRNNF0Predictor(nn.Module):
+     def __init__(self,
+                  num_class: int = 1,
+                  in_channels: int = 80,
+                  cond_channels: int = 512
+                  ):
+         super().__init__()
+
+         self.num_class = num_class
+         self.condnet = nn.Sequential(
+             weight_norm(  # noqa
+                 nn.Conv1d(in_channels, cond_channels, kernel_size=3, padding=1)
+             ),
+             nn.ELU(),
+             weight_norm(  # noqa
+                 nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
+             ),
+             nn.ELU(),
+             weight_norm(  # noqa
+                 nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
+             ),
+             nn.ELU(),
+             weight_norm(  # noqa
+                 nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
+             ),
+             nn.ELU(),
+             weight_norm(  # noqa
+                 nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
+             ),
+             nn.ELU(),
+         )
+         self.classifier = nn.Linear(in_features=cond_channels, out_features=self.num_class)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         x = self.condnet(x)
+         x = x.transpose(1, 2)
+         return torch.abs(self.classifier(x).squeeze(-1))
+
+
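Despite the name, the stack above is purely convolutional: five ELU-activated, padding-preserving convolutions map a mel spectrogram [B, in_channels, T] to per-frame features, and the final linear layer plus abs() yields a non-negative F0 value per frame, shape [B, T]. A minimal shape-check sketch, assuming the default constructor arguments (the random input is illustrative only):

import torch

predictor = ConvRNNF0Predictor()   # in_channels=80, cond_channels=512
mel = torch.randn(2, 80, 120)      # [batch, mel bins, frames] -- illustrative values
f0 = predictor(mel)                # [2, 120]
print(f0.shape, bool((f0 >= 0).all()))  # abs() guarantees non-negative F0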
+ class HiFTGenerator(nn.Module):
+     """
+     HiFTNet Generator: Neural Source Filter + ISTFTNet
+     https://arxiv.org/abs/2309.09493
+     """
+     def __init__(
+             self,
+             in_channels: int = 80,
+             base_channels: int = 512,
+             nb_harmonics: int = 8,
+             sampling_rate: int = 24000,
+             nsf_alpha: float = 0.1,
+             nsf_sigma: float = 0.003,
+             nsf_voiced_threshold: float = 10,
+             upsample_rates: List[int] = [8, 5, 3],  # noqa
+             upsample_kernel_sizes: List[int] = [16, 11, 7],  # noqa
+             istft_params: Dict[str, int] = {"n_fft": 16, "hop_len": 4},  # noqa
+             resblock_kernel_sizes: List[int] = [3, 7, 11],  # noqa
+             resblock_dilation_sizes: List[List[int]] = [[1, 3, 5], [1, 3, 5], [1, 3, 5]],  # noqa
+             source_resblock_kernel_sizes: List[int] = [7, 7, 11],  # noqa
+             source_resblock_dilation_sizes: List[List[int]] = [[1, 3, 5], [1, 3, 5], [1, 3, 5]],  # noqa
+             lrelu_slope: float = 0.1,
+             audio_limit: float = 0.99,
+             f0_predictor: torch.nn.Module = None,
+     ):
+         super(HiFTGenerator, self).__init__()
+
+         self.out_channels = 1
+         self.nb_harmonics = nb_harmonics
+         self.sampling_rate = sampling_rate
+         self.istft_params = istft_params
+         self.lrelu_slope = lrelu_slope
+         self.audio_limit = audio_limit
+
+         self.num_kernels = len(resblock_kernel_sizes)
+         self.num_upsamples = len(upsample_rates)
+         # NOTE: CosyVoice2 (22.05 kHz) uses the original SourceModuleHnNSF implementation;
+         # other sample rates use SourceModuleHnNSF2
+         this_SourceModuleHnNSF = SourceModuleHnNSF if self.sampling_rate == 22050 else SourceModuleHnNSF2
+         self.m_source = this_SourceModuleHnNSF(
+             sampling_rate=sampling_rate,
+             upsample_scale=np.prod(upsample_rates) * istft_params["hop_len"],
+             harmonic_num=nb_harmonics,
+             sine_amp=nsf_alpha,
+             add_noise_std=nsf_sigma,
+             voiced_threshod=nsf_voiced_threshold)
+         self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates) * istft_params["hop_len"])
+
+         self.conv_pre = weight_norm(  # noqa
+             Conv1d(in_channels, base_channels, 7, 1, padding=3)
+         )
+
+         # Up
+         self.ups = nn.ModuleList()
+         for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+             self.ups.append(
+                 weight_norm(  # noqa
+                     ConvTranspose1d(
+                         base_channels // (2**i),
+                         base_channels // (2**(i + 1)),
+                         k,
+                         u,
+                         padding=(k - u) // 2,
+                     )
+                 )
+             )
+
+         # Down
+         self.source_downs = nn.ModuleList()
+         self.source_resblocks = nn.ModuleList()
+         downsample_rates = [1] + upsample_rates[::-1][:-1]
+         downsample_cum_rates = np.cumprod(downsample_rates)
+         for i, (u, k, d) in enumerate(zip(downsample_cum_rates[::-1], source_resblock_kernel_sizes, source_resblock_dilation_sizes)):
+             if u == 1:
+                 self.source_downs.append(
+                     Conv1d(istft_params["n_fft"] + 2, base_channels // (2 ** (i + 1)), 1, 1)
+                 )
+             else:
+                 self.source_downs.append(
+                     Conv1d(istft_params["n_fft"] + 2, base_channels // (2 ** (i + 1)), u * 2, u, padding=(u // 2))
+                 )
+
+             self.source_resblocks.append(
+                 ResBlock(base_channels // (2 ** (i + 1)), k, d)
+             )
+
+         self.resblocks = nn.ModuleList()
+         for i in range(len(self.ups)):
+             ch = base_channels // (2**(i + 1))
+             for _, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
+                 self.resblocks.append(ResBlock(ch, k, d))
+
+         self.conv_post = weight_norm(Conv1d(ch, istft_params["n_fft"] + 2, 7, 1, padding=3))  # noqa
+         self.ups.apply(init_weights)
+         self.conv_post.apply(init_weights)
+         self.reflection_pad = nn.ReflectionPad1d((1, 0))
+         self.stft_window = torch.from_numpy(get_window("hann", istft_params["n_fft"], fftbins=True).astype(np.float32))
+         self.f0_predictor = ConvRNNF0Predictor() if f0_predictor is None else f0_predictor
+
+     def remove_weight_norm(self):
+         print('Removing weight norm...')
+         for up in self.ups:
+             remove_weight_norm(up)
+         for resblock in self.resblocks:
+             resblock.remove_weight_norm()
+         remove_weight_norm(self.conv_pre)
+         remove_weight_norm(self.conv_post)
+         self.m_source.remove_weight_norm()
+         for source_down in self.source_downs:
+             remove_weight_norm(source_down)
+         for source_resblock in self.source_resblocks:
+             source_resblock.remove_weight_norm()
+
+     def _stft(self, x):
+         spec = torch.stft(
+             x,
+             self.istft_params["n_fft"], self.istft_params["hop_len"], self.istft_params["n_fft"], window=self.stft_window.to(x.device),
+             return_complex=True)
+         spec = torch.view_as_real(spec)  # [B, F, TT, 2]
+         return spec[..., 0], spec[..., 1]
+
+     def _istft(self, magnitude, phase):
+         magnitude = torch.clip(magnitude, max=1e2)
+         real = magnitude * torch.cos(phase)
+         img = magnitude * torch.sin(phase)
+         inverse_transform = torch.istft(torch.complex(real, img), self.istft_params["n_fft"], self.istft_params["hop_len"],
+                                         self.istft_params["n_fft"], window=self.stft_window.to(magnitude.device))
+         return inverse_transform
+
+     def decode(self, x: torch.Tensor, s: torch.Tensor = torch.zeros(1, 1, 0)) -> torch.Tensor:
+         s_stft_real, s_stft_imag = self._stft(s.squeeze(1))
+         s_stft = torch.cat([s_stft_real, s_stft_imag], dim=1)
+
+         x = self.conv_pre(x)
+         for i in range(self.num_upsamples):
+             x = F.leaky_relu(x, self.lrelu_slope)
+             x = self.ups[i](x)
+
+             if i == self.num_upsamples - 1:
+                 x = self.reflection_pad(x)
+
+             # fusion
+             si = self.source_downs[i](s_stft)
+             si = self.source_resblocks[i](si)
+             x = x + si
+
+             xs = None
+             for j in range(self.num_kernels):
+                 if xs is None:
+                     xs = self.resblocks[i * self.num_kernels + j](x)
+                 else:
+                     xs += self.resblocks[i * self.num_kernels + j](x)
+             x = xs / self.num_kernels
+
+         x = F.leaky_relu(x)
+         x = self.conv_post(x)
+         # conv_post predicts n_fft + 2 channels: n_fft // 2 + 1 log-magnitudes and as many phases
+         magnitude = torch.exp(x[:, :self.istft_params["n_fft"] // 2 + 1, :])
+         phase = torch.sin(x[:, self.istft_params["n_fft"] // 2 + 1:, :])  # the sin() is strictly redundant: _istft maps phase through cos/sin anyway
+
+         x = self._istft(magnitude, phase)
+         x = torch.clamp(x, -self.audio_limit, self.audio_limit)
+         return x
+
+     @torch.inference_mode()
+     def forward(self, speech_feat: torch.Tensor, cache_source: torch.Tensor = torch.zeros(1, 1, 0)) -> Tuple[torch.Tensor, torch.Tensor]:
+         # mel->f0
+         f0 = self.f0_predictor(speech_feat)
+         # f0->source
+         s = self.f0_upsamp(f0[:, None]).transpose(1, 2)  # [B, T_samples, 1]
+         s, _, _ = self.m_source(s)
+         s = s.transpose(1, 2)
+         # use cache_source to avoid glitches at streaming chunk boundaries
+         if cache_source.shape[2] != 0:
+             s[:, :, :cache_source.shape[2]] = cache_source
+         generated_speech = self.decode(x=speech_feat, s=s)
+         return generated_speech, s
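Taken together, forward() turns a mel spectrogram into waveform samples: the F0 predictor and source module build a harmonic-plus-noise excitation at the sample rate, decode() fuses the excitation's STFT with the upsampled mel features, and _istft() synthesizes audio from the predicted magnitude and phase. With the defaults above, each mel frame yields prod(upsample_rates) * hop_len = 8 * 5 * 3 * 4 = 480 samples. A hedged end-to-end sketch with a random mel and untrained weights, shapes only (real use would load a trained checkpoint first):

import torch

hift = HiFTGenerator().eval()   # untrained weights -- illustrative shapes only
mel = torch.randn(1, 80, 50)    # [batch, mel bins, frames]
audio, source = hift(mel)       # forward() is wrapped in torch.inference_mode()
print(audio.shape)              # [1, 50 * 480] = [1, 24000], i.e. 1 s at 24 kHz
print(source.shape)             # [1, 1, 24000]: excitation, reusable as cache_source when streaming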
stepaudio2/flashcosyvoice/modules/hifigan_components/layers.py
@@ -0,0 +1,433 @@
+ from typing import List
+
+ import numpy as np
+ import torch
+ import torch.nn as nn
+ from torch.distributions.uniform import Uniform
+ from torch.nn import Conv1d
+ from torch.nn.utils import remove_weight_norm
+
+ try:
+     from torch.nn.utils.parametrizations import weight_norm
+ except ImportError:
+     from torch.nn.utils import weight_norm  # noqa
+
+
+ def get_padding(kernel_size, dilation=1):
+     return int((kernel_size * dilation - dilation) / 2)
+
+
+ def init_weights(m, mean=0.0, std=0.01):
+     classname = m.__class__.__name__
+     if classname.find("Conv") != -1:
+         m.weight.data.normal_(mean, std)
+
+
+ """hifigan-based generator implementation.
+
+ This code is modified from https://github.com/jik876/hifi-gan,
+ https://github.com/kan-bayashi/ParallelWaveGAN and
+ https://github.com/NVIDIA/BigVGAN
+ """
+
+
+ # Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license.
+ # LICENSE is in incl_licenses directory.
+ class Snake(nn.Module):
+     '''
+     Implementation of a sine-based periodic activation function
+     Shape:
+         - Input: (B, C, T)
+         - Output: (B, C, T), same shape as the input
+     Parameters:
+         - alpha - trainable parameter
+     References:
+         - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
+           https://arxiv.org/abs/2006.08195
+     Examples:
+         >>> a1 = Snake(256)
+         >>> x = torch.randn(2, 256, 50)  # (B, C, T)
+         >>> y = a1(x)
+
+     Args:
+         in_features: number of channels of the input
+         alpha: trainable parameter
+         alpha_trainable: whether alpha is trainable
+         alpha_logscale: whether to use log scale for alpha
+             alpha is initialized to 1 by default; higher values give higher frequency.
+             alpha will be trained along with the rest of your model.
+     '''
+     def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
+         super(Snake, self).__init__()
+         self.in_features = in_features
+
+         # initialize alpha
+         self.alpha_logscale = alpha_logscale
+         if self.alpha_logscale:  # log scale alphas initialized to zeros
+             self.alpha = nn.Parameter(torch.zeros(in_features) * alpha)
+         else:  # linear scale alphas initialized to ones
+             self.alpha = nn.Parameter(torch.ones(in_features) * alpha)
+
+         self.alpha.requires_grad = alpha_trainable
+
+         self.no_div_by_zero = 0.000000001
+
+     def forward(self, x):
+         '''
+         Forward pass of the function.
+         Applies the function to the input elementwise.
+         Snake := x + (1/a) * sin^2(x * a)
+         '''
+         alpha = self.alpha.unsqueeze(0).unsqueeze(-1)  # line up with x to [B, C, T]
+         if self.alpha_logscale:
+             alpha = torch.exp(alpha)
+         x = x + (1.0 / (alpha + self.no_div_by_zero)) * torch.pow(torch.sin(x * alpha), 2)
+
+         return x
+
+
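The formula Snake(x) = x + (1/alpha) * sin^2(alpha * x) admits a quick numeric check: with alpha = 1 and x = pi/2, sin^2(pi/2) = 1, so the output is x + 1. A minimal sketch, assuming the (B, C, T) layout documented above:

import math
import torch

act = Snake(in_features=4)              # one alpha per channel, initialized to 1
x = torch.full((1, 4, 3), math.pi / 2)  # [B, C, T]
y = act(x)
# with alpha = 1: y = x + sin(x)^2 = pi/2 + 1
print(torch.allclose(y, x + 1.0, atol=1e-6))  # True (up to the 1e-9 epsilon)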
+ class ResBlock(torch.nn.Module):
+     """Residual block module in HiFiGAN/BigVGAN."""
+     def __init__(
+         self,
+         channels: int = 512,
+         kernel_size: int = 3,
+         dilations: List[int] = [1, 3, 5],  # noqa
+     ):
+         super(ResBlock, self).__init__()
+         self.convs1 = nn.ModuleList()
+         self.convs2 = nn.ModuleList()
+
+         for dilation in dilations:
+             self.convs1.append(
+                 weight_norm(  # noqa
+                     Conv1d(
+                         channels,
+                         channels,
+                         kernel_size,
+                         1,
+                         dilation=dilation,
+                         padding=get_padding(kernel_size, dilation)
+                     )
+                 )
+             )
+             self.convs2.append(
+                 weight_norm(  # noqa
+                     Conv1d(
+                         channels,
+                         channels,
+                         kernel_size,
+                         1,
+                         dilation=1,
+                         padding=get_padding(kernel_size, 1)
+                     )
+                 )
+             )
+         self.convs1.apply(init_weights)
+         self.convs2.apply(init_weights)
+         self.activations1 = nn.ModuleList([
+             Snake(channels, alpha_logscale=False)
+             for _ in range(len(self.convs1))
+         ])
+         self.activations2 = nn.ModuleList([
+             Snake(channels, alpha_logscale=False)
+             for _ in range(len(self.convs2))
+         ])
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         for idx in range(len(self.convs1)):
+             xt = self.activations1[idx](x)
+             xt = self.convs1[idx](xt)
+             xt = self.activations2[idx](xt)
+             xt = self.convs2[idx](xt)
+             x = xt + x
+         return x
+
+     def remove_weight_norm(self):
+         for idx in range(len(self.convs1)):
+             remove_weight_norm(self.convs1[idx])
+             remove_weight_norm(self.convs2[idx])
+
+
+ class SineGen(torch.nn.Module):
154
+ """ Definition of sine generator
155
+ SineGen(samp_rate, harmonic_num = 0,
156
+ sine_amp = 0.1, noise_std = 0.003,
157
+ voiced_threshold = 0,
158
+ flag_for_pulse=False)
159
+ samp_rate: sampling rate in Hz
160
+ harmonic_num: number of harmonic overtones (default 0)
161
+ sine_amp: amplitude of sine-wavefrom (default 0.1)
162
+ noise_std: std of Gaussian noise (default 0.003)
163
+ voiced_thoreshold: F0 threshold for U/V classification (default 0)
164
+ flag_for_pulse: this SinGen is used inside PulseGen (default False)
165
+ Note: when flag_for_pulse is True, the first time step of a voiced
166
+ segment is always sin(np.pi) or cos(0)
167
+ """
168
+
169
+ def __init__(self, samp_rate, harmonic_num=0,
170
+ sine_amp=0.1, noise_std=0.003,
171
+ voiced_threshold=0):
172
+ super(SineGen, self).__init__()
173
+ self.sine_amp = sine_amp
174
+ self.noise_std = noise_std
175
+ self.harmonic_num = harmonic_num
176
+ self.sampling_rate = samp_rate
177
+ self.voiced_threshold = voiced_threshold
178
+
179
+ def _f02uv(self, f0):
180
+ # generate uv signal
181
+ uv = (f0 > self.voiced_threshold).type(torch.float32)
182
+ return uv
183
+
184
+ @torch.no_grad()
185
+ def forward(self, f0):
186
+ """
187
+ :param f0: [B, 1, sample_len], Hz
188
+ :return: [B, 1, sample_len]
189
+ """
190
+
191
+ F_mat = torch.zeros((f0.size(0), self.harmonic_num + 1, f0.size(-1))).to(f0.device)
192
+ for i in range(self.harmonic_num + 1):
193
+ F_mat[:, i: i + 1, :] = f0 * (i + 1) / self.sampling_rate
194
+
195
+ theta_mat = 2 * np.pi * (torch.cumsum(F_mat, dim=-1) % 1)
196
+ u_dist = Uniform(low=-np.pi, high=np.pi)
197
+ phase_vec = u_dist.sample(sample_shape=(f0.size(0), self.harmonic_num + 1, 1)).to(F_mat.device)
198
+ phase_vec[:, 0, :] = 0
199
+
200
+ # generate sine waveforms
201
+ sine_waves = self.sine_amp * torch.sin(theta_mat + phase_vec)
202
+
203
+ # generate uv signal
204
+ uv = self._f02uv(f0)
205
+
206
+ # noise: for unvoiced should be similar to sine_amp
207
+ # std = self.sine_amp/3 -> max value ~ self.sine_amp
208
+ # . for voiced regions is self.noise_std
209
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
210
+ noise = noise_amp * torch.randn_like(sine_waves)
211
+
212
+ # first: set the unvoiced part to 0 by uv
213
+ # then: additive noise
214
+ sine_waves = sine_waves * uv + noise
215
+ return sine_waves, uv, noise
216
+
217
+
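SineGen expects an F0 contour already at the audio sample rate, shaped [B, 1, sample_len], and returns the harmonic stack [B, harmonic_num + 1, sample_len] with a random initial phase per overtone (the fundamental's phase offset is pinned to 0). A hedged sketch generating one second of a constant 220 Hz excitation:

import torch

gen = SineGen(samp_rate=24000, harmonic_num=8)
f0 = torch.full((1, 1, 24000), 220.0)  # constant 220 Hz, already at sample rate
sine_waves, uv, noise = gen(f0)
print(sine_waves.shape)                # [1, 9, 24000]: fundamental + 8 harmonics
print(uv.min().item())                 # 1.0 here: every sample is voiced (f0 > threshold)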
+ class SourceModuleHnNSF(torch.nn.Module):
+     """ SourceModule for hn-nsf
+     SourceModuleHnNSF(sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
+                       add_noise_std=0.003, voiced_threshod=0)
+     sampling_rate: sampling rate in Hz
+     harmonic_num: number of harmonics above F0 (default: 0)
+     sine_amp: amplitude of sine source signal (default: 0.1)
+     add_noise_std: std of additive Gaussian noise (default: 0.003)
+         note that the amplitude of noise in unvoiced regions is decided
+         by sine_amp
+     voiced_threshod: threshold to set U/V given F0 (default: 0)
+     Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
+     F0_sampled (batchsize, length, 1)
+     Sine_source (batchsize, length, 1)
+     noise_source (batchsize, length, 1)
+     uv (batchsize, length, 1)
+     """
+
+     def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
+                  add_noise_std=0.003, voiced_threshod=0):
+         super(SourceModuleHnNSF, self).__init__()
+
+         self.sine_amp = sine_amp
+         self.noise_std = add_noise_std
+
+         # to produce sine waveforms
+         # (upsample_scale is accepted for interface parity with SourceModuleHnNSF2 but unused here)
+         self.l_sin_gen = SineGen(sampling_rate, harmonic_num,
+                                  sine_amp, add_noise_std, voiced_threshod)
+
+         # to merge source harmonics into a single excitation
+         self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
+         self.l_tanh = torch.nn.Tanh()
+
+     def forward(self, x):
+         """
+         Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
+         F0_sampled (batchsize, length, 1)
+         Sine_source (batchsize, length, 1)
+         noise_source (batchsize, length, 1)
+         """
+         # source for harmonic branch
+         with torch.no_grad():
+             sine_wavs, uv, _ = self.l_sin_gen(x.transpose(1, 2))
+             sine_wavs = sine_wavs.transpose(1, 2)
+             uv = uv.transpose(1, 2)
+         sine_merge = self.l_tanh(self.l_linear(sine_wavs))
+
+         # source for noise branch, in the same shape as uv
+         noise = torch.randn_like(uv) * self.sine_amp / 3
+         return sine_merge, noise, uv
+
+
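SourceModuleHnNSF works in the [batch, length, 1] layout (it transposes internally before calling SineGen), and the learned linear + tanh collapses the harmonic_num + 1 sine channels into a single excitation channel. A minimal sketch; the upsample_scale value below is illustrative only, since this 22.05 kHz path does not use it:

import torch

source = SourceModuleHnNSF(sampling_rate=22050, upsample_scale=256, harmonic_num=8)  # 256 is an illustrative, unused value
f0 = torch.full((1, 22050, 1), 110.0)  # sample-rate F0 track, [B, length, 1]
sine_merge, noise, uv = source(f0)
print(sine_merge.shape, noise.shape, uv.shape)
# torch.Size([1, 22050, 1]) for all three: one merged excitation channel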
+ class SineGen2(torch.nn.Module):
+     """ Definition of sine generator
+     SineGen2(samp_rate, upsample_scale, harmonic_num=0,
+              sine_amp=0.1, noise_std=0.003,
+              voiced_threshold=0,
+              flag_for_pulse=False)
+     samp_rate: sampling rate in Hz
+     upsample_scale: upsampling factor from the f0 frame rate to the sample rate
+     harmonic_num: number of harmonic overtones (default 0)
+     sine_amp: amplitude of sine waveform (default 0.1)
+     noise_std: std of Gaussian noise (default 0.003)
+     voiced_threshold: F0 threshold for U/V classification (default 0)
+     flag_for_pulse: this SineGen2 is used inside PulseGen (default False)
+     Note: when flag_for_pulse is True, the first time step of a voiced
+     segment is always sin(np.pi) or cos(0)
+     """
+
+     def __init__(self, samp_rate, upsample_scale, harmonic_num=0,
+                  sine_amp=0.1, noise_std=0.003,
+                  voiced_threshold=0,
+                  flag_for_pulse=False):
+         super(SineGen2, self).__init__()
+         self.sine_amp = sine_amp
+         self.noise_std = noise_std
+         self.harmonic_num = harmonic_num
+         self.dim = self.harmonic_num + 1
+         self.sampling_rate = samp_rate
+         self.voiced_threshold = voiced_threshold
+         self.flag_for_pulse = flag_for_pulse
+         self.upsample_scale = upsample_scale
+
+     def _f02uv(self, f0):
+         # generate uv signal
+         uv = (f0 > self.voiced_threshold).type(torch.float32)
+         return uv
+
+     def _f02sine(self, f0_values):
+         """ f0_values: (batchsize, length, dim)
+         where dim indicates fundamental tone and overtones
+         """
+         # convert to F0 in rad. The integer part n can be ignored
+         # because 2 * np.pi * n doesn't affect phase
+         rad_values = (f0_values / self.sampling_rate) % 1
+
+         # initial phase noise (no noise for fundamental component)
+         rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2], device=f0_values.device)
+         rand_ini[:, 0] = 0
+         rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
+
+         # instantaneous phase sine[t] = sin(2*pi * sum_{i=1}^{t} rad)
+         if not self.flag_for_pulse:
+             rad_values = torch.nn.functional.interpolate(rad_values.transpose(1, 2),
+                                                          scale_factor=1 / self.upsample_scale,
+                                                          mode="linear").transpose(1, 2)
+
+             phase = torch.cumsum(rad_values, dim=1) * 2 * np.pi
+             phase = torch.nn.functional.interpolate(phase.transpose(1, 2) * self.upsample_scale,
+                                                     scale_factor=self.upsample_scale, mode="linear").transpose(1, 2)
+             sines = torch.sin(phase)
+         else:
+             # If necessary, make sure that the first time step of every
+             # voiced segment is sin(pi) or cos(0).
+             # This is used for pulse-train generation.
+
+             # identify the last time step in unvoiced segments
+             uv = self._f02uv(f0_values)
+             uv_1 = torch.roll(uv, shifts=-1, dims=1)
+             uv_1[:, -1, :] = 1
+             u_loc = (uv < 1) * (uv_1 > 0)
+
+             # get the instantaneous phase
+             tmp_cumsum = torch.cumsum(rad_values, dim=1)
+             # each batch element needs to be processed separately
+             for idx in range(f0_values.shape[0]):
+                 temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :]
+                 temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :]
+                 # stores the accumulation of i.phase within
+                 # each voiced segment
+                 tmp_cumsum[idx, :, :] = 0
+                 tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum
+
+             # rad_values - tmp_cumsum: remove the accumulation of i.phase
+             # within the previous voiced segment
+             i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1)
+
+             # get the sines
+             sines = torch.cos(i_phase * 2 * np.pi)
+         return sines
+
+     def forward(self, f0):
+         """ sine_tensor, uv = forward(f0)
+         input F0: tensor(batchsize=1, length, dim=1)
+                   f0 for unvoiced steps should be 0
+         output sine_tensor: tensor(batchsize=1, length, dim)
+         output uv: tensor(batchsize=1, length, 1)
+         """
+         # fundamental component and harmonics: f0 * [1, 2, ..., harmonic_num + 1]
+         fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))
+
+         # generate sine waveforms
+         sine_waves = self._f02sine(fn) * self.sine_amp
+
+         # generate uv signal
+         uv = self._f02uv(f0)
+
+         # noise: for unvoiced regions the amplitude should be similar to sine_amp
+         #        (std = self.sine_amp/3 -> max value ~ self.sine_amp);
+         #        for voiced regions it is self.noise_std
+         noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
+         noise = noise_amp * torch.randn_like(sine_waves)
+
+         # first: set the unvoiced part to 0 by uv
+         # then: add noise
+         sine_waves = sine_waves * uv + noise
+         return sine_waves, uv, noise
+
+
+ class SourceModuleHnNSF2(torch.nn.Module):
+     """ SourceModule for hn-nsf
+     SourceModuleHnNSF2(sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
+                        add_noise_std=0.003, voiced_threshod=0)
+     sampling_rate: sampling rate in Hz
+     upsample_scale: upsampling factor from the f0 frame rate to the sample rate
+     harmonic_num: number of harmonics above F0 (default: 0)
+     sine_amp: amplitude of sine source signal (default: 0.1)
+     add_noise_std: std of additive Gaussian noise (default: 0.003)
+         note that the amplitude of noise in unvoiced regions is decided
+         by sine_amp
+     voiced_threshod: threshold to set U/V given F0 (default: 0)
+     Sine_source, noise_source = SourceModuleHnNSF2(F0_sampled)
+     F0_sampled (batchsize, length, 1)
+     Sine_source (batchsize, length, 1)
+     noise_source (batchsize, length, 1)
+     uv (batchsize, length, 1)
+     """
+
+     def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
+                  add_noise_std=0.003, voiced_threshod=0):
+         super(SourceModuleHnNSF2, self).__init__()
+
+         self.sine_amp = sine_amp
+         self.noise_std = add_noise_std
+
+         # to produce sine waveforms
+         self.l_sin_gen = SineGen2(sampling_rate, upsample_scale, harmonic_num,
+                                   sine_amp, add_noise_std, voiced_threshod)
+
+         # to merge source harmonics into a single excitation
+         self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
+         self.l_tanh = torch.nn.Tanh()
+
+     def forward(self, x):
+         """
+         Sine_source, noise_source = SourceModuleHnNSF2(F0_sampled)
+         F0_sampled (batchsize, length, 1)
+         Sine_source (batchsize, length, 1)
+         noise_source (batchsize, length, 1)
+         """
+         # source for harmonic branch
+         with torch.no_grad():
+             sine_wavs, uv, _ = self.l_sin_gen(x)
+         sine_merge = self.l_tanh(self.l_linear(sine_wavs))
+
+         # source for noise branch, in the same shape as uv
+         noise = torch.randn_like(uv) * self.sine_amp / 3
+         return sine_merge, noise, uv
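The practical difference from SourceModuleHnNSF is inside SineGen2: before accumulating phase it linearly downsamples the per-sample frequency values by 1/upsample_scale, cumsums at the lower rate, then interpolates the phase back up, which is why this variant needs upsample_scale. HiFTGenerator above picks it whenever sampling_rate != 22050. A hedged shape sketch matching the 24 kHz defaults, where upsample_scale = 8 * 5 * 3 * 4 = 480:

import torch

source = SourceModuleHnNSF2(sampling_rate=24000, upsample_scale=480, harmonic_num=8)
f0 = torch.full((1, 24000, 1), 220.0)  # sample-rate F0 track, [B, length, 1] -- illustrative values
sine_merge, noise, uv = source(f0)
print(sine_merge.shape, noise.shape, uv.shape)
# torch.Size([1, 24000, 1]) for all three, matching the SourceModuleHnNSF interface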