minicpmo-utils 0.1.0__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Files changed (148)
  1. cosyvoice/__init__.py +17 -0
  2. cosyvoice/bin/average_model.py +93 -0
  3. cosyvoice/bin/export_jit.py +103 -0
  4. cosyvoice/bin/export_onnx.py +120 -0
  5. cosyvoice/bin/inference_deprecated.py +126 -0
  6. cosyvoice/bin/train.py +195 -0
  7. cosyvoice/cli/__init__.py +0 -0
  8. cosyvoice/cli/cosyvoice.py +209 -0
  9. cosyvoice/cli/frontend.py +238 -0
  10. cosyvoice/cli/model.py +386 -0
  11. cosyvoice/dataset/__init__.py +0 -0
  12. cosyvoice/dataset/dataset.py +151 -0
  13. cosyvoice/dataset/processor.py +434 -0
  14. cosyvoice/flow/decoder.py +494 -0
  15. cosyvoice/flow/flow.py +281 -0
  16. cosyvoice/flow/flow_matching.py +227 -0
  17. cosyvoice/flow/length_regulator.py +70 -0
  18. cosyvoice/hifigan/discriminator.py +230 -0
  19. cosyvoice/hifigan/f0_predictor.py +58 -0
  20. cosyvoice/hifigan/generator.py +582 -0
  21. cosyvoice/hifigan/hifigan.py +67 -0
  22. cosyvoice/llm/llm.py +610 -0
  23. cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +58836 -0
  24. cosyvoice/tokenizer/tokenizer.py +279 -0
  25. cosyvoice/transformer/__init__.py +0 -0
  26. cosyvoice/transformer/activation.py +84 -0
  27. cosyvoice/transformer/attention.py +330 -0
  28. cosyvoice/transformer/convolution.py +145 -0
  29. cosyvoice/transformer/decoder.py +396 -0
  30. cosyvoice/transformer/decoder_layer.py +132 -0
  31. cosyvoice/transformer/embedding.py +302 -0
  32. cosyvoice/transformer/encoder.py +474 -0
  33. cosyvoice/transformer/encoder_layer.py +236 -0
  34. cosyvoice/transformer/label_smoothing_loss.py +96 -0
  35. cosyvoice/transformer/positionwise_feed_forward.py +115 -0
  36. cosyvoice/transformer/subsampling.py +383 -0
  37. cosyvoice/transformer/upsample_encoder.py +320 -0
  38. cosyvoice/utils/__init__.py +0 -0
  39. cosyvoice/utils/class_utils.py +83 -0
  40. cosyvoice/utils/common.py +186 -0
  41. cosyvoice/utils/executor.py +176 -0
  42. cosyvoice/utils/file_utils.py +129 -0
  43. cosyvoice/utils/frontend_utils.py +136 -0
  44. cosyvoice/utils/losses.py +57 -0
  45. cosyvoice/utils/mask.py +265 -0
  46. cosyvoice/utils/scheduler.py +738 -0
  47. cosyvoice/utils/train_utils.py +367 -0
  48. cosyvoice/vllm/cosyvoice2.py +103 -0
  49. matcha/__init__.py +0 -0
  50. matcha/app.py +357 -0
  51. matcha/cli.py +418 -0
  52. matcha/hifigan/__init__.py +0 -0
  53. matcha/hifigan/config.py +28 -0
  54. matcha/hifigan/denoiser.py +64 -0
  55. matcha/hifigan/env.py +17 -0
  56. matcha/hifigan/meldataset.py +217 -0
  57. matcha/hifigan/models.py +368 -0
  58. matcha/hifigan/xutils.py +60 -0
  59. matcha/models/__init__.py +0 -0
  60. matcha/models/baselightningmodule.py +209 -0
  61. matcha/models/components/__init__.py +0 -0
  62. matcha/models/components/decoder.py +443 -0
  63. matcha/models/components/flow_matching.py +132 -0
  64. matcha/models/components/text_encoder.py +410 -0
  65. matcha/models/components/transformer.py +316 -0
  66. matcha/models/matcha_tts.py +239 -0
  67. matcha/onnx/__init__.py +0 -0
  68. matcha/onnx/export.py +181 -0
  69. matcha/onnx/infer.py +168 -0
  70. matcha/text/__init__.py +53 -0
  71. matcha/text/cleaners.py +116 -0
  72. matcha/text/numbers.py +71 -0
  73. matcha/text/symbols.py +17 -0
  74. matcha/train.py +122 -0
  75. matcha/utils/__init__.py +5 -0
  76. matcha/utils/audio.py +82 -0
  77. matcha/utils/generate_data_statistics.py +111 -0
  78. matcha/utils/instantiators.py +56 -0
  79. matcha/utils/logging_utils.py +53 -0
  80. matcha/utils/model.py +90 -0
  81. matcha/utils/monotonic_align/__init__.py +22 -0
  82. matcha/utils/monotonic_align/setup.py +7 -0
  83. matcha/utils/pylogger.py +21 -0
  84. matcha/utils/rich_utils.py +101 -0
  85. matcha/utils/utils.py +219 -0
  86. minicpmo/__init__.py +24 -0
  87. minicpmo/utils.py +636 -0
  88. minicpmo/version.py +2 -0
  89. minicpmo_utils-0.1.0.dist-info/METADATA +72 -0
  90. minicpmo_utils-0.1.0.dist-info/RECORD +148 -0
  91. minicpmo_utils-0.1.0.dist-info/WHEEL +5 -0
  92. minicpmo_utils-0.1.0.dist-info/top_level.txt +5 -0
  93. s3tokenizer/__init__.py +153 -0
  94. s3tokenizer/assets/BAC009S0764W0121.wav +0 -0
  95. s3tokenizer/assets/BAC009S0764W0122.wav +0 -0
  96. s3tokenizer/assets/mel_filters.npz +0 -0
  97. s3tokenizer/cli.py +183 -0
  98. s3tokenizer/model.py +546 -0
  99. s3tokenizer/model_v2.py +605 -0
  100. s3tokenizer/utils.py +390 -0
  101. stepaudio2/__init__.py +40 -0
  102. stepaudio2/cosyvoice2/__init__.py +1 -0
  103. stepaudio2/cosyvoice2/flow/__init__.py +0 -0
  104. stepaudio2/cosyvoice2/flow/decoder_dit.py +585 -0
  105. stepaudio2/cosyvoice2/flow/flow.py +230 -0
  106. stepaudio2/cosyvoice2/flow/flow_matching.py +205 -0
  107. stepaudio2/cosyvoice2/transformer/__init__.py +0 -0
  108. stepaudio2/cosyvoice2/transformer/attention.py +328 -0
  109. stepaudio2/cosyvoice2/transformer/embedding.py +119 -0
  110. stepaudio2/cosyvoice2/transformer/encoder_layer.py +163 -0
  111. stepaudio2/cosyvoice2/transformer/positionwise_feed_forward.py +56 -0
  112. stepaudio2/cosyvoice2/transformer/subsampling.py +79 -0
  113. stepaudio2/cosyvoice2/transformer/upsample_encoder_v2.py +483 -0
  114. stepaudio2/cosyvoice2/utils/__init__.py +1 -0
  115. stepaudio2/cosyvoice2/utils/class_utils.py +41 -0
  116. stepaudio2/cosyvoice2/utils/common.py +101 -0
  117. stepaudio2/cosyvoice2/utils/mask.py +49 -0
  118. stepaudio2/flashcosyvoice/__init__.py +0 -0
  119. stepaudio2/flashcosyvoice/cli.py +424 -0
  120. stepaudio2/flashcosyvoice/config.py +80 -0
  121. stepaudio2/flashcosyvoice/cosyvoice2.py +160 -0
  122. stepaudio2/flashcosyvoice/cosyvoice3.py +1 -0
  123. stepaudio2/flashcosyvoice/engine/__init__.py +0 -0
  124. stepaudio2/flashcosyvoice/engine/block_manager.py +114 -0
  125. stepaudio2/flashcosyvoice/engine/llm_engine.py +125 -0
  126. stepaudio2/flashcosyvoice/engine/model_runner.py +310 -0
  127. stepaudio2/flashcosyvoice/engine/scheduler.py +77 -0
  128. stepaudio2/flashcosyvoice/engine/sequence.py +90 -0
  129. stepaudio2/flashcosyvoice/modules/__init__.py +0 -0
  130. stepaudio2/flashcosyvoice/modules/flow.py +198 -0
  131. stepaudio2/flashcosyvoice/modules/flow_components/__init__.py +0 -0
  132. stepaudio2/flashcosyvoice/modules/flow_components/estimator.py +974 -0
  133. stepaudio2/flashcosyvoice/modules/flow_components/upsample_encoder.py +998 -0
  134. stepaudio2/flashcosyvoice/modules/hifigan.py +249 -0
  135. stepaudio2/flashcosyvoice/modules/hifigan_components/__init__.py +0 -0
  136. stepaudio2/flashcosyvoice/modules/hifigan_components/layers.py +433 -0
  137. stepaudio2/flashcosyvoice/modules/qwen2.py +92 -0
  138. stepaudio2/flashcosyvoice/modules/qwen2_components/__init__.py +0 -0
  139. stepaudio2/flashcosyvoice/modules/qwen2_components/layers.py +616 -0
  140. stepaudio2/flashcosyvoice/modules/sampler.py +231 -0
  141. stepaudio2/flashcosyvoice/utils/__init__.py +0 -0
  142. stepaudio2/flashcosyvoice/utils/audio.py +77 -0
  143. stepaudio2/flashcosyvoice/utils/context.py +28 -0
  144. stepaudio2/flashcosyvoice/utils/loader.py +116 -0
  145. stepaudio2/flashcosyvoice/utils/memory.py +19 -0
  146. stepaudio2/stepaudio2.py +204 -0
  147. stepaudio2/token2wav.py +248 -0
  148. stepaudio2/utils.py +91 -0
cosyvoice/hifigan/generator.py
@@ -0,0 +1,582 @@
+# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Kai Hu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""HIFI-GAN"""
+
+from typing import Dict, Optional, List
+import numpy as np
+from scipy.signal import get_window
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn import Conv1d
+from torch.nn import ConvTranspose1d
+from torch.nn.utils import remove_weight_norm
+try:
+    from torch.nn.utils.parametrizations import weight_norm
+except ImportError:
+    from torch.nn.utils import weight_norm
+from torch.distributions.uniform import Uniform
+
+from cosyvoice.transformer.activation import Snake
+from cosyvoice.utils.common import get_padding
+from cosyvoice.utils.common import init_weights
+
+
+"""hifigan-based generator implementation.
+
+This code is modified from https://github.com/jik876/hifi-gan,
+https://github.com/kan-bayashi/ParallelWaveGAN and
+https://github.com/NVIDIA/BigVGAN
+
+"""
+
+
+class ResBlock(torch.nn.Module):
+    """Residual block module in HiFiGAN/BigVGAN."""
+    def __init__(
+        self,
+        channels: int = 512,
+        kernel_size: int = 3,
+        dilations: List[int] = [1, 3, 5],
+    ):
+        super(ResBlock, self).__init__()
+        self.convs1 = nn.ModuleList()
+        self.convs2 = nn.ModuleList()
+
+        for dilation in dilations:
+            self.convs1.append(
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        1,
+                        dilation=dilation,
+                        padding=get_padding(kernel_size, dilation)
+                    )
+                )
+            )
+            self.convs2.append(
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        1,
+                        dilation=1,
+                        padding=get_padding(kernel_size, 1)
+                    )
+                )
+            )
+        self.convs1.apply(init_weights)
+        self.convs2.apply(init_weights)
+        self.activations1 = nn.ModuleList([
+            Snake(channels, alpha_logscale=False)
+            for _ in range(len(self.convs1))
+        ])
+        self.activations2 = nn.ModuleList([
+            Snake(channels, alpha_logscale=False)
+            for _ in range(len(self.convs2))
+        ])
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        for idx in range(len(self.convs1)):
+            xt = self.activations1[idx](x)
+            xt = self.convs1[idx](xt)
+            xt = self.activations2[idx](xt)
+            xt = self.convs2[idx](xt)
+            x = xt + x
+        return x
+
+    def remove_weight_norm(self):
+        for idx in range(len(self.convs1)):
+            remove_weight_norm(self.convs1[idx])
+            remove_weight_norm(self.convs2[idx])
+
+
+class SineGen(torch.nn.Module):
+    """ Definition of sine generator
+    SineGen(samp_rate, harmonic_num = 0,
+            sine_amp = 0.1, noise_std = 0.003,
+            voiced_threshold = 0,
+            flag_for_pulse=False)
+    samp_rate: sampling rate in Hz
+    harmonic_num: number of harmonic overtones (default 0)
+    sine_amp: amplitude of sine waveform (default 0.1)
+    noise_std: std of Gaussian noise (default 0.003)
+    voiced_threshold: F0 threshold for U/V classification (default 0)
+    flag_for_pulse: this SineGen is used inside PulseGen (default False)
+    Note: when flag_for_pulse is True, the first time step of a voiced
+    segment is always sin(np.pi) or cos(0)
+    """
+
+    def __init__(self, samp_rate, harmonic_num=0,
+                 sine_amp=0.1, noise_std=0.003,
+                 voiced_threshold=0):
+        super(SineGen, self).__init__()
+        self.sine_amp = sine_amp
+        self.noise_std = noise_std
+        self.harmonic_num = harmonic_num
+        self.sampling_rate = samp_rate
+        self.voiced_threshold = voiced_threshold
+
+    def _f02uv(self, f0):
+        # generate uv signal
+        uv = (f0 > self.voiced_threshold).type(torch.float32)
+        return uv
+
+    @torch.no_grad()
+    def forward(self, f0):
+        """
+        :param f0: [B, 1, sample_len], Hz
+        :return: [B, 1, sample_len]
+        """
+
+        F_mat = torch.zeros((f0.size(0), self.harmonic_num + 1, f0.size(-1))).to(f0.device)
+        for i in range(self.harmonic_num + 1):
+            F_mat[:, i: i + 1, :] = f0 * (i + 1) / self.sampling_rate
+
+        theta_mat = 2 * np.pi * (torch.cumsum(F_mat, dim=-1) % 1)
+        u_dist = Uniform(low=-np.pi, high=np.pi)
+        phase_vec = u_dist.sample(sample_shape=(f0.size(0), self.harmonic_num + 1, 1)).to(F_mat.device)
+        phase_vec[:, 0, :] = 0
+
+        # generate sine waveforms
+        sine_waves = self.sine_amp * torch.sin(theta_mat + phase_vec)
+
+        # generate uv signal
+        uv = self._f02uv(f0)
+
+        # noise: for unvoiced should be similar to sine_amp
+        #        std = self.sine_amp/3 -> max value ~ self.sine_amp
+        #        for voiced regions is self.noise_std
+        noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
+        noise = noise_amp * torch.randn_like(sine_waves)
+
+        # first: set the unvoiced part to 0 by uv
+        # then: additive noise
+        sine_waves = sine_waves * uv + noise
+        return sine_waves, uv, noise
+
+
+class SourceModuleHnNSF(torch.nn.Module):
+    """ SourceModule for hn-nsf
+    SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
+                 add_noise_std=0.003, voiced_threshod=0)
+    sampling_rate: sampling rate in Hz
+    harmonic_num: number of harmonics above F0 (default: 0)
+    sine_amp: amplitude of sine source signal (default: 0.1)
+    add_noise_std: std of additive Gaussian noise (default: 0.003)
+        note that amplitude of noise in unvoiced is decided
+        by sine_amp
+    voiced_threshold: threshold to set U/V given F0 (default: 0)
+    Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
+    F0_sampled (batchsize, length, 1)
+    Sine_source (batchsize, length, 1)
+    noise_source (batchsize, length, 1)
+    uv (batchsize, length, 1)
+    """
+
+    def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
+                 add_noise_std=0.003, voiced_threshod=0):
+        super(SourceModuleHnNSF, self).__init__()
+
+        self.sine_amp = sine_amp
+        self.noise_std = add_noise_std
+
+        # to produce sine waveforms
+        self.l_sin_gen = SineGen(sampling_rate, harmonic_num,
+                                 sine_amp, add_noise_std, voiced_threshod)
+
+        # to merge source harmonics into a single excitation
+        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
+        self.l_tanh = torch.nn.Tanh()
+
+    def forward(self, x):
+        """
+        Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
+        F0_sampled (batchsize, length, 1)
+        Sine_source (batchsize, length, 1)
+        noise_source (batchsize, length, 1)
+        """
+        # source for harmonic branch
+        with torch.no_grad():
+            sine_wavs, uv, _ = self.l_sin_gen(x.transpose(1, 2))
+            sine_wavs = sine_wavs.transpose(1, 2)
+            uv = uv.transpose(1, 2)
+        sine_merge = self.l_tanh(self.l_linear(sine_wavs))
+
+        # source for noise branch, in the same shape as uv
+        noise = torch.randn_like(uv) * self.sine_amp / 3
+        return sine_merge, noise, uv
+
+
+class SineGen2(torch.nn.Module):
+    """ Definition of sine generator
+    SineGen(samp_rate, harmonic_num = 0,
+            sine_amp = 0.1, noise_std = 0.003,
+            voiced_threshold = 0,
+            flag_for_pulse=False)
+    samp_rate: sampling rate in Hz
+    harmonic_num: number of harmonic overtones (default 0)
+    sine_amp: amplitude of sine waveform (default 0.1)
+    noise_std: std of Gaussian noise (default 0.003)
+    voiced_threshold: F0 threshold for U/V classification (default 0)
+    flag_for_pulse: this SineGen is used inside PulseGen (default False)
+    Note: when flag_for_pulse is True, the first time step of a voiced
+    segment is always sin(np.pi) or cos(0)
+    """
+
+    def __init__(self, samp_rate, upsample_scale, harmonic_num=0,
+                 sine_amp=0.1, noise_std=0.003,
+                 voiced_threshold=0,
+                 flag_for_pulse=False):
+        super(SineGen2, self).__init__()
+        self.sine_amp = sine_amp
+        self.noise_std = noise_std
+        self.harmonic_num = harmonic_num
+        self.dim = self.harmonic_num + 1
+        self.sampling_rate = samp_rate
+        self.voiced_threshold = voiced_threshold
+        self.flag_for_pulse = flag_for_pulse
+        self.upsample_scale = upsample_scale
+
+    def _f02uv(self, f0):
+        # generate uv signal
+        uv = (f0 > self.voiced_threshold).type(torch.float32)
+        return uv
+
+    def _f02sine(self, f0_values):
+        """ f0_values: (batchsize, length, dim)
+        where dim indicates fundamental tone and overtones
+        """
+        # convert to F0 in rad. The integer part n can be ignored
+        # because 2 * np.pi * n doesn't affect phase
+        rad_values = (f0_values / self.sampling_rate) % 1
+
+        # initial phase noise (no noise for fundamental component)
+        rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2], device=f0_values.device)
+        rand_ini[:, 0] = 0
+        rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
+
+        # instantaneous phase sine[t] = sin(2*pi \sum_i=1 ^{t} rad)
+        if not self.flag_for_pulse:
+            rad_values = torch.nn.functional.interpolate(rad_values.transpose(1, 2),
+                                                         scale_factor=1 / self.upsample_scale,
+                                                         mode="linear").transpose(1, 2)
+
+            phase = torch.cumsum(rad_values, dim=1) * 2 * np.pi
+            phase = torch.nn.functional.interpolate(phase.transpose(1, 2) * self.upsample_scale,
+                                                    scale_factor=self.upsample_scale, mode="linear").transpose(1, 2)
+            sines = torch.sin(phase)
+        else:
+            # If necessary, make sure that the first time step of every
+            # voiced segment is sin(pi) or cos(0)
+            # This is used for pulse-train generation
+
+            # identify the last time step in unvoiced segments
+            uv = self._f02uv(f0_values)
+            uv_1 = torch.roll(uv, shifts=-1, dims=1)
+            uv_1[:, -1, :] = 1
+            u_loc = (uv < 1) * (uv_1 > 0)
+
+            # get the instantaneous phase
+            tmp_cumsum = torch.cumsum(rad_values, dim=1)
+            # different batches need to be processed differently
+            for idx in range(f0_values.shape[0]):
+                temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :]
+                temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :]
+                # stores the accumulation of i.phase within
+                # each voiced segment
+                tmp_cumsum[idx, :, :] = 0
+                tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum
+
+            # rad_values - tmp_cumsum: remove the accumulation of i.phase
+            # within the previous voiced segment.
+            i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1)
+
+            # get the sines
+            sines = torch.cos(i_phase * 2 * np.pi)
+        return sines
+
+    def forward(self, f0):
+        """ sine_tensor, uv = forward(f0)
+        input F0: tensor(batchsize=1, length, dim=1)
+                  f0 for unvoiced steps should be 0
+        output sine_tensor: tensor(batchsize=1, length, dim)
+        output uv: tensor(batchsize=1, length, 1)
+        """
+        # fundamental component
+        fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))
+
+        # generate sine waveforms
+        sine_waves = self._f02sine(fn) * self.sine_amp
+
+        # generate uv signal
+        uv = self._f02uv(f0)
+
+        # noise: for unvoiced should be similar to sine_amp
+        #        std = self.sine_amp/3 -> max value ~ self.sine_amp
+        #        for voiced regions is self.noise_std
+        noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
+        noise = noise_amp * torch.randn_like(sine_waves)
+
+        # first: set the unvoiced part to 0 by uv
+        # then: additive noise
+        sine_waves = sine_waves * uv + noise
+        return sine_waves, uv, noise
+
+
+class SourceModuleHnNSF2(torch.nn.Module):
+    """ SourceModule for hn-nsf
+    SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
+                 add_noise_std=0.003, voiced_threshod=0)
+    sampling_rate: sampling rate in Hz
+    harmonic_num: number of harmonics above F0 (default: 0)
+    sine_amp: amplitude of sine source signal (default: 0.1)
+    add_noise_std: std of additive Gaussian noise (default: 0.003)
+        note that amplitude of noise in unvoiced is decided
+        by sine_amp
+    voiced_threshold: threshold to set U/V given F0 (default: 0)
+    Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
+    F0_sampled (batchsize, length, 1)
+    Sine_source (batchsize, length, 1)
+    noise_source (batchsize, length, 1)
+    uv (batchsize, length, 1)
+    """
+
+    def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
+                 add_noise_std=0.003, voiced_threshod=0):
+        super(SourceModuleHnNSF2, self).__init__()
+
+        self.sine_amp = sine_amp
+        self.noise_std = add_noise_std
+
+        # to produce sine waveforms
+        self.l_sin_gen = SineGen2(sampling_rate, upsample_scale, harmonic_num,
+                                  sine_amp, add_noise_std, voiced_threshod)
+
+        # to merge source harmonics into a single excitation
+        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
+        self.l_tanh = torch.nn.Tanh()
+
+    def forward(self, x):
+        """
+        Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
+        F0_sampled (batchsize, length, 1)
+        Sine_source (batchsize, length, 1)
+        noise_source (batchsize, length, 1)
+        """
+        # source for harmonic branch
+        with torch.no_grad():
+            sine_wavs, uv, _ = self.l_sin_gen(x)
+        sine_merge = self.l_tanh(self.l_linear(sine_wavs))
+
+        # source for noise branch, in the same shape as uv
+        noise = torch.randn_like(uv) * self.sine_amp / 3
+        return sine_merge, noise, uv
+
+
+class HiFTGenerator(nn.Module):
+    """
+    HiFTNet Generator: Neural Source Filter + ISTFTNet
+    https://arxiv.org/abs/2309.09493
+    """
+    def __init__(
+        self,
+        in_channels: int = 80,
+        base_channels: int = 512,
+        nb_harmonics: int = 8,
+        sampling_rate: int = 22050,
+        nsf_alpha: float = 0.1,
+        nsf_sigma: float = 0.003,
+        nsf_voiced_threshold: float = 10,
+        upsample_rates: List[int] = [8, 8],
+        upsample_kernel_sizes: List[int] = [16, 16],
+        istft_params: Dict[str, int] = {"n_fft": 16, "hop_len": 4},
+        resblock_kernel_sizes: List[int] = [3, 7, 11],
+        resblock_dilation_sizes: List[List[int]] = [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+        source_resblock_kernel_sizes: List[int] = [7, 11],
+        source_resblock_dilation_sizes: List[List[int]] = [[1, 3, 5], [1, 3, 5]],
+        lrelu_slope: float = 0.1,
+        audio_limit: float = 0.99,
+        f0_predictor: torch.nn.Module = None,
+    ):
+        super(HiFTGenerator, self).__init__()
+
+        self.out_channels = 1
+        self.nb_harmonics = nb_harmonics
+        self.sampling_rate = sampling_rate
+        self.istft_params = istft_params
+        self.lrelu_slope = lrelu_slope
+        self.audio_limit = audio_limit
+
+        self.num_kernels = len(resblock_kernel_sizes)
+        self.num_upsamples = len(upsample_rates)
+        # NOTE in CosyVoice2, we use the original SourceModuleHnNSF implementation
+        this_SourceModuleHnNSF = SourceModuleHnNSF if self.sampling_rate == 22050 else SourceModuleHnNSF2
+        self.m_source = this_SourceModuleHnNSF(
+            sampling_rate=sampling_rate,
+            upsample_scale=np.prod(upsample_rates) * istft_params["hop_len"],
+            harmonic_num=nb_harmonics,
+            sine_amp=nsf_alpha,
+            add_noise_std=nsf_sigma,
+            voiced_threshod=nsf_voiced_threshold)
+        self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates) * istft_params["hop_len"])
+
+        self.conv_pre = weight_norm(
+            Conv1d(in_channels, base_channels, 7, 1, padding=3)
+        )
+
+        # Up
+        self.ups = nn.ModuleList()
+        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+            self.ups.append(
+                weight_norm(
+                    ConvTranspose1d(
+                        base_channels // (2**i),
+                        base_channels // (2**(i + 1)),
+                        k,
+                        u,
+                        padding=(k - u) // 2,
+                    )
+                )
+            )
+
+        # Down
+        self.source_downs = nn.ModuleList()
+        self.source_resblocks = nn.ModuleList()
+        downsample_rates = [1] + upsample_rates[::-1][:-1]
+        downsample_cum_rates = np.cumprod(downsample_rates)
+        for i, (u, k, d) in enumerate(zip(downsample_cum_rates[::-1], source_resblock_kernel_sizes, source_resblock_dilation_sizes)):
+            if u == 1:
+                self.source_downs.append(
+                    Conv1d(istft_params["n_fft"] + 2, base_channels // (2 ** (i + 1)), 1, 1)
+                )
+            else:
+                self.source_downs.append(
+                    Conv1d(istft_params["n_fft"] + 2, base_channels // (2 ** (i + 1)), u * 2, u, padding=(u // 2))
+                )
+
+            self.source_resblocks.append(
+                ResBlock(base_channels // (2 ** (i + 1)), k, d)
+            )
+
+        self.resblocks = nn.ModuleList()
+        for i in range(len(self.ups)):
+            ch = base_channels // (2**(i + 1))
+            for _, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
+                self.resblocks.append(ResBlock(ch, k, d))
+
+        self.conv_post = weight_norm(Conv1d(ch, istft_params["n_fft"] + 2, 7, 1, padding=3))
+        self.ups.apply(init_weights)
+        self.conv_post.apply(init_weights)
+        self.reflection_pad = nn.ReflectionPad1d((1, 0))
+        self.stft_window = torch.from_numpy(get_window("hann", istft_params["n_fft"], fftbins=True).astype(np.float32))
+        self.f0_predictor = f0_predictor
+
+    def remove_weight_norm(self):
+        print('Removing weight norm...')
+        for l in self.ups:
+            remove_weight_norm(l)
+        for l in self.resblocks:
+            l.remove_weight_norm()
+        remove_weight_norm(self.conv_pre)
+        remove_weight_norm(self.conv_post)
+        self.m_source.remove_weight_norm()
+        for l in self.source_downs:
+            remove_weight_norm(l)
+        for l in self.source_resblocks:
+            l.remove_weight_norm()
+
+    def _stft(self, x):
+        spec = torch.stft(
+            x,
+            self.istft_params["n_fft"], self.istft_params["hop_len"], self.istft_params["n_fft"], window=self.stft_window.to(x.device),
+            return_complex=True)
+        spec = torch.view_as_real(spec)  # [B, F, TT, 2]
+        return spec[..., 0], spec[..., 1]
+
+    def _istft(self, magnitude, phase):
+        magnitude = torch.clip(magnitude, max=1e2)
+        real = magnitude * torch.cos(phase)
+        img = magnitude * torch.sin(phase)
+        inverse_transform = torch.istft(torch.complex(real, img), self.istft_params["n_fft"], self.istft_params["hop_len"],
+                                        self.istft_params["n_fft"], window=self.stft_window.to(magnitude.device))
+        return inverse_transform
+
+    def decode(self, x: torch.Tensor, s: torch.Tensor = torch.zeros(1, 1, 0)) -> torch.Tensor:
+        s_stft_real, s_stft_imag = self._stft(s.squeeze(1))
+        s_stft = torch.cat([s_stft_real, s_stft_imag], dim=1)
+
+        x = self.conv_pre(x)
+        for i in range(self.num_upsamples):
+            x = F.leaky_relu(x, self.lrelu_slope)
+            x = self.ups[i](x)
+
+            if i == self.num_upsamples - 1:
+                x = self.reflection_pad(x)
+
+            # fusion
+            si = self.source_downs[i](s_stft)
+            si = self.source_resblocks[i](si)
+            x = x + si
+
+            xs = None
+            for j in range(self.num_kernels):
+                if xs is None:
+                    xs = self.resblocks[i * self.num_kernels + j](x)
+                else:
+                    xs += self.resblocks[i * self.num_kernels + j](x)
+            x = xs / self.num_kernels
+
+        x = F.leaky_relu(x)
+        x = self.conv_post(x)
+        magnitude = torch.exp(x[:, :self.istft_params["n_fft"] // 2 + 1, :])
+        phase = torch.sin(x[:, self.istft_params["n_fft"] // 2 + 1:, :])  # actually, sin is redundant
+
+        x = self._istft(magnitude, phase)
+        x = torch.clamp(x, -self.audio_limit, self.audio_limit)
+        return x
+
+    def forward(
+            self,
+            batch: dict,
+            device: torch.device,
+    ) -> Dict[str, Optional[torch.Tensor]]:
+        speech_feat = batch['speech_feat'].transpose(1, 2).to(device)
+        # mel -> f0
+        f0 = self.f0_predictor(speech_feat)
+        # f0 -> source
+        s = self.f0_upsamp(f0[:, None]).transpose(1, 2)  # bs, n, t
+        s, _, _ = self.m_source(s)
+        s = s.transpose(1, 2)
+        # mel + source -> speech
+        generated_speech = self.decode(x=speech_feat, s=s)
+        return generated_speech, f0
+
+    @torch.inference_mode()
+    def inference(self, speech_feat: torch.Tensor, cache_source: torch.Tensor = torch.zeros(1, 1, 0)) -> torch.Tensor:
+        # mel -> f0
+        f0 = self.f0_predictor(speech_feat)
+        # f0 -> source
+        s = self.f0_upsamp(f0[:, None]).transpose(1, 2)  # bs, n, t
+        s, _, _ = self.m_source(s)
+        s = s.transpose(1, 2)
+        # use cache_source to avoid glitch at chunk boundaries
+        if cache_source.shape[2] != 0:
+            s[:, :, :cache_source.shape[2]] = cache_source
+        generated_speech = self.decode(x=speech_feat, s=s)
+        return generated_speech, s
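
For orientation, here is how the HiFTGenerator added above is typically driven at inference time: a mel spectrogram goes in, f0_predictor derives F0, the NSF source module turns it into a harmonic-plus-noise excitation, and the ISTFT head renders the waveform. This is a minimal sketch, not code from the package: the ConvRNNF0Predictor import assumes the class of that name in cosyvoice/hifigan/f0_predictor.py shipped in this wheel, and with default hyperparameters and untrained weights the output is of course just noise.

import torch
from cosyvoice.hifigan.f0_predictor import ConvRNNF0Predictor  # assumed from f0_predictor.py
from cosyvoice.hifigan.generator import HiFTGenerator

hift = HiFTGenerator(f0_predictor=ConvRNNF0Predictor())  # default 22.05 kHz config
hift.eval()

mel = torch.randn(1, 80, 120)  # [batch, in_channels=80, frames]
wav, source = hift.inference(speech_feat=mel)
# With the defaults, one mel frame covers prod(upsample_rates) * hop_len = 8*8*4 = 256
# samples, so wav is roughly [1, 120 * 256]. Feeding `source` back in as cache_source
# on the next chunk is what the "avoid glitch" branch in inference() is for.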
cosyvoice/hifigan/hifigan.py
@@ -0,0 +1,67 @@
+from typing import Dict, Optional
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from matcha.hifigan.models import feature_loss, generator_loss, discriminator_loss
+from cosyvoice.utils.losses import tpr_loss, mel_loss
+
+
+class HiFiGan(nn.Module):
+    def __init__(self, generator, discriminator, mel_spec_transform,
+                 multi_mel_spectral_recon_loss_weight=45, feat_match_loss_weight=2.0,
+                 tpr_loss_weight=1.0, tpr_loss_tau=0.04):
+        super(HiFiGan, self).__init__()
+        self.generator = generator
+        self.discriminator = discriminator
+        self.mel_spec_transform = mel_spec_transform
+        self.multi_mel_spectral_recon_loss_weight = multi_mel_spectral_recon_loss_weight
+        self.feat_match_loss_weight = feat_match_loss_weight
+        self.tpr_loss_weight = tpr_loss_weight
+        self.tpr_loss_tau = tpr_loss_tau
+
+    def forward(
+            self,
+            batch: dict,
+            device: torch.device,
+    ) -> Dict[str, Optional[torch.Tensor]]:
+        if batch['turn'] == 'generator':
+            return self.forward_generator(batch, device)
+        else:
+            return self.forward_discriminator(batch, device)
+
+    def forward_generator(self, batch, device):
+        real_speech = batch['speech'].to(device)
+        pitch_feat = batch['pitch_feat'].to(device)
+        # 1. calculate generator outputs
+        generated_speech, generated_f0 = self.generator(batch, device)
+        # 2. calculate discriminator outputs
+        y_d_rs, y_d_gs, fmap_rs, fmap_gs = self.discriminator(real_speech, generated_speech)
+        # 3. calculate generator losses: feature loss, mel loss, tpr losses [Optional]
+        loss_gen, _ = generator_loss(y_d_gs)
+        loss_fm = feature_loss(fmap_rs, fmap_gs)
+        loss_mel = mel_loss(real_speech, generated_speech, self.mel_spec_transform)
+        if self.tpr_loss_weight != 0:
+            loss_tpr = tpr_loss(y_d_gs, y_d_rs, self.tpr_loss_tau)
+        else:
+            loss_tpr = torch.zeros(1).to(device)
+        loss_f0 = F.l1_loss(generated_f0, pitch_feat)
+        loss = loss_gen + self.feat_match_loss_weight * loss_fm + \
+            self.multi_mel_spectral_recon_loss_weight * loss_mel + \
+            self.tpr_loss_weight * loss_tpr + loss_f0
+        return {'loss': loss, 'loss_gen': loss_gen, 'loss_fm': loss_fm, 'loss_mel': loss_mel, 'loss_tpr': loss_tpr, 'loss_f0': loss_f0}
+
+    def forward_discriminator(self, batch, device):
+        real_speech = batch['speech'].to(device)
+        # 1. calculate generator outputs
+        with torch.no_grad():
+            generated_speech, generated_f0 = self.generator(batch, device)
+        # 2. calculate discriminator outputs
+        y_d_rs, y_d_gs, fmap_rs, fmap_gs = self.discriminator(real_speech, generated_speech.detach())
+        # 3. calculate discriminator losses, tpr losses [Optional]
+        loss_disc, _, _ = discriminator_loss(y_d_rs, y_d_gs)
+        if self.tpr_loss_weight != 0:
+            loss_tpr = tpr_loss(y_d_rs, y_d_gs, self.tpr_loss_tau)
+        else:
+            loss_tpr = torch.zeros(1).to(device)
+        loss = loss_disc + self.tpr_loss_weight * loss_tpr
+        return {'loss': loss, 'loss_disc': loss_disc, 'loss_tpr': loss_tpr}
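
The batch['turn'] switch above is what alternates the two adversarial objectives, so a training step calls the wrapper once per role. A hedged sketch of that loop follows; hifigan, optim_g, optim_d, dataloader and device are placeholders supplied by the training setup (in this package the actual wiring lives in the training config and cosyvoice/utils/executor.py), not by this file.

# Sketch of the alternating GAN update implied by batch['turn'];
# all names here (hifigan, optim_g, optim_d, dataloader, device) are placeholders.
for batch in dataloader:
    batch['turn'] = 'discriminator'   # generator runs under torch.no_grad() inside
    info = hifigan(batch, device)
    optim_d.zero_grad()
    info['loss'].backward()           # loss_disc + weighted tpr term
    optim_d.step()

    batch['turn'] = 'generator'       # adversarial + feature-matching + mel + tpr + F0 losses
    info = hifigan(batch, device)
    optim_g.zero_grad()
    info['loss'].backward()
    optim_g.step()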