minicpmo_utils-0.1.0-py3-none-any.whl
This diff shows the content of a publicly available package version as released to a supported registry. It is provided for informational purposes only and reflects the package exactly as it appears in that public registry.
- cosyvoice/__init__.py +17 -0
- cosyvoice/bin/average_model.py +93 -0
- cosyvoice/bin/export_jit.py +103 -0
- cosyvoice/bin/export_onnx.py +120 -0
- cosyvoice/bin/inference_deprecated.py +126 -0
- cosyvoice/bin/train.py +195 -0
- cosyvoice/cli/__init__.py +0 -0
- cosyvoice/cli/cosyvoice.py +209 -0
- cosyvoice/cli/frontend.py +238 -0
- cosyvoice/cli/model.py +386 -0
- cosyvoice/dataset/__init__.py +0 -0
- cosyvoice/dataset/dataset.py +151 -0
- cosyvoice/dataset/processor.py +434 -0
- cosyvoice/flow/decoder.py +494 -0
- cosyvoice/flow/flow.py +281 -0
- cosyvoice/flow/flow_matching.py +227 -0
- cosyvoice/flow/length_regulator.py +70 -0
- cosyvoice/hifigan/discriminator.py +230 -0
- cosyvoice/hifigan/f0_predictor.py +58 -0
- cosyvoice/hifigan/generator.py +582 -0
- cosyvoice/hifigan/hifigan.py +67 -0
- cosyvoice/llm/llm.py +610 -0
- cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +58836 -0
- cosyvoice/tokenizer/tokenizer.py +279 -0
- cosyvoice/transformer/__init__.py +0 -0
- cosyvoice/transformer/activation.py +84 -0
- cosyvoice/transformer/attention.py +330 -0
- cosyvoice/transformer/convolution.py +145 -0
- cosyvoice/transformer/decoder.py +396 -0
- cosyvoice/transformer/decoder_layer.py +132 -0
- cosyvoice/transformer/embedding.py +302 -0
- cosyvoice/transformer/encoder.py +474 -0
- cosyvoice/transformer/encoder_layer.py +236 -0
- cosyvoice/transformer/label_smoothing_loss.py +96 -0
- cosyvoice/transformer/positionwise_feed_forward.py +115 -0
- cosyvoice/transformer/subsampling.py +383 -0
- cosyvoice/transformer/upsample_encoder.py +320 -0
- cosyvoice/utils/__init__.py +0 -0
- cosyvoice/utils/class_utils.py +83 -0
- cosyvoice/utils/common.py +186 -0
- cosyvoice/utils/executor.py +176 -0
- cosyvoice/utils/file_utils.py +129 -0
- cosyvoice/utils/frontend_utils.py +136 -0
- cosyvoice/utils/losses.py +57 -0
- cosyvoice/utils/mask.py +265 -0
- cosyvoice/utils/scheduler.py +738 -0
- cosyvoice/utils/train_utils.py +367 -0
- cosyvoice/vllm/cosyvoice2.py +103 -0
- matcha/__init__.py +0 -0
- matcha/app.py +357 -0
- matcha/cli.py +418 -0
- matcha/hifigan/__init__.py +0 -0
- matcha/hifigan/config.py +28 -0
- matcha/hifigan/denoiser.py +64 -0
- matcha/hifigan/env.py +17 -0
- matcha/hifigan/meldataset.py +217 -0
- matcha/hifigan/models.py +368 -0
- matcha/hifigan/xutils.py +60 -0
- matcha/models/__init__.py +0 -0
- matcha/models/baselightningmodule.py +209 -0
- matcha/models/components/__init__.py +0 -0
- matcha/models/components/decoder.py +443 -0
- matcha/models/components/flow_matching.py +132 -0
- matcha/models/components/text_encoder.py +410 -0
- matcha/models/components/transformer.py +316 -0
- matcha/models/matcha_tts.py +239 -0
- matcha/onnx/__init__.py +0 -0
- matcha/onnx/export.py +181 -0
- matcha/onnx/infer.py +168 -0
- matcha/text/__init__.py +53 -0
- matcha/text/cleaners.py +116 -0
- matcha/text/numbers.py +71 -0
- matcha/text/symbols.py +17 -0
- matcha/train.py +122 -0
- matcha/utils/__init__.py +5 -0
- matcha/utils/audio.py +82 -0
- matcha/utils/generate_data_statistics.py +111 -0
- matcha/utils/instantiators.py +56 -0
- matcha/utils/logging_utils.py +53 -0
- matcha/utils/model.py +90 -0
- matcha/utils/monotonic_align/__init__.py +22 -0
- matcha/utils/monotonic_align/setup.py +7 -0
- matcha/utils/pylogger.py +21 -0
- matcha/utils/rich_utils.py +101 -0
- matcha/utils/utils.py +219 -0
- minicpmo/__init__.py +24 -0
- minicpmo/utils.py +636 -0
- minicpmo/version.py +2 -0
- minicpmo_utils-0.1.0.dist-info/METADATA +72 -0
- minicpmo_utils-0.1.0.dist-info/RECORD +148 -0
- minicpmo_utils-0.1.0.dist-info/WHEEL +5 -0
- minicpmo_utils-0.1.0.dist-info/top_level.txt +5 -0
- s3tokenizer/__init__.py +153 -0
- s3tokenizer/assets/BAC009S0764W0121.wav +0 -0
- s3tokenizer/assets/BAC009S0764W0122.wav +0 -0
- s3tokenizer/assets/mel_filters.npz +0 -0
- s3tokenizer/cli.py +183 -0
- s3tokenizer/model.py +546 -0
- s3tokenizer/model_v2.py +605 -0
- s3tokenizer/utils.py +390 -0
- stepaudio2/__init__.py +40 -0
- stepaudio2/cosyvoice2/__init__.py +1 -0
- stepaudio2/cosyvoice2/flow/__init__.py +0 -0
- stepaudio2/cosyvoice2/flow/decoder_dit.py +585 -0
- stepaudio2/cosyvoice2/flow/flow.py +230 -0
- stepaudio2/cosyvoice2/flow/flow_matching.py +205 -0
- stepaudio2/cosyvoice2/transformer/__init__.py +0 -0
- stepaudio2/cosyvoice2/transformer/attention.py +328 -0
- stepaudio2/cosyvoice2/transformer/embedding.py +119 -0
- stepaudio2/cosyvoice2/transformer/encoder_layer.py +163 -0
- stepaudio2/cosyvoice2/transformer/positionwise_feed_forward.py +56 -0
- stepaudio2/cosyvoice2/transformer/subsampling.py +79 -0
- stepaudio2/cosyvoice2/transformer/upsample_encoder_v2.py +483 -0
- stepaudio2/cosyvoice2/utils/__init__.py +1 -0
- stepaudio2/cosyvoice2/utils/class_utils.py +41 -0
- stepaudio2/cosyvoice2/utils/common.py +101 -0
- stepaudio2/cosyvoice2/utils/mask.py +49 -0
- stepaudio2/flashcosyvoice/__init__.py +0 -0
- stepaudio2/flashcosyvoice/cli.py +424 -0
- stepaudio2/flashcosyvoice/config.py +80 -0
- stepaudio2/flashcosyvoice/cosyvoice2.py +160 -0
- stepaudio2/flashcosyvoice/cosyvoice3.py +1 -0
- stepaudio2/flashcosyvoice/engine/__init__.py +0 -0
- stepaudio2/flashcosyvoice/engine/block_manager.py +114 -0
- stepaudio2/flashcosyvoice/engine/llm_engine.py +125 -0
- stepaudio2/flashcosyvoice/engine/model_runner.py +310 -0
- stepaudio2/flashcosyvoice/engine/scheduler.py +77 -0
- stepaudio2/flashcosyvoice/engine/sequence.py +90 -0
- stepaudio2/flashcosyvoice/modules/__init__.py +0 -0
- stepaudio2/flashcosyvoice/modules/flow.py +198 -0
- stepaudio2/flashcosyvoice/modules/flow_components/__init__.py +0 -0
- stepaudio2/flashcosyvoice/modules/flow_components/estimator.py +974 -0
- stepaudio2/flashcosyvoice/modules/flow_components/upsample_encoder.py +998 -0
- stepaudio2/flashcosyvoice/modules/hifigan.py +249 -0
- stepaudio2/flashcosyvoice/modules/hifigan_components/__init__.py +0 -0
- stepaudio2/flashcosyvoice/modules/hifigan_components/layers.py +433 -0
- stepaudio2/flashcosyvoice/modules/qwen2.py +92 -0
- stepaudio2/flashcosyvoice/modules/qwen2_components/__init__.py +0 -0
- stepaudio2/flashcosyvoice/modules/qwen2_components/layers.py +616 -0
- stepaudio2/flashcosyvoice/modules/sampler.py +231 -0
- stepaudio2/flashcosyvoice/utils/__init__.py +0 -0
- stepaudio2/flashcosyvoice/utils/audio.py +77 -0
- stepaudio2/flashcosyvoice/utils/context.py +28 -0
- stepaudio2/flashcosyvoice/utils/loader.py +116 -0
- stepaudio2/flashcosyvoice/utils/memory.py +19 -0
- stepaudio2/stepaudio2.py +204 -0
- stepaudio2/token2wav.py +248 -0
- stepaudio2/utils.py +91 -0
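The wheel ships five top-level packages (cosyvoice, matcha, minicpmo, s3tokenizer, stepaudio2), consistent with the five-line top_level.txt above. A minimal sketch of verifying an install; the pip command and the `__version__` attribute are conventions assumed for illustration, not confirmed by this diff:

```python
# A minimal sketch, assuming the wheel's runtime dependencies (torch,
# scipy, etc.) are present. The install command follows pip convention:
#   pip install minicpmo-utils==0.1.0
# The five imports mirror the five top-level packages listed above.
import cosyvoice
import matcha
import minicpmo
import s3tokenizer
import stepaudio2

# minicpmo/version.py is only two lines; exposing __version__ is an
# assumed convention, not something this diff confirms.
print(getattr(minicpmo, "__version__", "version attribute not confirmed"))
```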
stepaudio2/flashcosyvoice/modules/hifigan.py

@@ -0,0 +1,249 @@

# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Kai Hu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""HIFI-GAN"""

from typing import Dict, List

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from scipy.signal import get_window
from torch.nn import Conv1d, ConvTranspose1d
from torch.nn.utils import remove_weight_norm

try:
    from torch.nn.utils.parametrizations import weight_norm
except ImportError:
    from torch.nn.utils import weight_norm  # noqa

from stepaudio2.flashcosyvoice.modules.hifigan_components.layers import (
    ResBlock, SourceModuleHnNSF, SourceModuleHnNSF2, init_weights)


class ConvRNNF0Predictor(nn.Module):
    def __init__(self,
                 num_class: int = 1,
                 in_channels: int = 80,
                 cond_channels: int = 512
                 ):
        super().__init__()

        self.num_class = num_class
        self.condnet = nn.Sequential(
            weight_norm(  # noqa
                nn.Conv1d(in_channels, cond_channels, kernel_size=3, padding=1)
            ),
            nn.ELU(),
            weight_norm(  # noqa
                nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
            ),
            nn.ELU(),
            weight_norm(  # noqa
                nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
            ),
            nn.ELU(),
            weight_norm(  # noqa
                nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
            ),
            nn.ELU(),
            weight_norm(  # noqa
                nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
            ),
            nn.ELU(),
        )
        self.classifier = nn.Linear(in_features=cond_channels, out_features=self.num_class)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.condnet(x)
        x = x.transpose(1, 2)
        return torch.abs(self.classifier(x).squeeze(-1))


class HiFTGenerator(nn.Module):
    """
    HiFTNet Generator: Neural Source Filter + ISTFTNet
    https://arxiv.org/abs/2309.09493
    """
    def __init__(
            self,
            in_channels: int = 80,
            base_channels: int = 512,
            nb_harmonics: int = 8,
            sampling_rate: int = 24000,
            nsf_alpha: float = 0.1,
            nsf_sigma: float = 0.003,
            nsf_voiced_threshold: float = 10,
            upsample_rates: List[int] = [8, 5, 3],  # noqa
            upsample_kernel_sizes: List[int] = [16, 11, 7],  # noqa
            istft_params: Dict[str, int] = {"n_fft": 16, "hop_len": 4},  # noqa
            resblock_kernel_sizes: List[int] = [3, 7, 11],  # noqa
            resblock_dilation_sizes: List[List[int]] = [[1, 3, 5], [1, 3, 5], [1, 3, 5]],  # noqa
            source_resblock_kernel_sizes: List[int] = [7, 7, 11],  # noqa
            source_resblock_dilation_sizes: List[List[int]] = [[1, 3, 5], [1, 3, 5], [1, 3, 5]],  # noqa
            lrelu_slope: float = 0.1,
            audio_limit: float = 0.99,
            f0_predictor: torch.nn.Module = None,
    ):
        super(HiFTGenerator, self).__init__()

        self.out_channels = 1
        self.nb_harmonics = nb_harmonics
        self.sampling_rate = sampling_rate
        self.istft_params = istft_params
        self.lrelu_slope = lrelu_slope
        self.audio_limit = audio_limit

        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        # NOTE in CosyVoice2, we use the original SourceModuleHnNSF implementation
        this_SourceModuleHnNSF = SourceModuleHnNSF if self.sampling_rate == 22050 else SourceModuleHnNSF2
        self.m_source = this_SourceModuleHnNSF(
            sampling_rate=sampling_rate,
            upsample_scale=np.prod(upsample_rates) * istft_params["hop_len"],
            harmonic_num=nb_harmonics,
            sine_amp=nsf_alpha,
            add_noise_std=nsf_sigma,
            voiced_threshod=nsf_voiced_threshold)
        self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates) * istft_params["hop_len"])

        self.conv_pre = weight_norm(  # noqa
            Conv1d(in_channels, base_channels, 7, 1, padding=3)
        )

        # Up
        self.ups = nn.ModuleList()
        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            self.ups.append(
                weight_norm(  # noqa
                    ConvTranspose1d(
                        base_channels // (2**i),
                        base_channels // (2**(i + 1)),
                        k,
                        u,
                        padding=(k - u) // 2,
                    )
                )
            )

        # Down
        self.source_downs = nn.ModuleList()
        self.source_resblocks = nn.ModuleList()
        downsample_rates = [1] + upsample_rates[::-1][:-1]
        downsample_cum_rates = np.cumprod(downsample_rates)
        for i, (u, k, d) in enumerate(zip(downsample_cum_rates[::-1], source_resblock_kernel_sizes, source_resblock_dilation_sizes)):
            if u == 1:
                self.source_downs.append(
                    Conv1d(istft_params["n_fft"] + 2, base_channels // (2 ** (i + 1)), 1, 1)
                )
            else:
                self.source_downs.append(
                    Conv1d(istft_params["n_fft"] + 2, base_channels // (2 ** (i + 1)), u * 2, u, padding=(u // 2))
                )

            self.source_resblocks.append(
                ResBlock(base_channels // (2 ** (i + 1)), k, d)
            )

        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = base_channels // (2**(i + 1))
            for _, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
                self.resblocks.append(ResBlock(ch, k, d))

        self.conv_post = weight_norm(Conv1d(ch, istft_params["n_fft"] + 2, 7, 1, padding=3))  # noqa
        self.ups.apply(init_weights)
        self.conv_post.apply(init_weights)
        self.reflection_pad = nn.ReflectionPad1d((1, 0))
        self.stft_window = torch.from_numpy(get_window("hann", istft_params["n_fft"], fftbins=True).astype(np.float32))
        self.f0_predictor = ConvRNNF0Predictor() if f0_predictor is None else f0_predictor

    def remove_weight_norm(self):
        print('Removing weight norm...')
        for up in self.ups:
            remove_weight_norm(up)
        for resblock in self.resblocks:
            resblock.remove_weight_norm()
        remove_weight_norm(self.conv_pre)
        remove_weight_norm(self.conv_post)
        self.m_source.remove_weight_norm()
        for source_down in self.source_downs:
            remove_weight_norm(source_down)
        for source_resblock in self.source_resblocks:
            source_resblock.remove_weight_norm()

    def _stft(self, x):
        spec = torch.stft(
            x,
            self.istft_params["n_fft"], self.istft_params["hop_len"], self.istft_params["n_fft"], window=self.stft_window.to(x.device),
            return_complex=True)
        spec = torch.view_as_real(spec)  # [B, F, TT, 2]
        return spec[..., 0], spec[..., 1]

    def _istft(self, magnitude, phase):
        magnitude = torch.clip(magnitude, max=1e2)
        real = magnitude * torch.cos(phase)
        img = magnitude * torch.sin(phase)
        inverse_transform = torch.istft(torch.complex(real, img), self.istft_params["n_fft"], self.istft_params["hop_len"],
                                        self.istft_params["n_fft"], window=self.stft_window.to(magnitude.device))
        return inverse_transform

    def decode(self, x: torch.Tensor, s: torch.Tensor = torch.zeros(1, 1, 0)) -> torch.Tensor:
        s_stft_real, s_stft_imag = self._stft(s.squeeze(1))
        s_stft = torch.cat([s_stft_real, s_stft_imag], dim=1)

        x = self.conv_pre(x)
        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, self.lrelu_slope)
            x = self.ups[i](x)

            if i == self.num_upsamples - 1:
                x = self.reflection_pad(x)

            # fusion
            si = self.source_downs[i](s_stft)
            si = self.source_resblocks[i](si)
            x = x + si

            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i * self.num_kernels + j](x)
                else:
                    xs += self.resblocks[i * self.num_kernels + j](x)
            x = xs / self.num_kernels

        x = F.leaky_relu(x)
        x = self.conv_post(x)
        magnitude = torch.exp(x[:, :self.istft_params["n_fft"] // 2 + 1, :])
        phase = torch.sin(x[:, self.istft_params["n_fft"] // 2 + 1:, :])  # actually, sin is redundancy

        x = self._istft(magnitude, phase)
        x = torch.clamp(x, -self.audio_limit, self.audio_limit)
        return x

    @torch.inference_mode()
    def forward(self, speech_feat: torch.Tensor, cache_source: torch.Tensor = torch.zeros(1, 1, 0)) -> torch.Tensor:
        # mel->f0
        f0 = self.f0_predictor(speech_feat)
        # f0->source
        s = self.f0_upsamp(f0[:, None]).transpose(1, 2)  # bs,n,t
        s, _, _ = self.m_source(s)
        s = s.transpose(1, 2)
        # use cache_source to avoid glitch
        if cache_source.shape[2] != 0:
            s[:, :, :cache_source.shape[2]] = cache_source
        generated_speech = self.decode(x=speech_feat, s=s)
        return generated_speech, s
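A short smoke-test sketch for the HiFTGenerator above. It is not part of the package; the batch size and mel length are illustrative, and the expected output length follows from the defaults (upsample_rates [8, 5, 3] times hop_len 4, i.e. 480 samples per mel frame):

```python
# A minimal smoke test, assuming torch and scipy are installed.
import torch

from stepaudio2.flashcosyvoice.modules.hifigan import HiFTGenerator

hift = HiFTGenerator()  # 24 kHz defaults, so SourceModuleHnNSF2 is selected

mel = torch.randn(1, 80, 100)  # [B, in_channels, T], T = 100 mel frames
wav, source = hift(mel)        # forward() runs under torch.inference_mode()

# One mel frame maps to np.prod([8, 5, 3]) * 4 = 480 samples,
# so 100 frames should yield a [1, 48000] waveform.
print(wav.shape, source.shape)

# Streaming-style reuse: feed the returned harmonic source back in as
# cache_source to avoid a glitch at chunk boundaries (see forward()).
wav2, _ = hift(mel, cache_source=source)
```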
stepaudio2/flashcosyvoice/modules/hifigan_components/layers.py

@@ -0,0 +1,433 @@

from typing import List

import numpy as np
import torch
import torch.nn as nn
from torch.distributions.uniform import Uniform
from torch.nn import Conv1d
from torch.nn.utils import remove_weight_norm

try:
    from torch.nn.utils.parametrizations import weight_norm
except ImportError:
    from torch.nn.utils import weight_norm  # noqa


def get_padding(kernel_size, dilation=1):
    return int((kernel_size * dilation - dilation) / 2)


def init_weights(m, mean=0.0, std=0.01):
    classname = m.__class__.__name__
    if classname.find("Conv") != -1:
        m.weight.data.normal_(mean, std)


"""hifigan based generator implementation.

This code is modified from https://github.com/jik876/hifi-gan
 ,https://github.com/kan-bayashi/ParallelWaveGAN and
 https://github.com/NVIDIA/BigVGAN

"""


# Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license.
#   LICENSE is in incl_licenses directory.
class Snake(nn.Module):
    '''
    Implementation of a sine-based periodic activation function
    Shape:
        - Input: (B, C, T)
        - Output: (B, C, T), same shape as the input
    Parameters:
        - alpha - trainable parameter
    References:
        - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
        https://arxiv.org/abs/2006.08195
    Examples:
        >>> a1 = snake(256)
        >>> x = torch.randn(256)
        >>> x = a1(x)

    Args:
        in_features: shape of the input
        alpha: trainable parameter
        alpha_trainable: whether alpha is trainable
        alpha_logscale: whether to use log scale for alpha
        alpha is initialized to 1 by default, higher values = higher-frequency.
        alpha will be trained along with the rest of your model.
    '''
    def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
        super(Snake, self).__init__()
        self.in_features = in_features

        # initialize alpha
        self.alpha_logscale = alpha_logscale
        if self.alpha_logscale:  # log scale alphas initialized to zeros
            self.alpha = nn.Parameter(torch.zeros(in_features) * alpha)
        else:  # linear scale alphas initialized to ones
            self.alpha = nn.Parameter(torch.ones(in_features) * alpha)

        self.alpha.requires_grad = alpha_trainable

        self.no_div_by_zero = 0.000000001

    def forward(self, x):
        '''
        Forward pass of the function.
        Applies the function to the input elementwise.
        Snake := x + 1/a * sin^2 (xa)
        '''
        alpha = self.alpha.unsqueeze(0).unsqueeze(-1)  # line up with x to [B, C, T]
        if self.alpha_logscale:
            alpha = torch.exp(alpha)
        x = x + (1.0 / (alpha + self.no_div_by_zero)) * torch.pow(torch.sin(x * alpha), 2)

        return x


class ResBlock(torch.nn.Module):
    """Residual block module in HiFiGAN/BigVGAN."""
    def __init__(
        self,
        channels: int = 512,
        kernel_size: int = 3,
        dilations: List[int] = [1, 3, 5],  # noqa
    ):
        super(ResBlock, self).__init__()
        self.convs1 = nn.ModuleList()
        self.convs2 = nn.ModuleList()

        for dilation in dilations:
            self.convs1.append(
                weight_norm(  # noqa
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=dilation,
                        padding=get_padding(kernel_size, dilation)
                    )
                )
            )
            self.convs2.append(
                weight_norm(  # noqa
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=1,
                        padding=get_padding(kernel_size, 1)
                    )
                )
            )
        self.convs1.apply(init_weights)
        self.convs2.apply(init_weights)
        self.activations1 = nn.ModuleList([
            Snake(channels, alpha_logscale=False)
            for _ in range(len(self.convs1))
        ])
        self.activations2 = nn.ModuleList([
            Snake(channels, alpha_logscale=False)
            for _ in range(len(self.convs2))
        ])

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        for idx in range(len(self.convs1)):
            xt = self.activations1[idx](x)
            xt = self.convs1[idx](xt)
            xt = self.activations2[idx](xt)
            xt = self.convs2[idx](xt)
            x = xt + x
        return x

    def remove_weight_norm(self):
        for idx in range(len(self.convs1)):
            remove_weight_norm(self.convs1[idx])
            remove_weight_norm(self.convs2[idx])


class SineGen(torch.nn.Module):
    """ Definition of sine generator
    SineGen(samp_rate, harmonic_num = 0,
            sine_amp = 0.1, noise_std = 0.003,
            voiced_threshold = 0,
            flag_for_pulse=False)
    samp_rate: sampling rate in Hz
    harmonic_num: number of harmonic overtones (default 0)
    sine_amp: amplitude of sine-wavefrom (default 0.1)
    noise_std: std of Gaussian noise (default 0.003)
    voiced_thoreshold: F0 threshold for U/V classification (default 0)
    flag_for_pulse: this SinGen is used inside PulseGen (default False)
    Note: when flag_for_pulse is True, the first time step of a voiced
        segment is always sin(np.pi) or cos(0)
    """

    def __init__(self, samp_rate, harmonic_num=0,
                 sine_amp=0.1, noise_std=0.003,
                 voiced_threshold=0):
        super(SineGen, self).__init__()
        self.sine_amp = sine_amp
        self.noise_std = noise_std
        self.harmonic_num = harmonic_num
        self.sampling_rate = samp_rate
        self.voiced_threshold = voiced_threshold

    def _f02uv(self, f0):
        # generate uv signal
        uv = (f0 > self.voiced_threshold).type(torch.float32)
        return uv

    @torch.no_grad()
    def forward(self, f0):
        """
        :param f0: [B, 1, sample_len], Hz
        :return: [B, 1, sample_len]
        """

        F_mat = torch.zeros((f0.size(0), self.harmonic_num + 1, f0.size(-1))).to(f0.device)
        for i in range(self.harmonic_num + 1):
            F_mat[:, i: i + 1, :] = f0 * (i + 1) / self.sampling_rate

        theta_mat = 2 * np.pi * (torch.cumsum(F_mat, dim=-1) % 1)
        u_dist = Uniform(low=-np.pi, high=np.pi)
        phase_vec = u_dist.sample(sample_shape=(f0.size(0), self.harmonic_num + 1, 1)).to(F_mat.device)
        phase_vec[:, 0, :] = 0

        # generate sine waveforms
        sine_waves = self.sine_amp * torch.sin(theta_mat + phase_vec)

        # generate uv signal
        uv = self._f02uv(f0)

        # noise: for unvoiced should be similar to sine_amp
        #        std = self.sine_amp/3 -> max value ~ self.sine_amp
        #        for voiced regions is self.noise_std
        noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
        noise = noise_amp * torch.randn_like(sine_waves)

        # first: set the unvoiced part to 0 by uv
        # then: additive noise
        sine_waves = sine_waves * uv + noise
        return sine_waves, uv, noise


class SourceModuleHnNSF(torch.nn.Module):
    """ SourceModule for hn-nsf
    SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
                 add_noise_std=0.003, voiced_threshod=0)
    sampling_rate: sampling_rate in Hz
    harmonic_num: number of harmonic above F0 (default: 0)
    sine_amp: amplitude of sine source signal (default: 0.1)
    add_noise_std: std of additive Gaussian noise (default: 0.003)
        note that amplitude of noise in unvoiced is decided
        by sine_amp
    voiced_threshold: threhold to set U/V given F0 (default: 0)
    Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
    F0_sampled (batchsize, length, 1)
    Sine_source (batchsize, length, 1)
    noise_source (batchsize, length 1)
    uv (batchsize, length, 1)
    """

    def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
                 add_noise_std=0.003, voiced_threshod=0):
        super(SourceModuleHnNSF, self).__init__()

        self.sine_amp = sine_amp
        self.noise_std = add_noise_std

        # to produce sine waveforms
        self.l_sin_gen = SineGen(sampling_rate, harmonic_num,
                                 sine_amp, add_noise_std, voiced_threshod)

        # to merge source harmonics into a single excitation
        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
        self.l_tanh = torch.nn.Tanh()

    def forward(self, x):
        """
        Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
        F0_sampled (batchsize, length, 1)
        Sine_source (batchsize, length, 1)
        noise_source (batchsize, length 1)
        """
        # source for harmonic branch
        with torch.no_grad():
            sine_wavs, uv, _ = self.l_sin_gen(x.transpose(1, 2))
            sine_wavs = sine_wavs.transpose(1, 2)
            uv = uv.transpose(1, 2)
        sine_merge = self.l_tanh(self.l_linear(sine_wavs))

        # source for noise branch, in the same shape as uv
        noise = torch.randn_like(uv) * self.sine_amp / 3
        return sine_merge, noise, uv


class SineGen2(torch.nn.Module):
    """ Definition of sine generator
    SineGen(samp_rate, harmonic_num = 0,
            sine_amp = 0.1, noise_std = 0.003,
            voiced_threshold = 0,
            flag_for_pulse=False)
    samp_rate: sampling rate in Hz
    harmonic_num: number of harmonic overtones (default 0)
    sine_amp: amplitude of sine-wavefrom (default 0.1)
    noise_std: std of Gaussian noise (default 0.003)
    voiced_thoreshold: F0 threshold for U/V classification (default 0)
    flag_for_pulse: this SinGen is used inside PulseGen (default False)
    Note: when flag_for_pulse is True, the first time step of a voiced
        segment is always sin(np.pi) or cos(0)
    """

    def __init__(self, samp_rate, upsample_scale, harmonic_num=0,
                 sine_amp=0.1, noise_std=0.003,
                 voiced_threshold=0,
                 flag_for_pulse=False):
        super(SineGen2, self).__init__()
        self.sine_amp = sine_amp
        self.noise_std = noise_std
        self.harmonic_num = harmonic_num
        self.dim = self.harmonic_num + 1
        self.sampling_rate = samp_rate
        self.voiced_threshold = voiced_threshold
        self.flag_for_pulse = flag_for_pulse
        self.upsample_scale = upsample_scale

    def _f02uv(self, f0):
        # generate uv signal
        uv = (f0 > self.voiced_threshold).type(torch.float32)
        return uv

    def _f02sine(self, f0_values):
        """ f0_values: (batchsize, length, dim)
            where dim indicates fundamental tone and overtones
        """
        # convert to F0 in rad. The interger part n can be ignored
        # because 2 * np.pi * n doesn't affect phase
        rad_values = (f0_values / self.sampling_rate) % 1

        # initial phase noise (no noise for fundamental component)
        rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2], device=f0_values.device)
        rand_ini[:, 0] = 0
        rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini

        # instantanouse phase sine[t] = sin(2*pi \sum_i=1 ^{t} rad)
        if not self.flag_for_pulse:
            rad_values = torch.nn.functional.interpolate(rad_values.transpose(1, 2),
                                                          scale_factor=1 / self.upsample_scale,
                                                          mode="linear").transpose(1, 2)

            phase = torch.cumsum(rad_values, dim=1) * 2 * np.pi
            phase = torch.nn.functional.interpolate(phase.transpose(1, 2) * self.upsample_scale,
                                                    scale_factor=self.upsample_scale, mode="linear").transpose(1, 2)
            sines = torch.sin(phase)
        else:
            # If necessary, make sure that the first time step of every
            # voiced segments is sin(pi) or cos(0)
            # This is used for pulse-train generation

            # identify the last time step in unvoiced segments
            uv = self._f02uv(f0_values)
            uv_1 = torch.roll(uv, shifts=-1, dims=1)
            uv_1[:, -1, :] = 1
            u_loc = (uv < 1) * (uv_1 > 0)

            # get the instantanouse phase
            tmp_cumsum = torch.cumsum(rad_values, dim=1)
            # different batch needs to be processed differently
            for idx in range(f0_values.shape[0]):
                temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :]
                temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :]
                # stores the accumulation of i.phase within
                # each voiced segments
                tmp_cumsum[idx, :, :] = 0
                tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum

            # rad_values - tmp_cumsum: remove the accumulation of i.phase
            # within the previous voiced segment.
            i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1)

            # get the sines
            sines = torch.cos(i_phase * 2 * np.pi)
        return sines

    def forward(self, f0):
        """ sine_tensor, uv = forward(f0)
        input F0: tensor(batchsize=1, length, dim=1)
                  f0 for unvoiced steps should be 0
        output sine_tensor: tensor(batchsize=1, length, dim)
        output uv: tensor(batchsize=1, length, 1)
        """
        # fundamental component
        fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))

        # generate sine waveforms
        sine_waves = self._f02sine(fn) * self.sine_amp

        # generate uv signal
        uv = self._f02uv(f0)

        # noise: for unvoiced should be similar to sine_amp
        #        std = self.sine_amp/3 -> max value ~ self.sine_amp
        #        for voiced regions is self.noise_std
        noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
        noise = noise_amp * torch.randn_like(sine_waves)

        # first: set the unvoiced part to 0 by uv
        # then: additive noise
        sine_waves = sine_waves * uv + noise
        return sine_waves, uv, noise


class SourceModuleHnNSF2(torch.nn.Module):
    """ SourceModule for hn-nsf
    SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
                 add_noise_std=0.003, voiced_threshod=0)
    sampling_rate: sampling_rate in Hz
    harmonic_num: number of harmonic above F0 (default: 0)
    sine_amp: amplitude of sine source signal (default: 0.1)
    add_noise_std: std of additive Gaussian noise (default: 0.003)
        note that amplitude of noise in unvoiced is decided
        by sine_amp
    voiced_threshold: threhold to set U/V given F0 (default: 0)
    Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
    F0_sampled (batchsize, length, 1)
    Sine_source (batchsize, length, 1)
    noise_source (batchsize, length 1)
    uv (batchsize, length, 1)
    """

    def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
                 add_noise_std=0.003, voiced_threshod=0):
        super(SourceModuleHnNSF2, self).__init__()

        self.sine_amp = sine_amp
        self.noise_std = add_noise_std

        # to produce sine waveforms
        self.l_sin_gen = SineGen2(sampling_rate, upsample_scale, harmonic_num,
                                  sine_amp, add_noise_std, voiced_threshod)

        # to merge source harmonics into a single excitation
        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
        self.l_tanh = torch.nn.Tanh()

    def forward(self, x):
        """
        Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
        F0_sampled (batchsize, length, 1)
        Sine_source (batchsize, length, 1)
        noise_source (batchsize, length 1)
        """
        # source for harmonic branch
        with torch.no_grad():
            sine_wavs, uv, _ = self.l_sin_gen(x)
        sine_merge = self.l_tanh(self.l_linear(sine_wavs))

        # source for noise branch, in the same shape as uv
        noise = torch.randn_like(uv) * self.sine_amp / 3
        return sine_merge, noise, uv
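A companion sketch, also not part of the package, exercising the building blocks above in isolation; all shapes are illustrative, and upsample_scale=480 mirrors the value HiFTGenerator passes in (np.prod([8, 5, 3]) * istft_params["hop_len"]):

```python
# A minimal sketch, assuming torch is installed.
import torch

from stepaudio2.flashcosyvoice.modules.hifigan_components.layers import (
    ResBlock, Snake, SourceModuleHnNSF2)

x = torch.randn(2, 64, 50)                  # [B, C, T]

act = Snake(in_features=64)                 # x + (1/alpha) * sin^2(alpha * x)
assert act(x).shape == x.shape              # shape-preserving activation

block = ResBlock(channels=64, kernel_size=3, dilations=[1, 3, 5])
assert block(x).shape == x.shape            # residual, shape-preserving

# SourceModuleHnNSF2 expects an F0 track already upsampled to sample rate;
# the length must be divisible by upsample_scale (4800 / 480 = 10 frames).
src = SourceModuleHnNSF2(sampling_rate=24000, upsample_scale=480,
                         harmonic_num=8)
f0 = torch.zeros(1, 4800, 1)                # [B, samples, 1]; 0 Hz = unvoiced
f0[:, 2400:, :] = 220.0                     # voiced second half at 220 Hz
sine_merge, noise, uv = src(f0)
print(sine_merge.shape, noise.shape, uv.shape)  # each [1, 4800, 1]
```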