minicpmo_utils-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cosyvoice/__init__.py +17 -0
- cosyvoice/bin/average_model.py +93 -0
- cosyvoice/bin/export_jit.py +103 -0
- cosyvoice/bin/export_onnx.py +120 -0
- cosyvoice/bin/inference_deprecated.py +126 -0
- cosyvoice/bin/train.py +195 -0
- cosyvoice/cli/__init__.py +0 -0
- cosyvoice/cli/cosyvoice.py +209 -0
- cosyvoice/cli/frontend.py +238 -0
- cosyvoice/cli/model.py +386 -0
- cosyvoice/dataset/__init__.py +0 -0
- cosyvoice/dataset/dataset.py +151 -0
- cosyvoice/dataset/processor.py +434 -0
- cosyvoice/flow/decoder.py +494 -0
- cosyvoice/flow/flow.py +281 -0
- cosyvoice/flow/flow_matching.py +227 -0
- cosyvoice/flow/length_regulator.py +70 -0
- cosyvoice/hifigan/discriminator.py +230 -0
- cosyvoice/hifigan/f0_predictor.py +58 -0
- cosyvoice/hifigan/generator.py +582 -0
- cosyvoice/hifigan/hifigan.py +67 -0
- cosyvoice/llm/llm.py +610 -0
- cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +58836 -0
- cosyvoice/tokenizer/tokenizer.py +279 -0
- cosyvoice/transformer/__init__.py +0 -0
- cosyvoice/transformer/activation.py +84 -0
- cosyvoice/transformer/attention.py +330 -0
- cosyvoice/transformer/convolution.py +145 -0
- cosyvoice/transformer/decoder.py +396 -0
- cosyvoice/transformer/decoder_layer.py +132 -0
- cosyvoice/transformer/embedding.py +302 -0
- cosyvoice/transformer/encoder.py +474 -0
- cosyvoice/transformer/encoder_layer.py +236 -0
- cosyvoice/transformer/label_smoothing_loss.py +96 -0
- cosyvoice/transformer/positionwise_feed_forward.py +115 -0
- cosyvoice/transformer/subsampling.py +383 -0
- cosyvoice/transformer/upsample_encoder.py +320 -0
- cosyvoice/utils/__init__.py +0 -0
- cosyvoice/utils/class_utils.py +83 -0
- cosyvoice/utils/common.py +186 -0
- cosyvoice/utils/executor.py +176 -0
- cosyvoice/utils/file_utils.py +129 -0
- cosyvoice/utils/frontend_utils.py +136 -0
- cosyvoice/utils/losses.py +57 -0
- cosyvoice/utils/mask.py +265 -0
- cosyvoice/utils/scheduler.py +738 -0
- cosyvoice/utils/train_utils.py +367 -0
- cosyvoice/vllm/cosyvoice2.py +103 -0
- matcha/__init__.py +0 -0
- matcha/app.py +357 -0
- matcha/cli.py +418 -0
- matcha/hifigan/__init__.py +0 -0
- matcha/hifigan/config.py +28 -0
- matcha/hifigan/denoiser.py +64 -0
- matcha/hifigan/env.py +17 -0
- matcha/hifigan/meldataset.py +217 -0
- matcha/hifigan/models.py +368 -0
- matcha/hifigan/xutils.py +60 -0
- matcha/models/__init__.py +0 -0
- matcha/models/baselightningmodule.py +209 -0
- matcha/models/components/__init__.py +0 -0
- matcha/models/components/decoder.py +443 -0
- matcha/models/components/flow_matching.py +132 -0
- matcha/models/components/text_encoder.py +410 -0
- matcha/models/components/transformer.py +316 -0
- matcha/models/matcha_tts.py +239 -0
- matcha/onnx/__init__.py +0 -0
- matcha/onnx/export.py +181 -0
- matcha/onnx/infer.py +168 -0
- matcha/text/__init__.py +53 -0
- matcha/text/cleaners.py +116 -0
- matcha/text/numbers.py +71 -0
- matcha/text/symbols.py +17 -0
- matcha/train.py +122 -0
- matcha/utils/__init__.py +5 -0
- matcha/utils/audio.py +82 -0
- matcha/utils/generate_data_statistics.py +111 -0
- matcha/utils/instantiators.py +56 -0
- matcha/utils/logging_utils.py +53 -0
- matcha/utils/model.py +90 -0
- matcha/utils/monotonic_align/__init__.py +22 -0
- matcha/utils/monotonic_align/setup.py +7 -0
- matcha/utils/pylogger.py +21 -0
- matcha/utils/rich_utils.py +101 -0
- matcha/utils/utils.py +219 -0
- minicpmo/__init__.py +24 -0
- minicpmo/utils.py +636 -0
- minicpmo/version.py +2 -0
- minicpmo_utils-0.1.0.dist-info/METADATA +72 -0
- minicpmo_utils-0.1.0.dist-info/RECORD +148 -0
- minicpmo_utils-0.1.0.dist-info/WHEEL +5 -0
- minicpmo_utils-0.1.0.dist-info/top_level.txt +5 -0
- s3tokenizer/__init__.py +153 -0
- s3tokenizer/assets/BAC009S0764W0121.wav +0 -0
- s3tokenizer/assets/BAC009S0764W0122.wav +0 -0
- s3tokenizer/assets/mel_filters.npz +0 -0
- s3tokenizer/cli.py +183 -0
- s3tokenizer/model.py +546 -0
- s3tokenizer/model_v2.py +605 -0
- s3tokenizer/utils.py +390 -0
- stepaudio2/__init__.py +40 -0
- stepaudio2/cosyvoice2/__init__.py +1 -0
- stepaudio2/cosyvoice2/flow/__init__.py +0 -0
- stepaudio2/cosyvoice2/flow/decoder_dit.py +585 -0
- stepaudio2/cosyvoice2/flow/flow.py +230 -0
- stepaudio2/cosyvoice2/flow/flow_matching.py +205 -0
- stepaudio2/cosyvoice2/transformer/__init__.py +0 -0
- stepaudio2/cosyvoice2/transformer/attention.py +328 -0
- stepaudio2/cosyvoice2/transformer/embedding.py +119 -0
- stepaudio2/cosyvoice2/transformer/encoder_layer.py +163 -0
- stepaudio2/cosyvoice2/transformer/positionwise_feed_forward.py +56 -0
- stepaudio2/cosyvoice2/transformer/subsampling.py +79 -0
- stepaudio2/cosyvoice2/transformer/upsample_encoder_v2.py +483 -0
- stepaudio2/cosyvoice2/utils/__init__.py +1 -0
- stepaudio2/cosyvoice2/utils/class_utils.py +41 -0
- stepaudio2/cosyvoice2/utils/common.py +101 -0
- stepaudio2/cosyvoice2/utils/mask.py +49 -0
- stepaudio2/flashcosyvoice/__init__.py +0 -0
- stepaudio2/flashcosyvoice/cli.py +424 -0
- stepaudio2/flashcosyvoice/config.py +80 -0
- stepaudio2/flashcosyvoice/cosyvoice2.py +160 -0
- stepaudio2/flashcosyvoice/cosyvoice3.py +1 -0
- stepaudio2/flashcosyvoice/engine/__init__.py +0 -0
- stepaudio2/flashcosyvoice/engine/block_manager.py +114 -0
- stepaudio2/flashcosyvoice/engine/llm_engine.py +125 -0
- stepaudio2/flashcosyvoice/engine/model_runner.py +310 -0
- stepaudio2/flashcosyvoice/engine/scheduler.py +77 -0
- stepaudio2/flashcosyvoice/engine/sequence.py +90 -0
- stepaudio2/flashcosyvoice/modules/__init__.py +0 -0
- stepaudio2/flashcosyvoice/modules/flow.py +198 -0
- stepaudio2/flashcosyvoice/modules/flow_components/__init__.py +0 -0
- stepaudio2/flashcosyvoice/modules/flow_components/estimator.py +974 -0
- stepaudio2/flashcosyvoice/modules/flow_components/upsample_encoder.py +998 -0
- stepaudio2/flashcosyvoice/modules/hifigan.py +249 -0
- stepaudio2/flashcosyvoice/modules/hifigan_components/__init__.py +0 -0
- stepaudio2/flashcosyvoice/modules/hifigan_components/layers.py +433 -0
- stepaudio2/flashcosyvoice/modules/qwen2.py +92 -0
- stepaudio2/flashcosyvoice/modules/qwen2_components/__init__.py +0 -0
- stepaudio2/flashcosyvoice/modules/qwen2_components/layers.py +616 -0
- stepaudio2/flashcosyvoice/modules/sampler.py +231 -0
- stepaudio2/flashcosyvoice/utils/__init__.py +0 -0
- stepaudio2/flashcosyvoice/utils/audio.py +77 -0
- stepaudio2/flashcosyvoice/utils/context.py +28 -0
- stepaudio2/flashcosyvoice/utils/loader.py +116 -0
- stepaudio2/flashcosyvoice/utils/memory.py +19 -0
- stepaudio2/stepaudio2.py +204 -0
- stepaudio2/token2wav.py +248 -0
- stepaudio2/utils.py +91 -0
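
The listing above can be reproduced locally, since a wheel is just a zip archive. A minimal sketch using only the standard library (the local filename is an assumption about where the wheel was downloaded):

```python
# Hedged sketch: enumerate the files packaged in the wheel with the standard library.
# The local filename minicpmo_utils-0.1.0-py3-none-any.whl is an assumption.
import zipfile

with zipfile.ZipFile("minicpmo_utils-0.1.0-py3-none-any.whl") as whl:
    for info in whl.infolist():
        print(f"{info.filename}  ({info.file_size} bytes)")
```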

stepaudio2/cosyvoice2/transformer/subsampling.py
@@ -0,0 +1,79 @@
+# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu)
+#               2024 Alibaba Inc (Xiang Lyu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Modified from ESPnet(https://github.com/espnet/espnet)
+"""Subsampling layer definition."""
+
+from typing import Tuple, Union
+
+import torch
+
+
+class BaseSubsampling(torch.nn.Module):
+
+    def __init__(self):
+        super().__init__()
+        self.right_context = 0
+        self.subsampling_rate = 1
+
+    def position_encoding(self, offset: Union[int, torch.Tensor],
+                          size: int) -> torch.Tensor:
+        return self.pos_enc.position_encoding(offset, size)
+
+
+class LinearNoSubsampling(BaseSubsampling):
+    """Linear transform the input without subsampling
+
+    Args:
+        idim (int): Input dimension.
+        odim (int): Output dimension.
+        dropout_rate (float): Dropout rate.
+
+    """
+
+    def __init__(self, idim: int, odim: int, dropout_rate: float,
+                 pos_enc_class: torch.nn.Module):
+        """Construct an linear object."""
+        super().__init__()
+        self.out = torch.nn.Sequential(
+            torch.nn.Linear(idim, odim),
+            torch.nn.LayerNorm(odim, eps=1e-5),
+            torch.nn.Dropout(dropout_rate),
+        )
+        self.pos_enc = pos_enc_class
+        self.right_context = 0
+        self.subsampling_rate = 1
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        x_mask: torch.Tensor,
+        offset: Union[int, torch.Tensor] = 0
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Input x.
+
+        Args:
+            x (torch.Tensor): Input tensor (#batch, time, idim).
+            x_mask (torch.Tensor): Input mask (#batch, 1, time).
+
+        Returns:
+            torch.Tensor: linear input tensor (#batch, time', odim),
+                where time' = time.
+            torch.Tensor: linear input mask (#batch, 1, time'),
+                where time' = time.
+
+        """
+        x = self.out(x)
+        x, pos_emb = self.pos_enc(x, offset)
+        return x, pos_emb, x_mask
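
A minimal usage sketch for `LinearNoSubsampling` from the hunk above. The positional-encoding class and its `(size, dropout_rate)` constructor are taken from other files in this diff; treat the exact import paths as assumptions about the installed package layout.

```python
# Hedged sketch: project 80-dim frames to 512 dims with no temporal subsampling.
# Assumes EspnetRelPositionalEncoding(size, dropout_rate) returns (x, pos_emb)
# from its forward, as it is used elsewhere in this package.
import torch
from stepaudio2.cosyvoice2.transformer.embedding import EspnetRelPositionalEncoding
from stepaudio2.cosyvoice2.transformer.subsampling import LinearNoSubsampling

layer = LinearNoSubsampling(idim=80, odim=512, dropout_rate=0.1,
                            pos_enc_class=EspnetRelPositionalEncoding(512, 0.1))
x = torch.randn(2, 100, 80)                     # (batch, time, idim)
mask = torch.ones(2, 1, 100, dtype=torch.bool)  # (batch, 1, time)
y, pos_emb, y_mask = layer(x, mask)             # y: (2, 100, 512), time unchanged
```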

stepaudio2/cosyvoice2/transformer/upsample_encoder_v2.py
@@ -0,0 +1,483 @@
+# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu)
+#               2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn)
+#               2024 Alibaba Inc (Xiang Lyu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Modified from ESPnet(https://github.com/espnet/espnet)
+"""Encoder definition."""
+from typing import Tuple, List
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from stepaudio2.cosyvoice2.transformer.encoder_layer import ConformerEncoderLayer
+from stepaudio2.cosyvoice2.transformer.positionwise_feed_forward import PositionwiseFeedForward
+from stepaudio2.cosyvoice2.utils.class_utils import (
+    COSYVOICE_EMB_CLASSES,
+    COSYVOICE_SUBSAMPLE_CLASSES,
+    COSYVOICE_ATTENTION_CLASSES,
+    COSYVOICE_ACTIVATION_CLASSES,
+)
+from stepaudio2.cosyvoice2.utils.mask import (
+    make_pad_mask,
+)
+
+import torch._dynamo
+torch._dynamo.config.suppress_errors = True
+torch._dynamo.config.cache_size_limit = 128
+
+class Upsample1D(nn.Module):
+    """A 1D upsampling layer with an optional convolution.
+
+    Parameters:
+        channels (`int`):
+            number of channels in the inputs and outputs.
+        use_conv (`bool`, default `False`):
+            option to use a convolution.
+        use_conv_transpose (`bool`, default `False`):
+            option to use a convolution transpose.
+        out_channels (`int`, optional):
+            number of output channels. Defaults to `channels`.
+    """
+
+    def __init__(self, channels: int, out_channels: int, stride: int = 2, scale_factor: float = None):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels
+        self.stride = stride
+        # In this mode, first repeat interpolate, then conv with stride=1
+        self.conv = nn.Conv1d(self.channels, self.out_channels, stride * 2 + 1, stride=1, padding=0)
+        self.scale_factor = float(self.stride) if scale_factor is None else float(scale_factor)
+
+    def forward(self, inputs: torch.Tensor, input_lengths: torch.Tensor):
+        outputs = F.interpolate(inputs, scale_factor=self.scale_factor, mode="nearest")
+        outputs = F.pad(outputs, (self.stride * 2, 0), value=0.0)
+        outputs = self.conv(outputs)
+        return outputs, input_lengths * self.stride
+
+    def forward_chunk(self, inputs: torch.Tensor, input_lengths: torch.Tensor, cache: torch.Tensor = torch.zeros((0, 0, 0))):
+        """
+        Args:
+            inputs(torch.Tensor): shape (b, c, t)
+            input_length(torch.Tensor): shape (b), can be None
+            cache(torch.Tensor): shape (b, c, cache_t), where cache_t = stride * 2
+        """
+        outputs = F.interpolate(inputs, scale_factor=self.scale_factor, mode="nearest")
+
+        if cache is None:
+            cache = inputs.new_zeros(inputs.shape[0], inputs.shape[1], self.stride * 2)
+        outputs = torch.cat([cache, outputs], dim=2)
+        new_cache = outputs[..., -self.stride*2:]
+        outputs = self.conv(outputs)
+
+        if input_lengths is not None:
+            input_lengths = input_lengths * self.stride
+        return outputs, input_lengths, new_cache
+
+
+class PreLookaheadLayer(nn.Module):
+    def __init__(self, channels: int, pre_lookahead_len: int = 1):
+        super().__init__()
+        self.channels = channels
+        self.pre_lookahead_len = pre_lookahead_len
+        self.conv1 = nn.Conv1d(
+            channels, channels,
+            kernel_size=pre_lookahead_len + 1,
+            stride=1, padding=0,
+        )
+        self.conv2 = nn.Conv1d(
+            channels, channels,
+            kernel_size=3, stride=1, padding=0,
+        )
+
+    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
+        """
+        inputs: (batch_size, seq_len, channels)
+        """
+        outputs = inputs.transpose(1, 2).contiguous()
+        # look ahead
+        outputs = F.pad(outputs, (0, self.pre_lookahead_len), mode='constant', value=0.0)
+        outputs = F.leaky_relu(self.conv1(outputs))
+        # outputs
+        outputs = F.pad(outputs, (2, 0), mode='constant', value=0.0)
+        outputs = self.conv2(outputs)
+        outputs = outputs.transpose(1, 2).contiguous()
+
+        # residual connection
+        outputs = outputs + inputs
+        return outputs
+
+    def forward_chunk(self, inputs: torch.Tensor, cache: torch.Tensor = None):
+        """
+        Args:
+            inputs(torch.Tensor): shape (b, t, c)
+            cache(torch.Tensor): shape (b, c, cache_t=2), c = channels
+        """
+        outputs = inputs.transpose(1, 2).contiguous()
+        outputs = F.leaky_relu(self.conv1(outputs))
+        # the length of outputs is input length - pre_lookahead_len
+        if cache is None:
+            cache = outputs.new_zeros(outputs.shape[0], outputs.shape[1], 2)
+        # NOTE
+        new_cache = outputs[..., -2:]
+        outputs = torch.cat([cache, outputs], dim=2)
+        outputs = self.conv2(outputs)
+        outputs = outputs.transpose(1, 2).contiguous()
+        # residual connection
+        outputs = outputs + inputs[:, :-self.pre_lookahead_len]
+        return outputs, new_cache
+
+
+"""Customize each sample's chunk attention mask
+"""
+class UpsampleConformerEncoderV2(torch.nn.Module):
+
+    def __init__(
+        self,
+        # input & output
+        input_size: int,
+        output_size: int = 256,
+        input_layer: str = "linear",
+        pre_lookahead_len: int = 3,
+        # size
+        num_blocks: int = 6,
+        num_up_blocks: int = 4,
+        # upsampling
+        up_stride: int = 2,
+        up_scale_factor: float = 2,
+        # attention
+        attention_heads: int = 4,
+        pos_enc_layer_type: str = "rel_pos_espnet",
+        selfattention_layer_type: str = "rel_selfattn",
+        key_bias: bool = True,
+        # mlp
+        linear_units: int = 2048,
+        # dropouts
+        dropout_rate: float = 0.1,
+        positional_dropout_rate: float = 0.1,
+        attention_dropout_rate: float = 0.0,
+        # other
+        normalize_before: bool = True,
+        activation_type: str = "swish",
+        **kwargs,
+    ):
+        super().__init__()
+        self._output_size = output_size
+        self.embed = COSYVOICE_SUBSAMPLE_CLASSES[input_layer](
+            input_size,
+            output_size,
+            dropout_rate,
+            COSYVOICE_EMB_CLASSES[pos_enc_layer_type](
+                output_size,
+                positional_dropout_rate
+            ),
+        )
+
+        self.normalize_before = normalize_before
+        self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5)
+        activation = COSYVOICE_ACTIVATION_CLASSES[activation_type]()
+        # self-attention module definition
+        encoder_selfattn_layer_args = (
+            attention_heads,
+            output_size,
+            attention_dropout_rate,
+            key_bias,
+        )
+        # feed-forward module definition
+        positionwise_layer_args = (
+            output_size,
+            linear_units,
+            dropout_rate,
+            activation,
+        )
+        self.pre_lookahead_layer = PreLookaheadLayer(
+            channels=output_size,
+            pre_lookahead_len=pre_lookahead_len
+        )
+        self.encoders = torch.nn.ModuleList([
+            ConformerEncoderLayer(
+                output_size,
+                COSYVOICE_ATTENTION_CLASSES[selfattention_layer_type](
+                    *encoder_selfattn_layer_args
+                ),
+                PositionwiseFeedForward(*positionwise_layer_args),
+                None,
+                None,
+                dropout_rate,
+                normalize_before,
+            ) for _ in range(num_blocks)
+        ])
+        self.up_layer = Upsample1D(
+            channels=output_size,
+            out_channels=output_size,
+            stride=up_stride,
+            scale_factor=up_scale_factor
+        )
+        self.up_embed = COSYVOICE_SUBSAMPLE_CLASSES[input_layer](
+            input_size,
+            output_size,
+            dropout_rate,
+            COSYVOICE_EMB_CLASSES[pos_enc_layer_type](
+                output_size,
+                positional_dropout_rate
+            ),
+        )
+        self.up_encoders = torch.nn.ModuleList([
+            ConformerEncoderLayer(
+                output_size,
+                COSYVOICE_ATTENTION_CLASSES[selfattention_layer_type](
+                    *encoder_selfattn_layer_args
+                ),
+                PositionwiseFeedForward(*positionwise_layer_args),
+                None,
+                None,
+                dropout_rate,
+                normalize_before,
+            ) for _ in range(num_up_blocks)
+        ])
+
+        self.enable_cuda_graph = False
+        self.use_cuda_graph = False
+        self.graph_encoder = {}
+        self.graph_up_encoder = {}
+        self.inference_buffers_encoder = {}
+        self.inference_buffers_up_encoder = {}
+        self.max_static_time = 1500
+
+    # FIXME(sfy) revert hard-coded bfloat16
+    # this method is skipped in CausalMaskedDiffWithXvec.scatter_cuda_graph
+    def scatter_cuda_graph(self, enable_cuda_graph: bool):
+        self.enable_cuda_graph = enable_cuda_graph
+        if self.enable_cuda_graph:
+            self._init_cuda_graph()
+
+    def _init_cuda_graph(self):
+        """Initialize CUDA Graph."""
+
+        for l in range(100, 1500, 10):
+            static_x = torch.zeros((1, l, 512),
+                                   dtype=torch.float32, device=torch.device('cuda'))
+            static_mask = torch.ones((1, 1, l),
+                                     dtype=torch.bool, device=torch.device('cuda'))
+            static_pos_emb = torch.zeros((1, 2*l-1, 512),
+                                         dtype=torch.float32, device=torch.device('cuda'))
+
+            static_inputs = [
+                static_x,
+                static_mask,
+                static_pos_emb,
+            ]
+
+            self._forward_impl_encoder(
+                static_inputs[0],
+                static_inputs[1],
+                static_inputs[2],
+            )
+            graph = torch.cuda.CUDAGraph()
+            with torch.no_grad():
+                with torch.cuda.graph(graph):
+                    static_out_x = self._forward_impl_encoder(
+                        static_inputs[0],
+                        static_inputs[1],
+                        static_inputs[2]
+                    )
+            self.graph_encoder[l] = graph
+            static_outputs = [
+                static_out_x,
+            ]
+            self.inference_buffers_encoder[l] = {
+                'static_inputs': static_inputs,
+                'static_outputs': static_outputs
+            }
+
+        for l in range(100, 1500, 10):
+            static_x = torch.zeros((1, l, 512),
+                                   dtype=torch.float32, device=torch.device('cuda'))
+            static_mask = torch.ones((1, 1, l),
+                                     dtype=torch.bool, device=torch.device('cuda'))
+            static_pos_emb = torch.zeros((1, 2*l-1, 512),
+                                         dtype=torch.float32, device=torch.device('cuda'))
+
+            static_inputs = [
+                static_x,
+                static_mask,
+                static_pos_emb,
+            ]
+
+            self._forward_impl_up_encoder(
+                static_inputs[0],
+                static_inputs[1],
+                static_inputs[2],
+            )
+            graph = torch.cuda.CUDAGraph()
+            with torch.no_grad():
+                with torch.cuda.graph(graph):
+                    static_out_x = self._forward_impl_up_encoder(
+                        static_inputs[0],
+                        static_inputs[1],
+                        static_inputs[2]
+                    )
+            self.graph_up_encoder[l] = graph
+            static_outputs = [
+                static_out_x,
+            ]
+            self.inference_buffers_up_encoder[l] = {
+                'static_inputs': static_inputs,
+                'static_outputs': static_outputs
+            }
+
+        self.use_cuda_graph = True
+        print("CUDA Graph initialized successfully for encoder and up_encoder")
+
+    # @torch.compile(dynamic=True,backend="eager")
+    def _forward_impl_encoder(self,
+                              x: torch.Tensor,
+                              mask: torch.Tensor,
+                              pos_emb: torch.Tensor):
+        for layer in self.encoders:
+            x, _, _, _ = layer(x, mask, pos_emb)
+        return x
+
+    # @torch.compile(dynamic=True,backend="eager")
+    def _forward_impl_up_encoder(self,
+                                 x: torch.Tensor,
+                                 mask: torch.Tensor,
+                                 pos_emb: torch.Tensor):
+        for layer in self.up_encoders:
+            x, _, _, _ = layer(x, mask, pos_emb)
+        return x
+
+    def output_size(self) -> int:
+        return self._output_size
+
+    # @torch.compile(dynamic=True,backend="eager")
+    def forward(
+        self,
+        xs: torch.Tensor,
+        xs_lens: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # (sfy) chunk training strategy should not be open-sourced
+        T = xs.size(1)
+        masks = ~make_pad_mask(xs_lens, T).unsqueeze(1)  # (B, 1, T)
+        xs, pos_emb, masks = self.embed(xs, masks)
+
+        # lookahead
+        xs = self.pre_lookahead_layer(xs)
+        # conformer block
+        if self.enable_cuda_graph and xs.shape[1] in self.graph_encoder:
+            self.inference_buffers_encoder[xs.shape[1]]['static_inputs'][0].copy_(xs)
+            self.inference_buffers_encoder[xs.shape[1]]['static_inputs'][1].copy_(masks)
+            self.inference_buffers_encoder[xs.shape[1]]['static_inputs'][2].copy_(pos_emb)
+            self.graph_encoder[xs.shape[1]].replay()
+            xs = self.inference_buffers_encoder[xs.shape[1]]['static_outputs'][0]
+        else:
+            xs = self._forward_impl_encoder(xs, masks, pos_emb)
+        # upsample
+        xs = xs.transpose(1, 2).contiguous()
+        xs, xs_lens = self.up_layer(xs, xs_lens)
+        xs = xs.transpose(1, 2).contiguous()
+
+        # 2nd conformer block
+        T = xs.size(1)
+        masks = ~make_pad_mask(xs_lens, T).unsqueeze(1)  # (B, 1, T)
+        xs, pos_emb, masks = self.up_embed(xs, masks)
+        if self.enable_cuda_graph and xs.shape[1] in self.graph_up_encoder:
+            self.inference_buffers_up_encoder[xs.shape[1]]['static_inputs'][0].copy_(xs)
+            self.inference_buffers_up_encoder[xs.shape[1]]['static_inputs'][1].copy_(masks)
+            self.inference_buffers_up_encoder[xs.shape[1]]['static_inputs'][2].copy_(pos_emb)
+            self.graph_up_encoder[xs.shape[1]].replay()
+            xs = self.inference_buffers_up_encoder[xs.shape[1]]['static_outputs'][0]
+        else:
+            xs = self._forward_impl_up_encoder(xs, masks, pos_emb)
+        # post norm
+        if self.normalize_before:
+            xs = self.after_norm(xs)
+        return xs, masks
+
+    # @torch.compile(dynamic=True,backend="eager")
+    def forward_chunk(self,
+                      xs: torch.Tensor,
+                      last_chunk: bool = False,
+                      cnn_cache: torch.Tensor = None,
+                      att_cache: torch.Tensor = None,
+                      ):
+        """
+        Args:
+            xs: shape (b, dt, c)
+            last_chunk: bool. If last chunk, will pad input with lookaheads
+            att_cache: shape (depth1+depth2, b, nh, 2*t1, c).
+            cnn_cache: shape (b, c, t1+t2). Where t1=2 (pre_lookahead_layer), t2=4 (up_layer)
+        """
+        if att_cache is not None:
+            assert att_cache.shape[3] % 2 == 0, att_cache.shape
+        if cnn_cache is not None:
+            assert cnn_cache.shape[2] == 2+self.up_layer.stride*2, cnn_cache.shape
+
+        # unpack caches
+        offset1 = att_cache.shape[3] // 2 if att_cache is not None else 0
+        att_cache1 = att_cache[:len(self.encoders), :, :, :offset1] if att_cache is not None else [None] * len(self.encoders)
+        att_cache2 = att_cache[len(self.encoders):] if att_cache is not None else [None] * len(self.encoders)
+        cnn_cache1 = cnn_cache[:, :, :2] if cnn_cache is not None else None
+        cnn_cache2 = cnn_cache[:, :, 2:] if cnn_cache is not None else None
+        xs, _, _ = self.embed(xs, None)
+        if last_chunk:
+            xs = F.pad(xs, (0, 0, 0, self.pre_lookahead_layer.pre_lookahead_len))
+
+        # this_cnn_cache: shape (b=1, c=512, t=2)
+        xs, new_cnn_cache1 = self.pre_lookahead_layer.forward_chunk(xs, cache=cnn_cache1)
+
+        # remake pos_emb, offset param is ignored by position_encoding
+        pos_emb = self.embed.position_encoding(offset=None, size=offset1 + xs.shape[1])
+
+        # first conformer
+        chunk_masks = torch.zeros((0, 0, 0))
+        new_att_cache1 = []
+
+        for idx, layer in enumerate(self.encoders):
+            # this_att_cache: shape (b, nh, t, c * 2)
+            xs, _, this_new_att_cache1, _ = layer(xs, chunk_masks, pos_emb, att_cache=att_cache1[idx])
+            new_att_cache1.append(this_new_att_cache1)
+        new_att_cache1 = torch.stack(new_att_cache1, dim=0)
+
+        # upsample + conformer encoder, xs: (b, t, c) -> (b, c, t)
+        xs = xs.transpose(1, 2).contiguous()
+        # this_cnn_cache: shape (b=1, c=512, t=2*2)
+        xs, _, new_cnn_cache2 = self.up_layer.forward_chunk(xs, None, cache=cnn_cache2)
+        xs = xs.transpose(1, 2).contiguous()
+
+        # at this time, xs are doubled in length
+        xs, _, _ = self.up_embed(xs, None)
+
+        # remake pos_emb
+        pos_emb = self.embed.position_encoding(offset=None, size=offset1 * self.up_layer.stride + xs.shape[1])
+
+        # second conformer
+        chunk_masks = torch.zeros((0, 0, 0), dtype=torch.bfloat16)
+        new_att_cache2 = []
+
+        for idx, layer in enumerate(self.up_encoders):
+            xs, _, this_new_att_cache2, _ = layer(xs, chunk_masks, pos_emb, att_cache=att_cache2[idx])
+            new_att_cache2.append(this_new_att_cache2)
+        new_att_cache2 = torch.stack(new_att_cache2, dim=0)
+
+        if self.normalize_before:
+            xs = self.after_norm(xs)
+
+        # pack new cache
+        new_att_cache = torch.cat([new_att_cache1.repeat(1, 1, 1, 2, 1), new_att_cache2], dim=0)
+        new_cnn_cache = torch.cat([new_cnn_cache1, new_cnn_cache2], dim=2)
+
+        return xs, new_cnn_cache, new_att_cache
+
+
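
The length bookkeeping in `Upsample1D` above is easy to check numerically: nearest-neighbor interpolation doubles the frame count, then a left padding of `stride * 2` plus a kernel of `stride * 2 + 1` (stride 1, no padding) yields exactly `2 * T` output frames. A small sketch, assuming the module is importable as `stepaudio2.cosyvoice2.transformer.upsample_encoder_v2`:

```python
# Hedged sketch: verify that Upsample1D(stride=2) doubles the time dimension.
import torch
from stepaudio2.cosyvoice2.transformer.upsample_encoder_v2 import Upsample1D

up = Upsample1D(channels=512, out_channels=512, stride=2)
x = torch.randn(1, 512, 50)       # (batch, channels, T)
lengths = torch.tensor([50])
y, new_lengths = up(x, lengths)
print(y.shape, new_lengths)       # torch.Size([1, 512, 100]) tensor([100])
```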

stepaudio2/cosyvoice2/utils/__init__.py
@@ -0,0 +1 @@
+"""CosyVoice2 utils subpackage."""

stepaudio2/cosyvoice2/utils/class_utils.py
@@ -0,0 +1,41 @@
+# Copyright [2023-11-28] <sxc19@mails.tsinghua.edu.cn, Xingchen Song>
+#            2024 Alibaba Inc (authors: Xiang Lyu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+
+from stepaudio2.cosyvoice2.transformer.subsampling import LinearNoSubsampling
+from stepaudio2.cosyvoice2.transformer.attention import RelPositionMultiHeadedAttention
+from stepaudio2.cosyvoice2.transformer.embedding import EspnetRelPositionalEncoding
+
+
+COSYVOICE_ACTIVATION_CLASSES = {
+    "hardtanh": torch.nn.Hardtanh,
+    "tanh": torch.nn.Tanh,
+    "relu": torch.nn.ReLU,
+    "selu": torch.nn.SELU,
+    "swish": torch.nn.SiLU,
+    "gelu": torch.nn.GELU,
+}
+
+COSYVOICE_SUBSAMPLE_CLASSES = {
+    "linear": LinearNoSubsampling,
+}
+
+COSYVOICE_EMB_CLASSES = {
+    "rel_pos_espnet": EspnetRelPositionalEncoding,
+}
+
+COSYVOICE_ATTENTION_CLASSES = {
+    "rel_selfattn": RelPositionMultiHeadedAttention,
+}