minicpmo-utils 0.1.0 (minicpmo_utils-0.1.0-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148)
  1. cosyvoice/__init__.py +17 -0
  2. cosyvoice/bin/average_model.py +93 -0
  3. cosyvoice/bin/export_jit.py +103 -0
  4. cosyvoice/bin/export_onnx.py +120 -0
  5. cosyvoice/bin/inference_deprecated.py +126 -0
  6. cosyvoice/bin/train.py +195 -0
  7. cosyvoice/cli/__init__.py +0 -0
  8. cosyvoice/cli/cosyvoice.py +209 -0
  9. cosyvoice/cli/frontend.py +238 -0
  10. cosyvoice/cli/model.py +386 -0
  11. cosyvoice/dataset/__init__.py +0 -0
  12. cosyvoice/dataset/dataset.py +151 -0
  13. cosyvoice/dataset/processor.py +434 -0
  14. cosyvoice/flow/decoder.py +494 -0
  15. cosyvoice/flow/flow.py +281 -0
  16. cosyvoice/flow/flow_matching.py +227 -0
  17. cosyvoice/flow/length_regulator.py +70 -0
  18. cosyvoice/hifigan/discriminator.py +230 -0
  19. cosyvoice/hifigan/f0_predictor.py +58 -0
  20. cosyvoice/hifigan/generator.py +582 -0
  21. cosyvoice/hifigan/hifigan.py +67 -0
  22. cosyvoice/llm/llm.py +610 -0
  23. cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +58836 -0
  24. cosyvoice/tokenizer/tokenizer.py +279 -0
  25. cosyvoice/transformer/__init__.py +0 -0
  26. cosyvoice/transformer/activation.py +84 -0
  27. cosyvoice/transformer/attention.py +330 -0
  28. cosyvoice/transformer/convolution.py +145 -0
  29. cosyvoice/transformer/decoder.py +396 -0
  30. cosyvoice/transformer/decoder_layer.py +132 -0
  31. cosyvoice/transformer/embedding.py +302 -0
  32. cosyvoice/transformer/encoder.py +474 -0
  33. cosyvoice/transformer/encoder_layer.py +236 -0
  34. cosyvoice/transformer/label_smoothing_loss.py +96 -0
  35. cosyvoice/transformer/positionwise_feed_forward.py +115 -0
  36. cosyvoice/transformer/subsampling.py +383 -0
  37. cosyvoice/transformer/upsample_encoder.py +320 -0
  38. cosyvoice/utils/__init__.py +0 -0
  39. cosyvoice/utils/class_utils.py +83 -0
  40. cosyvoice/utils/common.py +186 -0
  41. cosyvoice/utils/executor.py +176 -0
  42. cosyvoice/utils/file_utils.py +129 -0
  43. cosyvoice/utils/frontend_utils.py +136 -0
  44. cosyvoice/utils/losses.py +57 -0
  45. cosyvoice/utils/mask.py +265 -0
  46. cosyvoice/utils/scheduler.py +738 -0
  47. cosyvoice/utils/train_utils.py +367 -0
  48. cosyvoice/vllm/cosyvoice2.py +103 -0
  49. matcha/__init__.py +0 -0
  50. matcha/app.py +357 -0
  51. matcha/cli.py +418 -0
  52. matcha/hifigan/__init__.py +0 -0
  53. matcha/hifigan/config.py +28 -0
  54. matcha/hifigan/denoiser.py +64 -0
  55. matcha/hifigan/env.py +17 -0
  56. matcha/hifigan/meldataset.py +217 -0
  57. matcha/hifigan/models.py +368 -0
  58. matcha/hifigan/xutils.py +60 -0
  59. matcha/models/__init__.py +0 -0
  60. matcha/models/baselightningmodule.py +209 -0
  61. matcha/models/components/__init__.py +0 -0
  62. matcha/models/components/decoder.py +443 -0
  63. matcha/models/components/flow_matching.py +132 -0
  64. matcha/models/components/text_encoder.py +410 -0
  65. matcha/models/components/transformer.py +316 -0
  66. matcha/models/matcha_tts.py +239 -0
  67. matcha/onnx/__init__.py +0 -0
  68. matcha/onnx/export.py +181 -0
  69. matcha/onnx/infer.py +168 -0
  70. matcha/text/__init__.py +53 -0
  71. matcha/text/cleaners.py +116 -0
  72. matcha/text/numbers.py +71 -0
  73. matcha/text/symbols.py +17 -0
  74. matcha/train.py +122 -0
  75. matcha/utils/__init__.py +5 -0
  76. matcha/utils/audio.py +82 -0
  77. matcha/utils/generate_data_statistics.py +111 -0
  78. matcha/utils/instantiators.py +56 -0
  79. matcha/utils/logging_utils.py +53 -0
  80. matcha/utils/model.py +90 -0
  81. matcha/utils/monotonic_align/__init__.py +22 -0
  82. matcha/utils/monotonic_align/setup.py +7 -0
  83. matcha/utils/pylogger.py +21 -0
  84. matcha/utils/rich_utils.py +101 -0
  85. matcha/utils/utils.py +219 -0
  86. minicpmo/__init__.py +24 -0
  87. minicpmo/utils.py +636 -0
  88. minicpmo/version.py +2 -0
  89. minicpmo_utils-0.1.0.dist-info/METADATA +72 -0
  90. minicpmo_utils-0.1.0.dist-info/RECORD +148 -0
  91. minicpmo_utils-0.1.0.dist-info/WHEEL +5 -0
  92. minicpmo_utils-0.1.0.dist-info/top_level.txt +5 -0
  93. s3tokenizer/__init__.py +153 -0
  94. s3tokenizer/assets/BAC009S0764W0121.wav +0 -0
  95. s3tokenizer/assets/BAC009S0764W0122.wav +0 -0
  96. s3tokenizer/assets/mel_filters.npz +0 -0
  97. s3tokenizer/cli.py +183 -0
  98. s3tokenizer/model.py +546 -0
  99. s3tokenizer/model_v2.py +605 -0
  100. s3tokenizer/utils.py +390 -0
  101. stepaudio2/__init__.py +40 -0
  102. stepaudio2/cosyvoice2/__init__.py +1 -0
  103. stepaudio2/cosyvoice2/flow/__init__.py +0 -0
  104. stepaudio2/cosyvoice2/flow/decoder_dit.py +585 -0
  105. stepaudio2/cosyvoice2/flow/flow.py +230 -0
  106. stepaudio2/cosyvoice2/flow/flow_matching.py +205 -0
  107. stepaudio2/cosyvoice2/transformer/__init__.py +0 -0
  108. stepaudio2/cosyvoice2/transformer/attention.py +328 -0
  109. stepaudio2/cosyvoice2/transformer/embedding.py +119 -0
  110. stepaudio2/cosyvoice2/transformer/encoder_layer.py +163 -0
  111. stepaudio2/cosyvoice2/transformer/positionwise_feed_forward.py +56 -0
  112. stepaudio2/cosyvoice2/transformer/subsampling.py +79 -0
  113. stepaudio2/cosyvoice2/transformer/upsample_encoder_v2.py +483 -0
  114. stepaudio2/cosyvoice2/utils/__init__.py +1 -0
  115. stepaudio2/cosyvoice2/utils/class_utils.py +41 -0
  116. stepaudio2/cosyvoice2/utils/common.py +101 -0
  117. stepaudio2/cosyvoice2/utils/mask.py +49 -0
  118. stepaudio2/flashcosyvoice/__init__.py +0 -0
  119. stepaudio2/flashcosyvoice/cli.py +424 -0
  120. stepaudio2/flashcosyvoice/config.py +80 -0
  121. stepaudio2/flashcosyvoice/cosyvoice2.py +160 -0
  122. stepaudio2/flashcosyvoice/cosyvoice3.py +1 -0
  123. stepaudio2/flashcosyvoice/engine/__init__.py +0 -0
  124. stepaudio2/flashcosyvoice/engine/block_manager.py +114 -0
  125. stepaudio2/flashcosyvoice/engine/llm_engine.py +125 -0
  126. stepaudio2/flashcosyvoice/engine/model_runner.py +310 -0
  127. stepaudio2/flashcosyvoice/engine/scheduler.py +77 -0
  128. stepaudio2/flashcosyvoice/engine/sequence.py +90 -0
  129. stepaudio2/flashcosyvoice/modules/__init__.py +0 -0
  130. stepaudio2/flashcosyvoice/modules/flow.py +198 -0
  131. stepaudio2/flashcosyvoice/modules/flow_components/__init__.py +0 -0
  132. stepaudio2/flashcosyvoice/modules/flow_components/estimator.py +974 -0
  133. stepaudio2/flashcosyvoice/modules/flow_components/upsample_encoder.py +998 -0
  134. stepaudio2/flashcosyvoice/modules/hifigan.py +249 -0
  135. stepaudio2/flashcosyvoice/modules/hifigan_components/__init__.py +0 -0
  136. stepaudio2/flashcosyvoice/modules/hifigan_components/layers.py +433 -0
  137. stepaudio2/flashcosyvoice/modules/qwen2.py +92 -0
  138. stepaudio2/flashcosyvoice/modules/qwen2_components/__init__.py +0 -0
  139. stepaudio2/flashcosyvoice/modules/qwen2_components/layers.py +616 -0
  140. stepaudio2/flashcosyvoice/modules/sampler.py +231 -0
  141. stepaudio2/flashcosyvoice/utils/__init__.py +0 -0
  142. stepaudio2/flashcosyvoice/utils/audio.py +77 -0
  143. stepaudio2/flashcosyvoice/utils/context.py +28 -0
  144. stepaudio2/flashcosyvoice/utils/loader.py +116 -0
  145. stepaudio2/flashcosyvoice/utils/memory.py +19 -0
  146. stepaudio2/stepaudio2.py +204 -0
  147. stepaudio2/token2wav.py +248 -0
  148. stepaudio2/utils.py +91 -0
stepaudio2/cosyvoice2/transformer/subsampling.py
@@ -0,0 +1,79 @@
+ # Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu)
+ #               2024 Alibaba Inc (Xiang Lyu)
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #   http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # Modified from ESPnet(https://github.com/espnet/espnet)
+ """Subsampling layer definition."""
+
+ from typing import Tuple, Union
+
+ import torch
+
+
+ class BaseSubsampling(torch.nn.Module):
+
+     def __init__(self):
+         super().__init__()
+         self.right_context = 0
+         self.subsampling_rate = 1
+
+     def position_encoding(self, offset: Union[int, torch.Tensor],
+                           size: int) -> torch.Tensor:
+         return self.pos_enc.position_encoding(offset, size)
+
+
+ class LinearNoSubsampling(BaseSubsampling):
+     """Linear transform the input without subsampling.
+
+     Args:
+         idim (int): Input dimension.
+         odim (int): Output dimension.
+         dropout_rate (float): Dropout rate.
+
+     """
+
+     def __init__(self, idim: int, odim: int, dropout_rate: float,
+                  pos_enc_class: torch.nn.Module):
+         """Construct a linear object."""
+         super().__init__()
+         self.out = torch.nn.Sequential(
+             torch.nn.Linear(idim, odim),
+             torch.nn.LayerNorm(odim, eps=1e-5),
+             torch.nn.Dropout(dropout_rate),
+         )
+         self.pos_enc = pos_enc_class
+         self.right_context = 0
+         self.subsampling_rate = 1
+
+     def forward(
+         self,
+         x: torch.Tensor,
+         x_mask: torch.Tensor,
+         offset: Union[int, torch.Tensor] = 0
+     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+         """Input x.
+
+         Args:
+             x (torch.Tensor): Input tensor (#batch, time, idim).
+             x_mask (torch.Tensor): Input mask (#batch, 1, time).
+
+         Returns:
+             torch.Tensor: linear output tensor (#batch, time', odim),
+                 where time' = time.
+             torch.Tensor: linear output mask (#batch, 1, time'),
+                 where time' = time.
+
+         """
+         x = self.out(x)
+         x, pos_emb = self.pos_enc(x, offset)
+         return x, pos_emb, x_mask
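For orientation, here is a minimal sketch of how LinearNoSubsampling is driven. The DummyPosEnc class below is a hypothetical stand-in that only mimics the (x, offset) -> (x, pos_emb) interface the layer expects; in the package it is paired with EspnetRelPositionalEncoding via the registries in class_utils.py, and the dimensions here are illustrative.

import torch
from stepaudio2.cosyvoice2.transformer.subsampling import LinearNoSubsampling

class DummyPosEnc(torch.nn.Module):
    # Hypothetical stand-in: returns the input unchanged plus a zero
    # relative positional embedding of shape (1, 2*T-1, odim).
    def forward(self, x, offset=0):
        return x, x.new_zeros(1, 2 * x.size(1) - 1, x.size(2))

embed = LinearNoSubsampling(idim=80, odim=512, dropout_rate=0.1,
                            pos_enc_class=DummyPosEnc())
x = torch.randn(2, 50, 80)                      # (batch, time, idim)
mask = torch.ones(2, 1, 50, dtype=torch.bool)   # (batch, 1, time)
y, pos_emb, mask = embed(x, mask)
print(y.shape)  # (2, 50, 512): time is unchanged, only the feature dim is projected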
stepaudio2/cosyvoice2/transformer/upsample_encoder_v2.py
@@ -0,0 +1,483 @@
+ # Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu)
+ #               2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn)
+ #               2024 Alibaba Inc (Xiang Lyu)
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #   http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # Modified from ESPnet(https://github.com/espnet/espnet)
+ """Encoder definition."""
+ from typing import Tuple, List
+
+ import torch
+ from torch import nn
+ from torch.nn import functional as F
+
+ from stepaudio2.cosyvoice2.transformer.encoder_layer import ConformerEncoderLayer
+ from stepaudio2.cosyvoice2.transformer.positionwise_feed_forward import PositionwiseFeedForward
+ from stepaudio2.cosyvoice2.utils.class_utils import (
+     COSYVOICE_EMB_CLASSES,
+     COSYVOICE_SUBSAMPLE_CLASSES,
+     COSYVOICE_ATTENTION_CLASSES,
+     COSYVOICE_ACTIVATION_CLASSES,
+ )
+ from stepaudio2.cosyvoice2.utils.mask import (
+     make_pad_mask,
+ )
+
+ import torch._dynamo
+ torch._dynamo.config.suppress_errors = True
+ torch._dynamo.config.cache_size_limit = 128
+
+ class Upsample1D(nn.Module):
+     """A 1D upsampling layer followed by a causal convolution.
+
+     Parameters:
+         channels (`int`):
+             number of channels in the inputs.
+         out_channels (`int`):
+             number of channels in the outputs.
+         stride (`int`, default `2`):
+             upsampling factor applied to the sequence length.
+         scale_factor (`float`, optional):
+             interpolation scale factor. Defaults to `stride`.
+     """
+
+     def __init__(self, channels: int, out_channels: int, stride: int = 2, scale_factor: float = None):
+         super().__init__()
+         self.channels = channels
+         self.out_channels = out_channels
+         self.stride = stride
+         # In this mode, first repeat-interpolate, then conv with stride=1
+         self.conv = nn.Conv1d(self.channels, self.out_channels, stride * 2 + 1, stride=1, padding=0)
+         self.scale_factor = float(self.stride) if scale_factor is None else float(scale_factor)
+
+     def forward(self, inputs: torch.Tensor, input_lengths: torch.Tensor):
+         outputs = F.interpolate(inputs, scale_factor=self.scale_factor, mode="nearest")
+         outputs = F.pad(outputs, (self.stride * 2, 0), value=0.0)
+         outputs = self.conv(outputs)
+         return outputs, input_lengths * self.stride
+
+     def forward_chunk(self, inputs: torch.Tensor, input_lengths: torch.Tensor, cache: torch.Tensor = None):
+         """
+         Args:
+             inputs (torch.Tensor): shape (b, c, t)
+             input_lengths (torch.Tensor): shape (b), can be None
+             cache (torch.Tensor): shape (b, c, cache_t), where cache_t = stride * 2
+         """
+         outputs = F.interpolate(inputs, scale_factor=self.scale_factor, mode="nearest")
+
+         if cache is None:
+             cache = inputs.new_zeros(inputs.shape[0], inputs.shape[1], self.stride * 2)
+         outputs = torch.cat([cache, outputs], dim=2)
+         new_cache = outputs[..., -self.stride * 2:]
+         outputs = self.conv(outputs)
+
+         if input_lengths is not None:
+             input_lengths = input_lengths * self.stride
+         return outputs, input_lengths, new_cache
+
+
+ class PreLookaheadLayer(nn.Module):
+     def __init__(self, channels: int, pre_lookahead_len: int = 1):
+         super().__init__()
+         self.channels = channels
+         self.pre_lookahead_len = pre_lookahead_len
+         self.conv1 = nn.Conv1d(
+             channels, channels,
+             kernel_size=pre_lookahead_len + 1,
+             stride=1, padding=0,
+         )
+         self.conv2 = nn.Conv1d(
+             channels, channels,
+             kernel_size=3, stride=1, padding=0,
+         )
+
+     def forward(self, inputs: torch.Tensor) -> torch.Tensor:
+         """
+         inputs: (batch_size, seq_len, channels)
+         """
+         outputs = inputs.transpose(1, 2).contiguous()
+         # look ahead
+         outputs = F.pad(outputs, (0, self.pre_lookahead_len), mode='constant', value=0.0)
+         outputs = F.leaky_relu(self.conv1(outputs))
+         # causal left padding for conv2 (kernel_size=3)
+         outputs = F.pad(outputs, (2, 0), mode='constant', value=0.0)
+         outputs = self.conv2(outputs)
+         outputs = outputs.transpose(1, 2).contiguous()
+
+         # residual connection
+         outputs = outputs + inputs
+         return outputs
+
+     def forward_chunk(self, inputs: torch.Tensor, cache: torch.Tensor = None):
+         """
+         Args:
+             inputs (torch.Tensor): shape (b, t, c)
+             cache (torch.Tensor): shape (b, c, cache_t=2), c = channels
+         """
+         outputs = inputs.transpose(1, 2).contiguous()
+         outputs = F.leaky_relu(self.conv1(outputs))
+         # the length of outputs is input length - pre_lookahead_len
+         if cache is None:
+             cache = outputs.new_zeros(outputs.shape[0], outputs.shape[1], 2)
+         # NOTE: keep the last 2 frames as left context for conv2 (kernel_size=3) in the next chunk
+         new_cache = outputs[..., -2:]
+         outputs = torch.cat([cache, outputs], dim=2)
+         outputs = self.conv2(outputs)
+         outputs = outputs.transpose(1, 2).contiguous()
+         # residual connection
+         outputs = outputs + inputs[:, :-self.pre_lookahead_len]
+         return outputs, new_cache
+
+
+ """Customize each sample's chunk attention mask
+ """
+ class UpsampleConformerEncoderV2(torch.nn.Module):
+
+     def __init__(
+         self,
+         # input & output
+         input_size: int,
+         output_size: int = 256,
+         input_layer: str = "linear",
+         pre_lookahead_len: int = 3,
+         # size
+         num_blocks: int = 6,
+         num_up_blocks: int = 4,
+         # upsampling
+         up_stride: int = 2,
+         up_scale_factor: float = 2,
+         # attention
+         attention_heads: int = 4,
+         pos_enc_layer_type: str = "rel_pos_espnet",
+         selfattention_layer_type: str = "rel_selfattn",
+         key_bias: bool = True,
+         # mlp
+         linear_units: int = 2048,
+         # dropouts
+         dropout_rate: float = 0.1,
+         positional_dropout_rate: float = 0.1,
+         attention_dropout_rate: float = 0.0,
+         # other
+         normalize_before: bool = True,
+         activation_type: str = "swish",
+         **kwargs,
+     ):
+         super().__init__()
+         self._output_size = output_size
+         self.embed = COSYVOICE_SUBSAMPLE_CLASSES[input_layer](
+             input_size,
+             output_size,
+             dropout_rate,
+             COSYVOICE_EMB_CLASSES[pos_enc_layer_type](
+                 output_size,
+                 positional_dropout_rate
+             ),
+         )
+
+         self.normalize_before = normalize_before
+         self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5)
+         activation = COSYVOICE_ACTIVATION_CLASSES[activation_type]()
+         # self-attention module definition
+         encoder_selfattn_layer_args = (
+             attention_heads,
+             output_size,
+             attention_dropout_rate,
+             key_bias,
+         )
+         # feed-forward module definition
+         positionwise_layer_args = (
+             output_size,
+             linear_units,
+             dropout_rate,
+             activation,
+         )
+         self.pre_lookahead_layer = PreLookaheadLayer(
+             channels=output_size,
+             pre_lookahead_len=pre_lookahead_len
+         )
+         self.encoders = torch.nn.ModuleList([
+             ConformerEncoderLayer(
+                 output_size,
+                 COSYVOICE_ATTENTION_CLASSES[selfattention_layer_type](
+                     *encoder_selfattn_layer_args
+                 ),
+                 PositionwiseFeedForward(*positionwise_layer_args),
+                 None,
+                 None,
+                 dropout_rate,
+                 normalize_before,
+             ) for _ in range(num_blocks)
+         ])
+         self.up_layer = Upsample1D(
+             channels=output_size,
+             out_channels=output_size,
+             stride=up_stride,
+             scale_factor=up_scale_factor
+         )
+         self.up_embed = COSYVOICE_SUBSAMPLE_CLASSES[input_layer](
+             input_size,
+             output_size,
+             dropout_rate,
+             COSYVOICE_EMB_CLASSES[pos_enc_layer_type](
+                 output_size,
+                 positional_dropout_rate
+             ),
+         )
+         self.up_encoders = torch.nn.ModuleList([
+             ConformerEncoderLayer(
+                 output_size,
+                 COSYVOICE_ATTENTION_CLASSES[selfattention_layer_type](
+                     *encoder_selfattn_layer_args
+                 ),
+                 PositionwiseFeedForward(*positionwise_layer_args),
+                 None,
+                 None,
+                 dropout_rate,
+                 normalize_before,
+             ) for _ in range(num_up_blocks)
+         ])
+
+         self.enable_cuda_graph = False
+         self.use_cuda_graph = False
+         self.graph_encoder = {}
+         self.graph_up_encoder = {}
+         self.inference_buffers_encoder = {}
+         self.inference_buffers_up_encoder = {}
+         self.max_static_time = 1500
+
+     # FIXME(sfy) revert hard-coded bfloat16
+     # this method is skipped in CausalMaskedDiffWithXvec.scatter_cuda_graph
+     def scatter_cuda_graph(self, enable_cuda_graph: bool):
+         self.enable_cuda_graph = enable_cuda_graph
+         if self.enable_cuda_graph:
+             self._init_cuda_graph()
+
+     def _init_cuda_graph(self):
+         """Initialize CUDA graphs for the encoder and up_encoder over a range of static lengths."""
+
+         for l in range(100, 1500, 10):
+             static_x = torch.zeros((1, l, 512),
+                                    dtype=torch.float32, device=torch.device('cuda'))
+             static_mask = torch.ones((1, 1, l),
+                                      dtype=torch.bool, device=torch.device('cuda'))
+             static_pos_emb = torch.zeros((1, 2*l-1, 512),
+                                          dtype=torch.float32, device=torch.device('cuda'))
+
+             static_inputs = [
+                 static_x,
+                 static_mask,
+                 static_pos_emb,
+             ]
+
+             self._forward_impl_encoder(
+                 static_inputs[0],
+                 static_inputs[1],
+                 static_inputs[2],
+             )
+             graph = torch.cuda.CUDAGraph()
+             with torch.no_grad():
+                 with torch.cuda.graph(graph):
+                     static_out_x = self._forward_impl_encoder(
+                         static_inputs[0],
+                         static_inputs[1],
+                         static_inputs[2]
+                     )
+             self.graph_encoder[l] = graph
+             static_outputs = [
+                 static_out_x,
+             ]
+             self.inference_buffers_encoder[l] = {
+                 'static_inputs': static_inputs,
+                 'static_outputs': static_outputs
+             }
+
+         for l in range(100, 1500, 10):
+             static_x = torch.zeros((1, l, 512),
+                                    dtype=torch.float32, device=torch.device('cuda'))
+             static_mask = torch.ones((1, 1, l),
+                                      dtype=torch.bool, device=torch.device('cuda'))
+             static_pos_emb = torch.zeros((1, 2*l-1, 512),
+                                          dtype=torch.float32, device=torch.device('cuda'))
+
+             static_inputs = [
+                 static_x,
+                 static_mask,
+                 static_pos_emb,
+             ]
+
+             self._forward_impl_up_encoder(
+                 static_inputs[0],
+                 static_inputs[1],
+                 static_inputs[2],
+             )
+             graph = torch.cuda.CUDAGraph()
+             with torch.no_grad():
+                 with torch.cuda.graph(graph):
+                     static_out_x = self._forward_impl_up_encoder(
+                         static_inputs[0],
+                         static_inputs[1],
+                         static_inputs[2]
+                     )
+             self.graph_up_encoder[l] = graph
+             static_outputs = [
+                 static_out_x,
+             ]
+             self.inference_buffers_up_encoder[l] = {
+                 'static_inputs': static_inputs,
+                 'static_outputs': static_outputs
+             }
+
+         self.use_cuda_graph = True
+         print("CUDA Graph initialized successfully for encoder and up_encoder")
+
+     # @torch.compile(dynamic=True, backend="eager")
+     def _forward_impl_encoder(self,
+                               x: torch.Tensor,
+                               mask: torch.Tensor,
+                               pos_emb: torch.Tensor):
+         for layer in self.encoders:
+             x, _, _, _ = layer(x, mask, pos_emb)
+         return x
+
+     # @torch.compile(dynamic=True, backend="eager")
+     def _forward_impl_up_encoder(self,
+                                  x: torch.Tensor,
+                                  mask: torch.Tensor,
+                                  pos_emb: torch.Tensor):
+         for layer in self.up_encoders:
+             x, _, _, _ = layer(x, mask, pos_emb)
+         return x
+
+     def output_size(self) -> int:
+         return self._output_size
+
+     # @torch.compile(dynamic=True, backend="eager")
+     def forward(
+         self,
+         xs: torch.Tensor,
+         xs_lens: torch.Tensor,
+     ) -> Tuple[torch.Tensor, torch.Tensor]:
+         # (sfy) chunk training strategy should not be open-sourced
+         T = xs.size(1)
+         masks = ~make_pad_mask(xs_lens, T).unsqueeze(1)  # (B, 1, T)
+         xs, pos_emb, masks = self.embed(xs, masks)
+
+         # lookahead
+         xs = self.pre_lookahead_layer(xs)
+         # conformer block
+         if self.enable_cuda_graph and xs.shape[1] in self.graph_encoder:
+             self.inference_buffers_encoder[xs.shape[1]]['static_inputs'][0].copy_(xs)
+             self.inference_buffers_encoder[xs.shape[1]]['static_inputs'][1].copy_(masks)
+             self.inference_buffers_encoder[xs.shape[1]]['static_inputs'][2].copy_(pos_emb)
+             self.graph_encoder[xs.shape[1]].replay()
+             xs = self.inference_buffers_encoder[xs.shape[1]]['static_outputs'][0]
+         else:
+             xs = self._forward_impl_encoder(xs, masks, pos_emb)
+         # upsample
+         xs = xs.transpose(1, 2).contiguous()
+         xs, xs_lens = self.up_layer(xs, xs_lens)
+         xs = xs.transpose(1, 2).contiguous()
+
+         # 2nd conformer block
+         T = xs.size(1)
+         masks = ~make_pad_mask(xs_lens, T).unsqueeze(1)  # (B, 1, T)
+         xs, pos_emb, masks = self.up_embed(xs, masks)
+         if self.enable_cuda_graph and xs.shape[1] in self.graph_up_encoder:
+             self.inference_buffers_up_encoder[xs.shape[1]]['static_inputs'][0].copy_(xs)
+             self.inference_buffers_up_encoder[xs.shape[1]]['static_inputs'][1].copy_(masks)
+             self.inference_buffers_up_encoder[xs.shape[1]]['static_inputs'][2].copy_(pos_emb)
+             self.graph_up_encoder[xs.shape[1]].replay()
+             xs = self.inference_buffers_up_encoder[xs.shape[1]]['static_outputs'][0]
+         else:
+             xs = self._forward_impl_up_encoder(xs, masks, pos_emb)
+         # post norm
+         if self.normalize_before:
+             xs = self.after_norm(xs)
+         return xs, masks
+
+     # @torch.compile(dynamic=True, backend="eager")
+     def forward_chunk(self,
+                       xs: torch.Tensor,
+                       last_chunk: bool = False,
+                       cnn_cache: torch.Tensor = None,
+                       att_cache: torch.Tensor = None,
+                       ):
+         """
+         Args:
+             xs: shape (b, dt, c)
+             last_chunk: bool. If last chunk, will pad input with lookaheads
+             att_cache: shape (depth1+depth2, b, nh, 2*t1, c).
+             cnn_cache: shape (b, c, t1+t2), where t1=2 (pre_lookahead_layer), t2=4 (up_layer)
+         """
+         if att_cache is not None:
+             assert att_cache.shape[3] % 2 == 0, att_cache.shape
+         if cnn_cache is not None:
+             assert cnn_cache.shape[2] == 2 + self.up_layer.stride * 2, cnn_cache.shape
+
+         # unpack caches
+         offset1 = att_cache.shape[3] // 2 if att_cache is not None else 0
+         att_cache1 = att_cache[:len(self.encoders), :, :, :offset1] if att_cache is not None else [None] * len(self.encoders)
+         att_cache2 = att_cache[len(self.encoders):] if att_cache is not None else [None] * len(self.encoders)
+         cnn_cache1 = cnn_cache[:, :, :2] if cnn_cache is not None else None
+         cnn_cache2 = cnn_cache[:, :, 2:] if cnn_cache is not None else None
+         xs, _, _ = self.embed(xs, None)
+         if last_chunk:
+             xs = F.pad(xs, (0, 0, 0, self.pre_lookahead_layer.pre_lookahead_len))
+
+         # this_cnn_cache: shape (b=1, c=512, t=2)
+         xs, new_cnn_cache1 = self.pre_lookahead_layer.forward_chunk(xs, cache=cnn_cache1)
+
+         # remake pos_emb, offset param is ignored by position_encoding
+         pos_emb = self.embed.position_encoding(offset=None, size=offset1 + xs.shape[1])
+
+         # first conformer
+         chunk_masks = torch.zeros((0, 0, 0))
+         new_att_cache1 = []
+
+         for idx, layer in enumerate(self.encoders):
+             # this_att_cache: shape (b, nh, t, c * 2)
+             xs, _, this_new_att_cache1, _ = layer(xs, chunk_masks, pos_emb, att_cache=att_cache1[idx])
+             new_att_cache1.append(this_new_att_cache1)
+         new_att_cache1 = torch.stack(new_att_cache1, dim=0)
+
+         # upsample + conformer encoder, xs: (b, t, c) -> (b, c, t)
+         xs = xs.transpose(1, 2).contiguous()
+         # this_cnn_cache: shape (b=1, c=512, t=2*2)
+         xs, _, new_cnn_cache2 = self.up_layer.forward_chunk(xs, None, cache=cnn_cache2)
+         xs = xs.transpose(1, 2).contiguous()
+
+         # at this point, xs is doubled in length
+         xs, _, _ = self.up_embed(xs, None)
+
+         # remake pos_emb
+         pos_emb = self.embed.position_encoding(offset=None, size=offset1 * self.up_layer.stride + xs.shape[1])
+
+         # second conformer
+         chunk_masks = torch.zeros((0, 0, 0), dtype=torch.bfloat16)
+         new_att_cache2 = []
+
+         for idx, layer in enumerate(self.up_encoders):
+             xs, _, this_new_att_cache2, _ = layer(xs, chunk_masks, pos_emb, att_cache=att_cache2[idx])
+             new_att_cache2.append(this_new_att_cache2)
+         new_att_cache2 = torch.stack(new_att_cache2, dim=0)
+
+         if self.normalize_before:
+             xs = self.after_norm(xs)
+
+         # pack new cache
+         new_att_cache = torch.cat([new_att_cache1.repeat(1, 1, 1, 2, 1), new_att_cache2], dim=0)
+         new_cnn_cache = torch.cat([new_cnn_cache1, new_cnn_cache2], dim=2)
+
+         return xs, new_cnn_cache, new_att_cache
+
+
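The CUDA Graph branch in forward follows the usual capture-and-replay recipe: run the static-shape forward once to warm up, capture it into a torch.cuda.CUDAGraph, then at inference copy fresh tensors into the captured input buffers and call replay(). Below is a self-contained toy sketch of that pattern; it is not part of this package, assumes a CUDA device, and uses a plain Linear in place of the conformer stack.

import torch

def make_graphed(module, example):
    # Warm-up pass so allocations happen outside the capture, mirroring
    # the eager call _init_cuda_graph makes before capturing.
    module(example)
    torch.cuda.synchronize()
    graph = torch.cuda.CUDAGraph()
    with torch.no_grad(), torch.cuda.graph(graph):
        static_out = module(example)        # recorded into the graph
    return graph, example, static_out

if torch.cuda.is_available():
    net = torch.nn.Linear(512, 512).cuda().eval()
    static_in = torch.zeros(1, 100, 512, device="cuda")
    graph, static_in, static_out = make_graphed(net, static_in)

    x = torch.randn(1, 100, 512, device="cuda")
    static_in.copy_(x)      # refill the captured input buffer in place
    graph.replay()          # re-run the recorded kernels on the new data
    assert torch.allclose(static_out, net(x), atol=1e-5)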
stepaudio2/cosyvoice2/utils/__init__.py
@@ -0,0 +1 @@
+ """CosyVoice2 utils subpackage."""
stepaudio2/cosyvoice2/utils/class_utils.py
@@ -0,0 +1,41 @@
+ # Copyright [2023-11-28] <sxc19@mails.tsinghua.edu.cn, Xingchen Song>
+ #            2024 Alibaba Inc (authors: Xiang Lyu)
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #   http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ import torch
+
+ from stepaudio2.cosyvoice2.transformer.subsampling import LinearNoSubsampling
+ from stepaudio2.cosyvoice2.transformer.attention import RelPositionMultiHeadedAttention
+ from stepaudio2.cosyvoice2.transformer.embedding import EspnetRelPositionalEncoding
+
+
+ COSYVOICE_ACTIVATION_CLASSES = {
+     "hardtanh": torch.nn.Hardtanh,
+     "tanh": torch.nn.Tanh,
+     "relu": torch.nn.ReLU,
+     "selu": torch.nn.SELU,
+     "swish": torch.nn.SiLU,
+     "gelu": torch.nn.GELU,
+ }
+
+ COSYVOICE_SUBSAMPLE_CLASSES = {
+     "linear": LinearNoSubsampling,
+ }
+
+ COSYVOICE_EMB_CLASSES = {
+     "rel_pos_espnet": EspnetRelPositionalEncoding,
+ }
+
+ COSYVOICE_ATTENTION_CLASSES = {
+     "rel_selfattn": RelPositionMultiHeadedAttention,
+ }
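These registries are how UpsampleConformerEncoderV2 turns its string arguments (input_layer, pos_enc_layer_type, selfattention_layer_type, activation_type) into modules. A small sketch of the same lookups, assuming this wheel is installed; the constructor arguments mirror the ones the encoder passes and the sizes are illustrative.

import torch
from stepaudio2.cosyvoice2.utils.class_utils import (
    COSYVOICE_ACTIVATION_CLASSES,
    COSYVOICE_ATTENTION_CLASSES,
)

# "swish" resolves to torch.nn.SiLU, i.e. x * sigmoid(x), which is what
# activation_type="swish" selects in UpsampleConformerEncoderV2.__init__.
act = COSYVOICE_ACTIVATION_CLASSES["swish"]()
print(act(torch.tensor([-1.0, 0.0, 1.0])))

# "rel_selfattn" resolves to RelPositionMultiHeadedAttention; the encoder
# instantiates it with (attention_heads, output_size, attention_dropout_rate, key_bias).
attn = COSYVOICE_ATTENTION_CLASSES["rel_selfattn"](4, 512, 0.0, True)
print(type(attn).__name__)  # RelPositionMultiHeadedAttention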