xinference 0.13.2__py3-none-any.whl → 0.13.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xinference might be problematic.
- xinference/__init__.py +0 -1
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +26 -4
- xinference/client/restful/restful_client.py +16 -1
- xinference/core/chat_interface.py +2 -2
- xinference/core/model.py +8 -3
- xinference/core/scheduler.py +4 -4
- xinference/model/audio/core.py +5 -2
- xinference/model/audio/cosyvoice.py +136 -0
- xinference/model/audio/model_spec.json +24 -0
- xinference/model/audio/model_spec_modelscope.json +27 -0
- xinference/model/flexible/launchers/__init__.py +1 -0
- xinference/model/flexible/launchers/image_process_launcher.py +70 -0
- xinference/model/image/model_spec.json +7 -0
- xinference/model/image/stable_diffusion/core.py +6 -1
- xinference/model/llm/llm_family.json +802 -82
- xinference/model/llm/llm_family_csghub.json +39 -0
- xinference/model/llm/llm_family_modelscope.json +295 -47
- xinference/model/llm/pytorch/chatglm.py +243 -5
- xinference/model/llm/pytorch/cogvlm2.py +1 -1
- xinference/model/llm/utils.py +78 -1
- xinference/model/llm/vllm/core.py +8 -0
- xinference/thirdparty/cosyvoice/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/bin/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/bin/inference.py +114 -0
- xinference/thirdparty/cosyvoice/bin/train.py +136 -0
- xinference/thirdparty/cosyvoice/cli/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/cli/cosyvoice.py +83 -0
- xinference/thirdparty/cosyvoice/cli/frontend.py +168 -0
- xinference/thirdparty/cosyvoice/cli/model.py +60 -0
- xinference/thirdparty/cosyvoice/dataset/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/dataset/dataset.py +160 -0
- xinference/thirdparty/cosyvoice/dataset/processor.py +369 -0
- xinference/thirdparty/cosyvoice/flow/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/flow/decoder.py +222 -0
- xinference/thirdparty/cosyvoice/flow/flow.py +135 -0
- xinference/thirdparty/cosyvoice/flow/flow_matching.py +138 -0
- xinference/thirdparty/cosyvoice/flow/length_regulator.py +49 -0
- xinference/thirdparty/cosyvoice/hifigan/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/hifigan/f0_predictor.py +55 -0
- xinference/thirdparty/cosyvoice/hifigan/generator.py +391 -0
- xinference/thirdparty/cosyvoice/llm/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/llm/llm.py +206 -0
- xinference/thirdparty/cosyvoice/transformer/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/transformer/activation.py +84 -0
- xinference/thirdparty/cosyvoice/transformer/attention.py +326 -0
- xinference/thirdparty/cosyvoice/transformer/convolution.py +145 -0
- xinference/thirdparty/cosyvoice/transformer/decoder.py +396 -0
- xinference/thirdparty/cosyvoice/transformer/decoder_layer.py +132 -0
- xinference/thirdparty/cosyvoice/transformer/embedding.py +293 -0
- xinference/thirdparty/cosyvoice/transformer/encoder.py +472 -0
- xinference/thirdparty/cosyvoice/transformer/encoder_layer.py +236 -0
- xinference/thirdparty/cosyvoice/transformer/label_smoothing_loss.py +96 -0
- xinference/thirdparty/cosyvoice/transformer/positionwise_feed_forward.py +115 -0
- xinference/thirdparty/cosyvoice/transformer/subsampling.py +383 -0
- xinference/thirdparty/cosyvoice/utils/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/utils/class_utils.py +70 -0
- xinference/thirdparty/cosyvoice/utils/common.py +103 -0
- xinference/thirdparty/cosyvoice/utils/executor.py +110 -0
- xinference/thirdparty/cosyvoice/utils/file_utils.py +41 -0
- xinference/thirdparty/cosyvoice/utils/frontend_utils.py +125 -0
- xinference/thirdparty/cosyvoice/utils/mask.py +227 -0
- xinference/thirdparty/cosyvoice/utils/scheduler.py +739 -0
- xinference/thirdparty/cosyvoice/utils/train_utils.py +289 -0
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/{main.95c1d652.js → main.2ef0cfaf.js} +3 -3
- xinference/web/ui/build/static/js/main.2ef0cfaf.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b6807ecc0c231fea699533518a0eb2a2bf68a081ce00d452be40600dbffa17a7.json +1 -0
- {xinference-0.13.2.dist-info → xinference-0.13.3.dist-info}/METADATA +16 -8
- {xinference-0.13.2.dist-info → xinference-0.13.3.dist-info}/RECORD +76 -32
- xinference/web/ui/build/static/js/main.95c1d652.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/709711edada3f1596b309d571285fd31f1c364d66f4425bc28723d0088cc351a.json +0 -1
- /xinference/web/ui/build/static/js/{main.95c1d652.js.LICENSE.txt → main.2ef0cfaf.js.LICENSE.txt} +0 -0
- {xinference-0.13.2.dist-info → xinference-0.13.3.dist-info}/LICENSE +0 -0
- {xinference-0.13.2.dist-info → xinference-0.13.3.dist-info}/WHEEL +0 -0
- {xinference-0.13.2.dist-info → xinference-0.13.3.dist-info}/entry_points.txt +0 -0
- {xinference-0.13.2.dist-info → xinference-0.13.3.dist-info}/top_level.txt +0 -0
xinference/thirdparty/cosyvoice/transformer/subsampling.py

@@ -0,0 +1,383 @@
```python
# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu)
#               2024 Alibaba Inc (Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from ESPnet(https://github.com/espnet/espnet)
"""Subsampling layer definition."""

from typing import Tuple, Union

import torch


class BaseSubsampling(torch.nn.Module):

    def __init__(self):
        super().__init__()
        self.right_context = 0
        self.subsampling_rate = 1

    def position_encoding(self, offset: Union[int, torch.Tensor],
                          size: int) -> torch.Tensor:
        return self.pos_enc.position_encoding(offset, size)


class EmbedinigNoSubsampling(BaseSubsampling):
    """Embedding input without subsampling
    """

    def __init__(self, idim: int, odim: int, dropout_rate: float,
                 pos_enc_class: torch.nn.Module):
        super().__init__()
        self.embed = torch.nn.Embedding(idim, odim)
        self.pos_enc = pos_enc_class

    def forward(
        self,
        x: torch.Tensor,
        x_mask: torch.Tensor,
        offset: Union[int, torch.Tensor] = 0
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Input x.

        Args:
            x (torch.Tensor): Input tensor (#batch, time, idim).
            x_mask (torch.Tensor): Input mask (#batch, 1, time).

        Returns:
            torch.Tensor: linear input tensor (#batch, time', odim),
                where time' = time .
            torch.Tensor: linear input mask (#batch, 1, time'),
                where time' = time .

        """
        x = self.embed(x)
        x, pos_emb = self.pos_enc(x, offset)
        return x, pos_emb, x_mask


class LinearNoSubsampling(BaseSubsampling):
    """Linear transform the input without subsampling

    Args:
        idim (int): Input dimension.
        odim (int): Output dimension.
        dropout_rate (float): Dropout rate.

    """

    def __init__(self, idim: int, odim: int, dropout_rate: float,
                 pos_enc_class: torch.nn.Module):
        """Construct an linear object."""
        super().__init__()
        self.out = torch.nn.Sequential(
            torch.nn.Linear(idim, odim),
            torch.nn.LayerNorm(odim, eps=1e-5),
            torch.nn.Dropout(dropout_rate),
        )
        self.pos_enc = pos_enc_class
        self.right_context = 0
        self.subsampling_rate = 1

    def forward(
        self,
        x: torch.Tensor,
        x_mask: torch.Tensor,
        offset: Union[int, torch.Tensor] = 0
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Input x.

        Args:
            x (torch.Tensor): Input tensor (#batch, time, idim).
            x_mask (torch.Tensor): Input mask (#batch, 1, time).

        Returns:
            torch.Tensor: linear input tensor (#batch, time', odim),
                where time' = time .
            torch.Tensor: linear input mask (#batch, 1, time'),
                where time' = time .

        """
        x = self.out(x)
        x, pos_emb = self.pos_enc(x, offset)
        return x, pos_emb, x_mask


class Conv1dSubsampling2(BaseSubsampling):
    """Convolutional 1D subsampling (to 1/2 length).
       It is designed for Whisper, ref:
       https://github.com/openai/whisper/blob/main/whisper/model.py

    Args:
        idim (int): Input dimension.
        odim (int): Output dimension.
        dropout_rate (float): Dropout rate.

    """

    def __init__(self, idim: int, odim: int, dropout_rate: float,
                 pos_enc_class: torch.nn.Module):
        """Construct an Conv1dSubsampling2 object."""
        super().__init__()
        self.conv = torch.nn.Sequential(
            torch.nn.Conv1d(idim, odim, kernel_size=3, padding=1),
            torch.nn.GELU(),
            torch.nn.Conv1d(odim, odim, kernel_size=3, stride=2, padding=1),
            torch.nn.GELU(),
        )
        self.pos_enc = pos_enc_class
        # The right context for every conv layer is computed by:
        # (kernel_size - 1) * frame_rate_of_this_layer
        self.subsampling_rate = 2
        # 4 = (3 - 1) * 1 + (3 - 1) * 1
        self.right_context = 4

    def forward(
        self,
        x: torch.Tensor,
        x_mask: torch.Tensor,
        offset: Union[int, torch.Tensor] = 0
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Subsample x.

        Args:
            x (torch.Tensor): Input tensor (#batch, time, idim).
            x_mask (torch.Tensor): Input mask (#batch, 1, time).

        Returns:
            torch.Tensor: Subsampled tensor (#batch, time', odim),
                where time' = time // 2.
            torch.Tensor: Subsampled mask (#batch, 1, time'),
                where time' = time // 2.
            torch.Tensor: positional encoding

        """
        time = x.size(1)
        x = x.transpose(1, 2)  # (b, f, t)
        x = self.conv(x)
        x = x.transpose(1, 2)  # (b, t, f)
        x, pos_emb = self.pos_enc(x, offset)
        return x, pos_emb, x_mask[:, :, (time + 1) % 2::2]


class Conv2dSubsampling4(BaseSubsampling):
    """Convolutional 2D subsampling (to 1/4 length).

    Args:
        idim (int): Input dimension.
        odim (int): Output dimension.
        dropout_rate (float): Dropout rate.

    """

    def __init__(self, idim: int, odim: int, dropout_rate: float,
                 pos_enc_class: torch.nn.Module):
        """Construct an Conv2dSubsampling4 object."""
        super().__init__()
        self.conv = torch.nn.Sequential(
            torch.nn.Conv2d(1, odim, 3, 2),
            torch.nn.ReLU(),
            torch.nn.Conv2d(odim, odim, 3, 2),
            torch.nn.ReLU(),
        )
        self.out = torch.nn.Sequential(
            torch.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim))
        self.pos_enc = pos_enc_class
        # The right context for every conv layer is computed by:
        # (kernel_size - 1) * frame_rate_of_this_layer
        self.subsampling_rate = 4
        # 6 = (3 - 1) * 1 + (3 - 1) * 2
        self.right_context = 6

    def forward(
        self,
        x: torch.Tensor,
        x_mask: torch.Tensor,
        offset: Union[int, torch.Tensor] = 0
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Subsample x.

        Args:
            x (torch.Tensor): Input tensor (#batch, time, idim).
            x_mask (torch.Tensor): Input mask (#batch, 1, time).

        Returns:
            torch.Tensor: Subsampled tensor (#batch, time', odim),
                where time' = time // 4.
            torch.Tensor: Subsampled mask (#batch, 1, time'),
                where time' = time // 4.
            torch.Tensor: positional encoding

        """
        x = x.unsqueeze(1)  # (b, c=1, t, f)
        x = self.conv(x)
        b, c, t, f = x.size()
        x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
        x, pos_emb = self.pos_enc(x, offset)
        return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2]


class Conv2dSubsampling6(BaseSubsampling):
    """Convolutional 2D subsampling (to 1/6 length).
    Args:
        idim (int): Input dimension.
        odim (int): Output dimension.
        dropout_rate (float): Dropout rate.
        pos_enc (torch.nn.Module): Custom position encoding layer.
    """

    def __init__(self, idim: int, odim: int, dropout_rate: float,
                 pos_enc_class: torch.nn.Module):
        """Construct an Conv2dSubsampling6 object."""
        super().__init__()
        self.conv = torch.nn.Sequential(
            torch.nn.Conv2d(1, odim, 3, 2),
            torch.nn.ReLU(),
            torch.nn.Conv2d(odim, odim, 5, 3),
            torch.nn.ReLU(),
        )
        self.linear = torch.nn.Linear(odim * (((idim - 1) // 2 - 2) // 3),
                                      odim)
        self.pos_enc = pos_enc_class
        # 10 = (3 - 1) * 1 + (5 - 1) * 2
        self.subsampling_rate = 6
        self.right_context = 10

    def forward(
        self,
        x: torch.Tensor,
        x_mask: torch.Tensor,
        offset: Union[int, torch.Tensor] = 0
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Subsample x.
        Args:
            x (torch.Tensor): Input tensor (#batch, time, idim).
            x_mask (torch.Tensor): Input mask (#batch, 1, time).

        Returns:
            torch.Tensor: Subsampled tensor (#batch, time', odim),
                where time' = time // 6.
            torch.Tensor: Subsampled mask (#batch, 1, time'),
                where time' = time // 6.
            torch.Tensor: positional encoding
        """
        x = x.unsqueeze(1)  # (b, c, t, f)
        x = self.conv(x)
        b, c, t, f = x.size()
        x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f))
        x, pos_emb = self.pos_enc(x, offset)
        return x, pos_emb, x_mask[:, :, 2::2][:, :, 4::3]


class Conv2dSubsampling8(BaseSubsampling):
    """Convolutional 2D subsampling (to 1/8 length).

    Args:
        idim (int): Input dimension.
        odim (int): Output dimension.
        dropout_rate (float): Dropout rate.

    """

    def __init__(self, idim: int, odim: int, dropout_rate: float,
                 pos_enc_class: torch.nn.Module):
        """Construct an Conv2dSubsampling8 object."""
        super().__init__()
        self.conv = torch.nn.Sequential(
            torch.nn.Conv2d(1, odim, 3, 2),
            torch.nn.ReLU(),
            torch.nn.Conv2d(odim, odim, 3, 2),
            torch.nn.ReLU(),
            torch.nn.Conv2d(odim, odim, 3, 2),
            torch.nn.ReLU(),
        )
        self.linear = torch.nn.Linear(
            odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2), odim)
        self.pos_enc = pos_enc_class
        self.subsampling_rate = 8
        # 14 = (3 - 1) * 1 + (3 - 1) * 2 + (3 - 1) * 4
        self.right_context = 14

    def forward(
        self,
        x: torch.Tensor,
        x_mask: torch.Tensor,
        offset: Union[int, torch.Tensor] = 0
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Subsample x.

        Args:
            x (torch.Tensor): Input tensor (#batch, time, idim).
            x_mask (torch.Tensor): Input mask (#batch, 1, time).

        Returns:
            torch.Tensor: Subsampled tensor (#batch, time', odim),
                where time' = time // 8.
            torch.Tensor: Subsampled mask (#batch, 1, time'),
                where time' = time // 8.
            torch.Tensor: positional encoding
        """
        x = x.unsqueeze(1)  # (b, c, t, f)
        x = self.conv(x)
        b, c, t, f = x.size()
        x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f))
        x, pos_emb = self.pos_enc(x, offset)
        return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2][:, :, 2::2]


class LegacyLinearNoSubsampling(BaseSubsampling):
    """Linear transform the input without subsampling

    Args:
        idim (int): Input dimension.
        odim (int): Output dimension.
        dropout_rate (float): Dropout rate.

    """

    def __init__(self, idim: int, odim: int, dropout_rate: float,
                 pos_enc_class: torch.nn.Module):
        """Construct an linear object."""
        super().__init__()
        self.out = torch.nn.Sequential(
            torch.nn.Linear(idim, odim),
            torch.nn.LayerNorm(odim, eps=1e-5),
            torch.nn.Dropout(dropout_rate),
            torch.nn.ReLU(),
        )
        self.pos_enc = pos_enc_class
        self.right_context = 0
        self.subsampling_rate = 1

    def forward(
        self,
        x: torch.Tensor,
        x_mask: torch.Tensor,
        offset: Union[int, torch.Tensor] = 0
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Input x.

        Args:
            x (torch.Tensor): Input tensor (#batch, time, idim).
            x_mask (torch.Tensor): Input mask (#batch, 1, time).

        Returns:
            torch.Tensor: linear input tensor (#batch, time', odim),
                where time' = time .
            torch.Tensor: linear input mask (#batch, 1, time'),
                where time' = time .

        """
        x = self.out(x)
        x, pos_emb = self.pos_enc(x, offset)
        return x, pos_emb, x_mask
```
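For orientation, a minimal usage sketch of the subsampling classes above (not part of the wheel contents; it assumes the vendored `cosyvoice` package is importable and substitutes a trivial stand-in for the positional-encoding classes that live in `cosyvoice/transformer/embedding.py`, also added in this release):

```python
import torch

from cosyvoice.transformer.subsampling import Conv2dSubsampling4


class DummyPositionalEncoding(torch.nn.Module):
    """Stand-in for the real positional-encoding modules: returns the input
    unchanged together with a zero positional-embedding tensor."""

    def forward(self, x, offset=0):
        return x, torch.zeros(1, x.size(1), x.size(2))

    def position_encoding(self, offset, size):
        return torch.zeros(1, size, 1)


# 80 mel bins in, 256-dim model out, 4x temporal subsampling.
sub = Conv2dSubsampling4(idim=80, odim=256, dropout_rate=0.1,
                         pos_enc_class=DummyPositionalEncoding())
feats = torch.randn(2, 100, 80)                 # (batch, time, idim)
mask = torch.ones(2, 1, 100, dtype=torch.bool)  # (batch, 1, time)
out, pos_emb, out_mask = sub(feats, mask)
print(out.shape, out_mask.shape)  # torch.Size([2, 24, 256]) torch.Size([2, 1, 24])
```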
xinference/thirdparty/cosyvoice/utils/class_utils.py

@@ -0,0 +1,70 @@
```python
# Copyright [2023-11-28] <sxc19@mails.tsinghua.edu.cn, Xingchen Song>
#            2024 Alibaba Inc (authors: Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch

from cosyvoice.transformer.activation import Swish
from cosyvoice.transformer.subsampling import (
    LinearNoSubsampling,
    EmbedinigNoSubsampling,
    Conv1dSubsampling2,
    Conv2dSubsampling4,
    Conv2dSubsampling6,
    Conv2dSubsampling8,
)
from cosyvoice.transformer.embedding import (PositionalEncoding,
                                             RelPositionalEncoding,
                                             WhisperPositionalEncoding,
                                             LearnablePositionalEncoding,
                                             NoPositionalEncoding)
from cosyvoice.transformer.attention import (MultiHeadedAttention,
                                             RelPositionMultiHeadedAttention)
from cosyvoice.transformer.embedding import EspnetRelPositionalEncoding
from cosyvoice.transformer.subsampling import LegacyLinearNoSubsampling


COSYVOICE_ACTIVATION_CLASSES = {
    "hardtanh": torch.nn.Hardtanh,
    "tanh": torch.nn.Tanh,
    "relu": torch.nn.ReLU,
    "selu": torch.nn.SELU,
    "swish": getattr(torch.nn, "SiLU", Swish),
    "gelu": torch.nn.GELU,
}

COSYVOICE_SUBSAMPLE_CLASSES = {
    "linear": LinearNoSubsampling,
    "linear_legacy": LegacyLinearNoSubsampling,
    "embed": EmbedinigNoSubsampling,
    "conv1d2": Conv1dSubsampling2,
    "conv2d": Conv2dSubsampling4,
    "conv2d6": Conv2dSubsampling6,
    "conv2d8": Conv2dSubsampling8,
    'paraformer_dummy': torch.nn.Identity
}

COSYVOICE_EMB_CLASSES = {
    "embed": PositionalEncoding,
    "abs_pos": PositionalEncoding,
    "rel_pos": RelPositionalEncoding,
    "rel_pos_espnet": EspnetRelPositionalEncoding,
    "no_pos": NoPositionalEncoding,
    "abs_pos_whisper": WhisperPositionalEncoding,
    "embed_learnable_pe": LearnablePositionalEncoding,
}

COSYVOICE_ATTENTION_CLASSES = {
    "selfattn": MultiHeadedAttention,
    "rel_selfattn": RelPositionMultiHeadedAttention,
}
```
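These registries map string keys to layer classes, which lets a model configuration pick components by name. A minimal sketch (the config keys below are illustrative, not taken from a shipped cosyvoice.yaml):

```python
from cosyvoice.utils.class_utils import (COSYVOICE_ACTIVATION_CLASSES,
                                         COSYVOICE_SUBSAMPLE_CLASSES)

config = {"input_layer": "conv2d", "activation_type": "swish"}

subsample_cls = COSYVOICE_SUBSAMPLE_CLASSES[config["input_layer"]]      # Conv2dSubsampling4
activation = COSYVOICE_ACTIVATION_CLASSES[config["activation_type"]]()  # torch.nn.SiLU()
print(subsample_cls.__name__, activation)
```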
xinference/thirdparty/cosyvoice/utils/common.py

@@ -0,0 +1,103 @@
```python
# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang)
#               2024 Alibaba Inc (authors: Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from ESPnet(https://github.com/espnet/espnet)
"""Unility functions for Transformer."""

from typing import List

import torch

IGNORE_ID = -1


def pad_list(xs: List[torch.Tensor], pad_value: int):
    """Perform padding for the list of tensors.

    Args:
        xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)].
        pad_value (float): Value for padding.

    Returns:
        Tensor: Padded tensor (B, Tmax, `*`).

    Examples:
        >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)]
        >>> x
        [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])]
        >>> pad_list(x, 0)
        tensor([[1., 1., 1., 1.],
                [1., 1., 0., 0.],
                [1., 0., 0., 0.]])

    """
    max_len = max([len(item) for item in xs])
    batchs = len(xs)
    ndim = xs[0].ndim
    if ndim == 1:
        pad_res = torch.zeros(batchs,
                              max_len,
                              dtype=xs[0].dtype,
                              device=xs[0].device)
    elif ndim == 2:
        pad_res = torch.zeros(batchs,
                              max_len,
                              xs[0].shape[1],
                              dtype=xs[0].dtype,
                              device=xs[0].device)
    elif ndim == 3:
        pad_res = torch.zeros(batchs,
                              max_len,
                              xs[0].shape[1],
                              xs[0].shape[2],
                              dtype=xs[0].dtype,
                              device=xs[0].device)
    else:
        raise ValueError(f"Unsupported ndim: {ndim}")
    pad_res.fill_(pad_value)
    for i in range(batchs):
        pad_res[i, :len(xs[i])] = xs[i]
    return pad_res


def th_accuracy(pad_outputs: torch.Tensor, pad_targets: torch.Tensor,
                ignore_label: int) -> torch.Tensor:
    """Calculate accuracy.

    Args:
        pad_outputs (Tensor): Prediction tensors (B * Lmax, D).
        pad_targets (LongTensor): Target label tensors (B, Lmax).
        ignore_label (int): Ignore label id.

    Returns:
        torch.Tensor: Accuracy value (0.0 - 1.0).

    """
    pad_pred = pad_outputs.view(pad_targets.size(0), pad_targets.size(1),
                                pad_outputs.size(1)).argmax(2)
    mask = pad_targets != ignore_label
    numerator = torch.sum(
        pad_pred.masked_select(mask) == pad_targets.masked_select(mask))
    denominator = torch.sum(mask)
    return (numerator / denominator).detach()


def get_padding(kernel_size, dilation=1):
    return int((kernel_size * dilation - dilation) / 2)


def init_weights(m, mean=0.0, std=0.01):
    classname = m.__class__.__name__
    if classname.find("Conv") != -1:
        m.weight.data.normal_(mean, std)
```
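A small sketch of how `pad_list`, `IGNORE_ID`, and `th_accuracy` above fit together on a toy batch (shapes follow the docstrings; the values are made up):

```python
import torch

from cosyvoice.utils.common import IGNORE_ID, pad_list, th_accuracy

# Two variable-length label sequences padded to (B, Lmax) with IGNORE_ID.
targets = pad_list([torch.tensor([5, 7, 9]), torch.tensor([3, 2])], IGNORE_ID)
print(targets)  # tensor([[ 5,  7,  9], [ 3,  2, -1]])

# Model outputs flattened to (B * Lmax, vocab_size), as th_accuracy expects.
logits = torch.randn(2 * 3, 10)

acc = th_accuracy(logits, targets, ignore_label=IGNORE_ID)
print(acc)  # fraction of non-ignored positions predicted correctly
```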
xinference/thirdparty/cosyvoice/utils/executor.py

@@ -0,0 +1,110 @@
```python
# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang)
#               2024 Alibaba Inc (authors: Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from contextlib import nullcontext
import os

import torch
import torch.distributed as dist

from cosyvoice.utils.train_utils import update_parameter_and_lr, log_per_step, log_per_save, batch_forward, batch_backward, save_model, cosyvoice_join


class Executor:

    def __init__(self):
        self.step = 0
        self.epoch = 0
        self.rank = int(os.environ.get('RANK', 0))
        self.device = torch.device('cuda:{}'.format(self.rank))

    def train_one_epoc(self, model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, group_join):
        ''' Train one epoch
        '''

        lr = optimizer.param_groups[0]['lr']
        logging.info('Epoch {} TRAIN info lr {} rank {}'.format(self.epoch, lr, self.rank))
        logging.info('using accumulate grad, new batch size is {} times'
                     ' larger than before'.format(info_dict['accum_grad']))
        # A context manager to be used in conjunction with an instance of
        # torch.nn.parallel.DistributedDataParallel to be able to train
        # with uneven inputs across participating processes.
        model.train()
        model_context = model.join if info_dict['train_engine'] == 'torch_ddp' else nullcontext
        with model_context():
            for batch_idx, batch_dict in enumerate(train_data_loader):
                info_dict["tag"] = "TRAIN"
                info_dict["step"] = self.step
                info_dict["epoch"] = self.epoch
                info_dict["batch_idx"] = batch_idx
                if cosyvoice_join(group_join, info_dict):
                    break

                # Disable gradient synchronizations across DDP processes.
                # Within this context, gradients will be accumulated on module
                # variables, which will later be synchronized.
                if info_dict['train_engine'] == 'torch_ddp' and (batch_idx + 1) % info_dict["accum_grad"] != 0:
                    context = model.no_sync
                # Used for single gpu training and DDP gradient synchronization
                # processes.
                else:
                    context = nullcontext

                with context():
                    info_dict = batch_forward(model, batch_dict, info_dict)
                    info_dict = batch_backward(model, info_dict)

                info_dict = update_parameter_and_lr(model, optimizer, scheduler, info_dict)
                log_per_step(writer, info_dict)
                # NOTE specify save_per_step in cosyvoice.yaml if you want to enable step save
                if info_dict['save_per_step'] > 0 and (self.step + 1) % info_dict['save_per_step'] == 0 and (batch_idx + 1) % info_dict["accum_grad"] == 0:
                    dist.barrier()
                    self.cv(model, cv_data_loader, writer, info_dict, on_batch_end=False)
                    model.train()
                if (batch_idx + 1) % info_dict["accum_grad"] == 0:
                    self.step += 1
        dist.barrier()
        self.cv(model, cv_data_loader, writer, info_dict, on_batch_end=True)

    @torch.inference_mode()
    def cv(self, model, cv_data_loader, writer, info_dict, on_batch_end=True):
        ''' Cross validation on
        '''
        logging.info('Epoch {} Step {} on_batch_end {} CV rank {}'.format(self.epoch, self.step + 1, on_batch_end, self.rank))
        model.eval()
        total_num_utts, total_loss_dict = 0, {}  # avoid division by 0
        for batch_idx, batch_dict in enumerate(cv_data_loader):
            info_dict["tag"] = "CV"
            info_dict["step"] = self.step
            info_dict["epoch"] = self.epoch
            info_dict["batch_idx"] = batch_idx

            num_utts = len(batch_dict["utts"])
            total_num_utts += num_utts

            info_dict = batch_forward(model, batch_dict, info_dict)

            for k, v in info_dict['loss_dict'].items():
                if k not in total_loss_dict:
                    total_loss_dict[k] = []
                total_loss_dict[k].append(v.item() * num_utts)
            log_per_step(None, info_dict)
        for k, v in total_loss_dict.items():
            total_loss_dict[k] = sum(v) / total_num_utts
        info_dict['loss_dict'] = total_loss_dict
        log_per_save(writer, info_dict)
        model_name = 'epoch_{}_whole'.format(self.epoch) if on_batch_end else 'epoch_{}_step_{}'.format(self.epoch, self.step + 1)
        save_model(model, model_name, info_dict)
```