xinference 0.13.2__py3-none-any.whl → 0.13.4__py3-none-any.whl
This diff shows the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between those versions.
Potentially problematic release.
This version of xinference might be problematic.
- xinference/__init__.py +0 -1
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +30 -5
- xinference/client/restful/restful_client.py +18 -3
- xinference/constants.py +0 -4
- xinference/core/chat_interface.py +2 -2
- xinference/core/image_interface.py +6 -3
- xinference/core/model.py +9 -4
- xinference/core/scheduler.py +4 -4
- xinference/core/supervisor.py +2 -0
- xinference/core/worker.py +7 -0
- xinference/deploy/utils.py +6 -0
- xinference/model/audio/core.py +9 -4
- xinference/model/audio/cosyvoice.py +136 -0
- xinference/model/audio/model_spec.json +24 -0
- xinference/model/audio/model_spec_modelscope.json +27 -0
- xinference/model/core.py +25 -4
- xinference/model/embedding/core.py +88 -13
- xinference/model/embedding/model_spec.json +8 -0
- xinference/model/embedding/model_spec_modelscope.json +8 -0
- xinference/model/flexible/core.py +8 -2
- xinference/model/flexible/launchers/__init__.py +1 -0
- xinference/model/flexible/launchers/image_process_launcher.py +70 -0
- xinference/model/image/core.py +8 -5
- xinference/model/image/model_spec.json +36 -5
- xinference/model/image/model_spec_modelscope.json +21 -3
- xinference/model/image/stable_diffusion/core.py +36 -28
- xinference/model/llm/core.py +6 -4
- xinference/model/llm/ggml/llamacpp.py +7 -5
- xinference/model/llm/llm_family.json +802 -82
- xinference/model/llm/llm_family.py +6 -6
- xinference/model/llm/llm_family_csghub.json +39 -0
- xinference/model/llm/llm_family_modelscope.json +295 -47
- xinference/model/llm/mlx/core.py +7 -0
- xinference/model/llm/pytorch/chatglm.py +246 -5
- xinference/model/llm/pytorch/cogvlm2.py +1 -1
- xinference/model/llm/pytorch/deepseek_vl.py +2 -1
- xinference/model/llm/pytorch/falcon.py +2 -1
- xinference/model/llm/pytorch/llama_2.py +4 -2
- xinference/model/llm/pytorch/omnilmm.py +2 -1
- xinference/model/llm/pytorch/qwen_vl.py +2 -1
- xinference/model/llm/pytorch/vicuna.py +2 -1
- xinference/model/llm/pytorch/yi_vl.py +2 -1
- xinference/model/llm/sglang/core.py +12 -6
- xinference/model/llm/utils.py +78 -1
- xinference/model/llm/vllm/core.py +9 -5
- xinference/model/rerank/core.py +4 -3
- xinference/thirdparty/cosyvoice/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/bin/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/bin/inference.py +114 -0
- xinference/thirdparty/cosyvoice/bin/train.py +136 -0
- xinference/thirdparty/cosyvoice/cli/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/cli/cosyvoice.py +83 -0
- xinference/thirdparty/cosyvoice/cli/frontend.py +168 -0
- xinference/thirdparty/cosyvoice/cli/model.py +60 -0
- xinference/thirdparty/cosyvoice/dataset/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/dataset/dataset.py +160 -0
- xinference/thirdparty/cosyvoice/dataset/processor.py +369 -0
- xinference/thirdparty/cosyvoice/flow/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/flow/decoder.py +222 -0
- xinference/thirdparty/cosyvoice/flow/flow.py +135 -0
- xinference/thirdparty/cosyvoice/flow/flow_matching.py +138 -0
- xinference/thirdparty/cosyvoice/flow/length_regulator.py +49 -0
- xinference/thirdparty/cosyvoice/hifigan/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/hifigan/f0_predictor.py +55 -0
- xinference/thirdparty/cosyvoice/hifigan/generator.py +391 -0
- xinference/thirdparty/cosyvoice/llm/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/llm/llm.py +206 -0
- xinference/thirdparty/cosyvoice/transformer/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/transformer/activation.py +84 -0
- xinference/thirdparty/cosyvoice/transformer/attention.py +326 -0
- xinference/thirdparty/cosyvoice/transformer/convolution.py +145 -0
- xinference/thirdparty/cosyvoice/transformer/decoder.py +396 -0
- xinference/thirdparty/cosyvoice/transformer/decoder_layer.py +132 -0
- xinference/thirdparty/cosyvoice/transformer/embedding.py +293 -0
- xinference/thirdparty/cosyvoice/transformer/encoder.py +472 -0
- xinference/thirdparty/cosyvoice/transformer/encoder_layer.py +236 -0
- xinference/thirdparty/cosyvoice/transformer/label_smoothing_loss.py +96 -0
- xinference/thirdparty/cosyvoice/transformer/positionwise_feed_forward.py +115 -0
- xinference/thirdparty/cosyvoice/transformer/subsampling.py +383 -0
- xinference/thirdparty/cosyvoice/utils/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/utils/class_utils.py +70 -0
- xinference/thirdparty/cosyvoice/utils/common.py +103 -0
- xinference/thirdparty/cosyvoice/utils/executor.py +110 -0
- xinference/thirdparty/cosyvoice/utils/file_utils.py +41 -0
- xinference/thirdparty/cosyvoice/utils/frontend_utils.py +125 -0
- xinference/thirdparty/cosyvoice/utils/mask.py +227 -0
- xinference/thirdparty/cosyvoice/utils/scheduler.py +739 -0
- xinference/thirdparty/cosyvoice/utils/train_utils.py +289 -0
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/{main.95c1d652.js → main.af906659.js} +3 -3
- xinference/web/ui/build/static/js/main.af906659.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2cd5e4279ad7e13a1f41d486e9fca7756295bfad5bd77d90992f4ac3e10b496d.json +1 -0
- {xinference-0.13.2.dist-info → xinference-0.13.4.dist-info}/METADATA +39 -11
- {xinference-0.13.2.dist-info → xinference-0.13.4.dist-info}/RECORD +101 -57
- xinference/web/ui/build/static/js/main.95c1d652.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/709711edada3f1596b309d571285fd31f1c364d66f4425bc28723d0088cc351a.json +0 -1
- /xinference/web/ui/build/static/js/{main.95c1d652.js.LICENSE.txt → main.af906659.js.LICENSE.txt} +0 -0
- {xinference-0.13.2.dist-info → xinference-0.13.4.dist-info}/LICENSE +0 -0
- {xinference-0.13.2.dist-info → xinference-0.13.4.dist-info}/WHEEL +0 -0
- {xinference-0.13.2.dist-info → xinference-0.13.4.dist-info}/entry_points.txt +0 -0
- {xinference-0.13.2.dist-info → xinference-0.13.4.dist-info}/top_level.txt +0 -0
xinference/thirdparty/cosyvoice/flow/decoder.py
@@ -0,0 +1,222 @@
+# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.nn as nn
+from einops import pack, rearrange, repeat
+from matcha.models.components.decoder import SinusoidalPosEmb, Block1D, ResnetBlock1D, Downsample1D, TimestepEmbedding, Upsample1D
+from matcha.models.components.transformer import BasicTransformerBlock
+
+
+class ConditionalDecoder(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        channels=(256, 256),
+        dropout=0.05,
+        attention_head_dim=64,
+        n_blocks=1,
+        num_mid_blocks=2,
+        num_heads=4,
+        act_fn="snake",
+    ):
+        """
+        This decoder requires an input with the same shape of the target. So, if your text content
+        is shorter or longer than the outputs, please re-sampling it before feeding to the decoder.
+        """
+        super().__init__()
+        channels = tuple(channels)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+
+        self.time_embeddings = SinusoidalPosEmb(in_channels)
+        time_embed_dim = channels[0] * 4
+        self.time_mlp = TimestepEmbedding(
+            in_channels=in_channels,
+            time_embed_dim=time_embed_dim,
+            act_fn="silu",
+        )
+        self.down_blocks = nn.ModuleList([])
+        self.mid_blocks = nn.ModuleList([])
+        self.up_blocks = nn.ModuleList([])
+
+        output_channel = in_channels
+        for i in range(len(channels)):  # pylint: disable=consider-using-enumerate
+            input_channel = output_channel
+            output_channel = channels[i]
+            is_last = i == len(channels) - 1
+            resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
+            transformer_blocks = nn.ModuleList(
+                [
+                    BasicTransformerBlock(
+                        dim=output_channel,
+                        num_attention_heads=num_heads,
+                        attention_head_dim=attention_head_dim,
+                        dropout=dropout,
+                        activation_fn=act_fn,
+                    )
+                    for _ in range(n_blocks)
+                ]
+            )
+            downsample = (
+                Downsample1D(output_channel) if not is_last else nn.Conv1d(output_channel, output_channel, 3, padding=1)
+            )
+            self.down_blocks.append(nn.ModuleList([resnet, transformer_blocks, downsample]))
+
+        for i in range(num_mid_blocks):
+            input_channel = channels[-1]
+            out_channels = channels[-1]
+            resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
+
+            transformer_blocks = nn.ModuleList(
+                [
+                    BasicTransformerBlock(
+                        dim=output_channel,
+                        num_attention_heads=num_heads,
+                        attention_head_dim=attention_head_dim,
+                        dropout=dropout,
+                        activation_fn=act_fn,
+                    )
+                    for _ in range(n_blocks)
+                ]
+            )
+
+            self.mid_blocks.append(nn.ModuleList([resnet, transformer_blocks]))
+
+        channels = channels[::-1] + (channels[0],)
+        for i in range(len(channels) - 1):
+            input_channel = channels[i] * 2
+            output_channel = channels[i + 1]
+            is_last = i == len(channels) - 2
+            resnet = ResnetBlock1D(
+                dim=input_channel,
+                dim_out=output_channel,
+                time_emb_dim=time_embed_dim,
+            )
+            transformer_blocks = nn.ModuleList(
+                [
+                    BasicTransformerBlock(
+                        dim=output_channel,
+                        num_attention_heads=num_heads,
+                        attention_head_dim=attention_head_dim,
+                        dropout=dropout,
+                        activation_fn=act_fn,
+                    )
+                    for _ in range(n_blocks)
+                ]
+            )
+            upsample = (
+                Upsample1D(output_channel, use_conv_transpose=True)
+                if not is_last
+                else nn.Conv1d(output_channel, output_channel, 3, padding=1)
+            )
+            self.up_blocks.append(nn.ModuleList([resnet, transformer_blocks, upsample]))
+        self.final_block = Block1D(channels[-1], channels[-1])
+        self.final_proj = nn.Conv1d(channels[-1], self.out_channels, 1)
+        self.initialize_weights()
+
+
+    def initialize_weights(self):
+        for m in self.modules():
+            if isinstance(m, nn.Conv1d):
+                nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
+                if m.bias is not None:
+                    nn.init.constant_(m.bias, 0)
+            elif isinstance(m, nn.GroupNorm):
+                nn.init.constant_(m.weight, 1)
+                nn.init.constant_(m.bias, 0)
+            elif isinstance(m, nn.Linear):
+                nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
+                if m.bias is not None:
+                    nn.init.constant_(m.bias, 0)
+
+    def forward(self, x, mask, mu, t, spks=None, cond=None):
+        """Forward pass of the UNet1DConditional model.
+
+        Args:
+            x (torch.Tensor): shape (batch_size, in_channels, time)
+            mask (_type_): shape (batch_size, 1, time)
+            t (_type_): shape (batch_size)
+            spks (_type_, optional): shape: (batch_size, condition_channels). Defaults to None.
+            cond (_type_, optional): placeholder for future use. Defaults to None.
+
+        Raises:
+            ValueError: _description_
+            ValueError: _description_
+
+        Returns:
+            _type_: _description_
+        """
+
+        t = self.time_embeddings(t)
+        t = self.time_mlp(t)
+
+        x = pack([x, mu], "b * t")[0]
+
+        if spks is not None:
+            spks = repeat(spks, "b c -> b c t", t=x.shape[-1])
+            x = pack([x, spks], "b * t")[0]
+        if cond is not None:
+            x = pack([x, cond], "b * t")[0]
+
+        hiddens = []
+        masks = [mask]
+        for resnet, transformer_blocks, downsample in self.down_blocks:
+            mask_down = masks[-1]
+            x = resnet(x, mask_down, t)
+            x = rearrange(x, "b c t -> b t c").contiguous()
+            attn_mask = torch.matmul(mask_down.transpose(1, 2).contiguous(), mask_down)
+            for transformer_block in transformer_blocks:
+                x = transformer_block(
+                    hidden_states=x,
+                    attention_mask=attn_mask,
+                    timestep=t,
+                )
+            x = rearrange(x, "b t c -> b c t").contiguous()
+            hiddens.append(x)  # Save hidden states for skip connections
+            x = downsample(x * mask_down)
+            masks.append(mask_down[:, :, ::2])
+        masks = masks[:-1]
+        mask_mid = masks[-1]
+
+        for resnet, transformer_blocks in self.mid_blocks:
+            x = resnet(x, mask_mid, t)
+            x = rearrange(x, "b c t -> b t c").contiguous()
+            attn_mask = torch.matmul(mask_mid.transpose(1, 2).contiguous(), mask_mid)
+            for transformer_block in transformer_blocks:
+                x = transformer_block(
+                    hidden_states=x,
+                    attention_mask=attn_mask,
+                    timestep=t,
+                )
+            x = rearrange(x, "b t c -> b c t").contiguous()
+
+        for resnet, transformer_blocks, upsample in self.up_blocks:
+            mask_up = masks.pop()
+            skip = hiddens.pop()
+            x = pack([x[:, :, :skip.shape[-1]], skip], "b * t")[0]
+            x = resnet(x, mask_up, t)
+            x = rearrange(x, "b c t -> b t c").contiguous()
+            attn_mask = torch.matmul(mask_up.transpose(1, 2).contiguous(), mask_up)
+            for transformer_block in transformer_blocks:
+                x = transformer_block(
+                    hidden_states=x,
+                    attention_mask=attn_mask,
+                    timestep=t,
+                )
+            x = rearrange(x, "b t c -> b c t").contiguous()
+            x = upsample(x * mask_up)
+        x = self.final_block(x, mask_up)
+        output = self.final_proj(x * mask_up)
+        return output * mask
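Each stage of the down/up path in ConditionalDecoder rebuilds a square attention mask from the 1-D padding mask and then halves the time axis (mask_down[:, :, ::2]). The following is a minimal standalone sketch of just that masking arithmetic, not part of the diff; it only assumes torch and the (batch, 1, time) mask layout documented in forward:

import torch

# padding mask for two sequences of lengths 6 and 4, shape (B, 1, T)
mask = torch.tensor([[[1, 1, 1, 1, 1, 1]],
                     [[1, 1, 1, 1, 0, 0]]], dtype=torch.float32)

# square attention mask fed to the transformer blocks, shape (B, T, T)
attn_mask = torch.matmul(mask.transpose(1, 2), mask)

# mask handed to the next, downsampled stage: every second frame, shape (B, 1, T // 2)
mask_down = mask[:, :, ::2]

print(attn_mask.shape, mask_down.shape)  # torch.Size([2, 6, 6]) torch.Size([2, 1, 3])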
xinference/thirdparty/cosyvoice/flow/flow.py
@@ -0,0 +1,135 @@
+# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+from typing import Dict, Optional
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+from omegaconf import DictConfig
+from cosyvoice.utils.mask import make_pad_mask
+
+
+class MaskedDiffWithXvec(torch.nn.Module):
+    def __init__(self,
+                 input_size: int = 512,
+                 output_size: int = 80,
+                 spk_embed_dim: int = 192,
+                 output_type: str = "mel",
+                 vocab_size: int = 4096,
+                 input_frame_rate: int = 50,
+                 only_mask_loss: bool = True,
+                 encoder: torch.nn.Module = None,
+                 length_regulator: torch.nn.Module = None,
+                 decoder: torch.nn.Module = None,
+                 decoder_conf: Dict = {'in_channels': 240, 'out_channel': 80, 'spk_emb_dim': 80, 'n_spks': 1, 'cfm_params': DictConfig({'sigma_min': 1e-06, 'solver': 'euler', 't_scheduler': 'cosine', 'training_cfg_rate': 0.2, 'inference_cfg_rate': 0.7, 'reg_loss_type': 'l1'}), 'decoder_params': {'channels': [256, 256], 'dropout': 0.0, 'attention_head_dim': 64, 'n_blocks': 4, 'num_mid_blocks': 12, 'num_heads': 8, 'act_fn': 'gelu'}},
+                 mel_feat_conf: Dict = {'n_fft': 1024, 'num_mels': 80, 'sampling_rate': 22050, 'hop_size': 256, 'win_size': 1024, 'fmin': 0, 'fmax': 8000}):
+        super().__init__()
+        self.input_size = input_size
+        self.output_size = output_size
+        self.decoder_conf = decoder_conf
+        self.mel_feat_conf = mel_feat_conf
+        self.vocab_size = vocab_size
+        self.output_type = output_type
+        self.input_frame_rate = input_frame_rate
+        logging.info(f"input frame rate={self.input_frame_rate}")
+        self.input_embedding = nn.Embedding(vocab_size, input_size)
+        self.spk_embed_affine_layer = torch.nn.Linear(spk_embed_dim, output_size)
+        self.encoder = encoder
+        self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), output_size)
+        self.decoder = decoder
+        self.length_regulator = length_regulator
+        self.only_mask_loss = only_mask_loss
+
+    def forward(
+            self,
+            batch: dict,
+            device: torch.device,
+    ) -> Dict[str, Optional[torch.Tensor]]:
+        token = batch['speech_token'].to(device)
+        token_len = batch['speech_token_len'].to(device)
+        feat = batch['speech_feat'].to(device)
+        feat_len = batch['speech_feat_len'].to(device)
+        embedding = batch['embedding'].to(device)
+
+        # xvec projection
+        embedding = F.normalize(embedding, dim=1)
+        embedding = self.spk_embed_affine_layer(embedding)
+
+        # concat text and prompt_text
+        mask = (~make_pad_mask(token_len)).float().unsqueeze(-1).to(device)
+        token = self.input_embedding(torch.clamp(token, min=0)) * mask
+
+        # text encode
+        h, h_lengths = self.encoder(token, token_len)
+        h = self.encoder_proj(h)
+        h, h_lengths = self.length_regulator(h, feat_len)
+
+        # get conditions
+        conds = torch.zeros(feat.shape, device=token.device)
+        conds = conds.transpose(1, 2)
+
+        mask = (~make_pad_mask(feat_len)).to(h)
+        feat = F.interpolate(feat.unsqueeze(dim=1), size=h.shape[1:], mode="nearest").squeeze(dim=1)
+        loss, _ = self.decoder.compute_loss(
+            feat.transpose(1, 2).contiguous(),
+            mask.unsqueeze(1),
+            h.transpose(1, 2).contiguous(),
+            embedding,
+            cond=conds
+        )
+        return {'loss': loss}
+
+    @torch.inference_mode()
+    def inference(self,
+                  token,
+                  token_len,
+                  prompt_token,
+                  prompt_token_len,
+                  prompt_feat,
+                  prompt_feat_len,
+                  embedding):
+        assert token.shape[0] == 1
+        # xvec projection
+        embedding = F.normalize(embedding, dim=1)
+        embedding = self.spk_embed_affine_layer(embedding)
+
+        # concat text and prompt_text
+        token, token_len = torch.concat([prompt_token, token], dim=1), prompt_token_len + token_len
+        mask = (~make_pad_mask(token_len)).float().unsqueeze(-1).to(embedding)
+        token = self.input_embedding(torch.clamp(token, min=0)) * mask
+
+        # text encode
+        h, h_lengths = self.encoder(token, token_len)
+        h = self.encoder_proj(h)
+        feat_len = (token_len / 50 * 22050 / 256).int()
+        h, h_lengths = self.length_regulator(h, feat_len)
+
+        # get conditions
+        conds = torch.zeros([1, feat_len.max().item(), self.output_size], device=token.device)
+        if prompt_feat.shape[1] != 0:
+            for i, j in enumerate(prompt_feat_len):
+                conds[i, :j] = prompt_feat[i]
+        conds = conds.transpose(1, 2)
+
+        mask = (~make_pad_mask(feat_len)).to(h)
+        feat = self.decoder(
+            mu=h.transpose(1, 2).contiguous(),
+            mask=mask.unsqueeze(1),
+            spks=embedding,
+            cond=conds,
+            n_timesteps=10
+        )
+        if prompt_feat.shape[1] != 0:
+            feat = feat[:, :, prompt_feat.shape[1]:]
+        return feat
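In MaskedDiffWithXvec.inference, feat_len = (token_len / 50 * 22050 / 256).int() converts a count of 50 Hz speech tokens into a count of mel frames at the 22050 Hz sample rate and hop size 256 declared in mel_feat_conf. A standalone sketch of that conversion, using the default values above:

import torch

token_len = torch.tensor([100])   # 100 speech tokens at 50 Hz, i.e. 2.0 s of audio
seconds = token_len / 50          # input_frame_rate
samples = seconds * 22050         # mel_feat_conf['sampling_rate']
feat_len = (samples / 256).int()  # mel_feat_conf['hop_size'] -> number of mel frames
print(feat_len)                   # tensor([172], dtype=torch.int32)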
xinference/thirdparty/cosyvoice/flow/flow_matching.py
@@ -0,0 +1,138 @@
+# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.nn.functional as F
+from matcha.models.components.flow_matching import BASECFM
+
+class ConditionalCFM(BASECFM):
+    def __init__(self, in_channels, cfm_params, n_spks=1, spk_emb_dim=64, estimator: torch.nn.Module = None):
+        super().__init__(
+            n_feats=in_channels,
+            cfm_params=cfm_params,
+            n_spks=n_spks,
+            spk_emb_dim=spk_emb_dim,
+        )
+        self.t_scheduler = cfm_params.t_scheduler
+        self.training_cfg_rate = cfm_params.training_cfg_rate
+        self.inference_cfg_rate = cfm_params.inference_cfg_rate
+        in_channels = in_channels + (spk_emb_dim if n_spks > 0 else 0)
+        # Just change the architecture of the estimator here
+        self.estimator = estimator
+
+    @torch.inference_mode()
+    def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None):
+        """Forward diffusion
+
+        Args:
+            mu (torch.Tensor): output of encoder
+                shape: (batch_size, n_feats, mel_timesteps)
+            mask (torch.Tensor): output_mask
+                shape: (batch_size, 1, mel_timesteps)
+            n_timesteps (int): number of diffusion steps
+            temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
+            spks (torch.Tensor, optional): speaker ids. Defaults to None.
+                shape: (batch_size, spk_emb_dim)
+            cond: Not used but kept for future purposes
+
+        Returns:
+            sample: generated mel-spectrogram
+                shape: (batch_size, n_feats, mel_timesteps)
+        """
+        z = torch.randn_like(mu) * temperature
+        t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device)
+        if self.t_scheduler == 'cosine':
+            t_span = 1 - torch.cos(t_span * 0.5 * torch.pi)
+        return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond)
+
+    def solve_euler(self, x, t_span, mu, mask, spks, cond):
+        """
+        Fixed euler solver for ODEs.
+        Args:
+            x (torch.Tensor): random noise
+            t_span (torch.Tensor): n_timesteps interpolated
+                shape: (n_timesteps + 1,)
+            mu (torch.Tensor): output of encoder
+                shape: (batch_size, n_feats, mel_timesteps)
+            mask (torch.Tensor): output_mask
+                shape: (batch_size, 1, mel_timesteps)
+            spks (torch.Tensor, optional): speaker ids. Defaults to None.
+                shape: (batch_size, spk_emb_dim)
+            cond: Not used but kept for future purposes
+        """
+        t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0]
+
+        # I am storing this because I can later plot it by putting a debugger here and saving it to a file
+        # Or in future might add like a return_all_steps flag
+        sol = []
+
+        for step in range(1, len(t_span)):
+            dphi_dt = self.estimator(x, mask, mu, t, spks, cond)
+            # Classifier-Free Guidance inference introduced in VoiceBox
+            if self.inference_cfg_rate > 0:
+                cfg_dphi_dt = self.estimator(
+                    x, mask,
+                    torch.zeros_like(mu), t,
+                    torch.zeros_like(spks) if spks is not None else None,
+                    torch.zeros_like(cond)
+                )
+                dphi_dt = ((1.0 + self.inference_cfg_rate) * dphi_dt -
+                           self.inference_cfg_rate * cfg_dphi_dt)
+            x = x + dt * dphi_dt
+            t = t + dt
+            sol.append(x)
+            if step < len(t_span) - 1:
+                dt = t_span[step + 1] - t
+
+        return sol[-1]
+
+    def compute_loss(self, x1, mask, mu, spks=None, cond=None):
+        """Computes diffusion loss
+
+        Args:
+            x1 (torch.Tensor): Target
+                shape: (batch_size, n_feats, mel_timesteps)
+            mask (torch.Tensor): target mask
+                shape: (batch_size, 1, mel_timesteps)
+            mu (torch.Tensor): output of encoder
+                shape: (batch_size, n_feats, mel_timesteps)
+            spks (torch.Tensor, optional): speaker embedding. Defaults to None.
+                shape: (batch_size, spk_emb_dim)
+
+        Returns:
+            loss: conditional flow matching loss
+            y: conditional flow
+                shape: (batch_size, n_feats, mel_timesteps)
+        """
+        b, _, t = mu.shape
+
+        # random timestep
+        t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype)
+        if self.t_scheduler == 'cosine':
+            t = 1 - torch.cos(t * 0.5 * torch.pi)
+        # sample noise p(x_0)
+        z = torch.randn_like(x1)
+
+        y = (1 - (1 - self.sigma_min) * t) * z + t * x1
+        u = x1 - (1 - self.sigma_min) * z
+
+        # during training, we randomly drop condition to trade off mode coverage and sample fidelity
+        if self.training_cfg_rate > 0:
+            cfg_mask = torch.rand(b, device=x1.device) > self.training_cfg_rate
+            mu = mu * cfg_mask.view(-1, 1, 1)
+            spks = spks * cfg_mask.view(-1, 1)
+            cond = cond * cfg_mask.view(-1, 1, 1)
+
+        pred = self.estimator(y, mask, mu, t.squeeze(), spks, cond)
+        loss = F.mse_loss(pred * mask, u * mask, reduction="sum") / (torch.sum(mask) * u.shape[1])
+        return loss, y
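compute_loss trains the estimator to predict the flow-matching velocity u = x1 - (1 - sigma_min) * z at the interpolated point y, and solve_euler then integrates the learned velocity field with fixed Euler steps plus classifier-free guidance. The toy sketch below reproduces only that interpolation and the Euler update, with a dummy constant velocity standing in for the estimator; it is an illustration, not CosyVoice code:

import torch

sigma_min = 1e-6

# training-side construction, as in compute_loss
x1 = torch.randn(2, 80, 172)                # target mel, (B, n_feats, T)
z = torch.randn_like(x1)                    # noise sample x_0
t = torch.rand(2, 1, 1)                     # one random timestep per item
y = (1 - (1 - sigma_min) * t) * z + t * x1  # point on the (nearly) straight path from z to x1
u = x1 - (1 - sigma_min) * z                # velocity the estimator is trained to predict

# inference-side Euler integration, as in solve_euler, with u as a stand-in velocity
t_span = torch.linspace(0, 1, 10 + 1)
x = torch.randn_like(x1)
for step in range(1, len(t_span)):
    dt = t_span[step] - t_span[step - 1]
    x = x + dt * u                          # the real solver queries self.estimator here
print(x.shape)                              # torch.Size([2, 80, 172])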
xinference/thirdparty/cosyvoice/flow/length_regulator.py
@@ -0,0 +1,49 @@
+# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Tuple
+import torch.nn as nn
+from torch.nn import functional as F
+from cosyvoice.utils.mask import make_pad_mask
+
+
+class InterpolateRegulator(nn.Module):
+    def __init__(
+            self,
+            channels: int,
+            sampling_ratios: Tuple,
+            out_channels: int = None,
+            groups: int = 1,
+    ):
+        super().__init__()
+        self.sampling_ratios = sampling_ratios
+        out_channels = out_channels or channels
+        model = nn.ModuleList([])
+        if len(sampling_ratios) > 0:
+            for _ in sampling_ratios:
+                module = nn.Conv1d(channels, channels, 3, 1, 1)
+                norm = nn.GroupNorm(groups, channels)
+                act = nn.Mish()
+                model.extend([module, norm, act])
+        model.append(
+            nn.Conv1d(channels, out_channels, 1, 1)
+        )
+        self.model = nn.Sequential(*model)
+
+    def forward(self, x, ylens=None):
+        # x in (B, T, D)
+        mask = (~make_pad_mask(ylens)).to(x).unsqueeze(-1)
+        x = F.interpolate(x.transpose(1, 2).contiguous(), size=ylens.max(), mode='nearest')
+        out = self.model(x).transpose(1, 2).contiguous()
+        olens = ylens
+        return out * mask, olens
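InterpolateRegulator matches the encoder output to the target mel length by nearest-neighbor interpolation along time before its small conv stack. A standalone sketch of the core resizing step, assuming only torch and the (B, T, D) layout noted in forward:

import torch
import torch.nn.functional as F

x = torch.randn(1, 100, 512)       # encoder output, (B, T_in, D)
target_len = 172                   # ylens.max(): desired number of mel frames

# F.interpolate resizes (B, C, T), so treat the feature dim as channels
resized = F.interpolate(x.transpose(1, 2), size=target_len, mode="nearest")
resized = resized.transpose(1, 2)  # back to (B, T_out, D)
print(resized.shape)               # torch.Size([1, 172, 512])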
xinference/thirdparty/cosyvoice/hifigan/__init__.py
File without changes
xinference/thirdparty/cosyvoice/hifigan/f0_predictor.py
@@ -0,0 +1,55 @@
+# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Kai Hu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.nn as nn
+from torch.nn.utils import weight_norm
+
+
+class ConvRNNF0Predictor(nn.Module):
+    def __init__(self,
+                 num_class: int = 1,
+                 in_channels: int = 80,
+                 cond_channels: int = 512
+                 ):
+        super().__init__()
+
+        self.num_class = num_class
+        self.condnet = nn.Sequential(
+            weight_norm(
+                nn.Conv1d(in_channels, cond_channels, kernel_size=3, padding=1)
+            ),
+            nn.ELU(),
+            weight_norm(
+                nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
+            ),
+            nn.ELU(),
+            weight_norm(
+                nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
+            ),
+            nn.ELU(),
+            weight_norm(
+                nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
+            ),
+            nn.ELU(),
+            weight_norm(
+                nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
+            ),
+            nn.ELU(),
+        )
+        self.classifier = nn.Linear(in_features=cond_channels, out_features=self.num_class)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.condnet(x)
+        x = x.transpose(1, 2)
+        return torch.abs(self.classifier(x).squeeze(-1))
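ConvRNNF0Predictor maps an 80-bin mel spectrogram to one non-negative F0 value per frame: five weight-normalized Conv1d/ELU layers over (B, in_channels, T), a linear classifier over the channel dim, and a final abs. A shape-check sketch, assuming this wheel is installed so the vendored module path from the file list above is importable:

import torch
# path as vendored in this wheel (see the file list above)
from xinference.thirdparty.cosyvoice.hifigan.f0_predictor import ConvRNNF0Predictor

predictor = ConvRNNF0Predictor(num_class=1, in_channels=80, cond_channels=512)
mel = torch.randn(2, 80, 172)     # (B, in_channels, T)
f0 = predictor(mel)               # (B, T), non-negative after torch.abs
print(f0.shape, (f0 >= 0).all())  # torch.Size([2, 172]) tensor(True)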