xinference 0.13.2__py3-none-any.whl → 0.13.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xinference might be problematic. Click here for more details.

Files changed (103) hide show
  1. xinference/__init__.py +0 -1
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +30 -5
  4. xinference/client/restful/restful_client.py +18 -3
  5. xinference/constants.py +0 -4
  6. xinference/core/chat_interface.py +2 -2
  7. xinference/core/image_interface.py +6 -3
  8. xinference/core/model.py +9 -4
  9. xinference/core/scheduler.py +4 -4
  10. xinference/core/supervisor.py +2 -0
  11. xinference/core/worker.py +7 -0
  12. xinference/deploy/utils.py +6 -0
  13. xinference/model/audio/core.py +9 -4
  14. xinference/model/audio/cosyvoice.py +136 -0
  15. xinference/model/audio/model_spec.json +24 -0
  16. xinference/model/audio/model_spec_modelscope.json +27 -0
  17. xinference/model/core.py +25 -4
  18. xinference/model/embedding/core.py +88 -13
  19. xinference/model/embedding/model_spec.json +8 -0
  20. xinference/model/embedding/model_spec_modelscope.json +8 -0
  21. xinference/model/flexible/core.py +8 -2
  22. xinference/model/flexible/launchers/__init__.py +1 -0
  23. xinference/model/flexible/launchers/image_process_launcher.py +70 -0
  24. xinference/model/image/core.py +8 -5
  25. xinference/model/image/model_spec.json +36 -5
  26. xinference/model/image/model_spec_modelscope.json +21 -3
  27. xinference/model/image/stable_diffusion/core.py +36 -28
  28. xinference/model/llm/core.py +6 -4
  29. xinference/model/llm/ggml/llamacpp.py +7 -5
  30. xinference/model/llm/llm_family.json +802 -82
  31. xinference/model/llm/llm_family.py +6 -6
  32. xinference/model/llm/llm_family_csghub.json +39 -0
  33. xinference/model/llm/llm_family_modelscope.json +295 -47
  34. xinference/model/llm/mlx/core.py +7 -0
  35. xinference/model/llm/pytorch/chatglm.py +246 -5
  36. xinference/model/llm/pytorch/cogvlm2.py +1 -1
  37. xinference/model/llm/pytorch/deepseek_vl.py +2 -1
  38. xinference/model/llm/pytorch/falcon.py +2 -1
  39. xinference/model/llm/pytorch/llama_2.py +4 -2
  40. xinference/model/llm/pytorch/omnilmm.py +2 -1
  41. xinference/model/llm/pytorch/qwen_vl.py +2 -1
  42. xinference/model/llm/pytorch/vicuna.py +2 -1
  43. xinference/model/llm/pytorch/yi_vl.py +2 -1
  44. xinference/model/llm/sglang/core.py +12 -6
  45. xinference/model/llm/utils.py +78 -1
  46. xinference/model/llm/vllm/core.py +9 -5
  47. xinference/model/rerank/core.py +4 -3
  48. xinference/thirdparty/cosyvoice/__init__.py +0 -0
  49. xinference/thirdparty/cosyvoice/bin/__init__.py +0 -0
  50. xinference/thirdparty/cosyvoice/bin/inference.py +114 -0
  51. xinference/thirdparty/cosyvoice/bin/train.py +136 -0
  52. xinference/thirdparty/cosyvoice/cli/__init__.py +0 -0
  53. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +83 -0
  54. xinference/thirdparty/cosyvoice/cli/frontend.py +168 -0
  55. xinference/thirdparty/cosyvoice/cli/model.py +60 -0
  56. xinference/thirdparty/cosyvoice/dataset/__init__.py +0 -0
  57. xinference/thirdparty/cosyvoice/dataset/dataset.py +160 -0
  58. xinference/thirdparty/cosyvoice/dataset/processor.py +369 -0
  59. xinference/thirdparty/cosyvoice/flow/__init__.py +0 -0
  60. xinference/thirdparty/cosyvoice/flow/decoder.py +222 -0
  61. xinference/thirdparty/cosyvoice/flow/flow.py +135 -0
  62. xinference/thirdparty/cosyvoice/flow/flow_matching.py +138 -0
  63. xinference/thirdparty/cosyvoice/flow/length_regulator.py +49 -0
  64. xinference/thirdparty/cosyvoice/hifigan/__init__.py +0 -0
  65. xinference/thirdparty/cosyvoice/hifigan/f0_predictor.py +55 -0
  66. xinference/thirdparty/cosyvoice/hifigan/generator.py +391 -0
  67. xinference/thirdparty/cosyvoice/llm/__init__.py +0 -0
  68. xinference/thirdparty/cosyvoice/llm/llm.py +206 -0
  69. xinference/thirdparty/cosyvoice/transformer/__init__.py +0 -0
  70. xinference/thirdparty/cosyvoice/transformer/activation.py +84 -0
  71. xinference/thirdparty/cosyvoice/transformer/attention.py +326 -0
  72. xinference/thirdparty/cosyvoice/transformer/convolution.py +145 -0
  73. xinference/thirdparty/cosyvoice/transformer/decoder.py +396 -0
  74. xinference/thirdparty/cosyvoice/transformer/decoder_layer.py +132 -0
  75. xinference/thirdparty/cosyvoice/transformer/embedding.py +293 -0
  76. xinference/thirdparty/cosyvoice/transformer/encoder.py +472 -0
  77. xinference/thirdparty/cosyvoice/transformer/encoder_layer.py +236 -0
  78. xinference/thirdparty/cosyvoice/transformer/label_smoothing_loss.py +96 -0
  79. xinference/thirdparty/cosyvoice/transformer/positionwise_feed_forward.py +115 -0
  80. xinference/thirdparty/cosyvoice/transformer/subsampling.py +383 -0
  81. xinference/thirdparty/cosyvoice/utils/__init__.py +0 -0
  82. xinference/thirdparty/cosyvoice/utils/class_utils.py +70 -0
  83. xinference/thirdparty/cosyvoice/utils/common.py +103 -0
  84. xinference/thirdparty/cosyvoice/utils/executor.py +110 -0
  85. xinference/thirdparty/cosyvoice/utils/file_utils.py +41 -0
  86. xinference/thirdparty/cosyvoice/utils/frontend_utils.py +125 -0
  87. xinference/thirdparty/cosyvoice/utils/mask.py +227 -0
  88. xinference/thirdparty/cosyvoice/utils/scheduler.py +739 -0
  89. xinference/thirdparty/cosyvoice/utils/train_utils.py +289 -0
  90. xinference/web/ui/build/asset-manifest.json +3 -3
  91. xinference/web/ui/build/index.html +1 -1
  92. xinference/web/ui/build/static/js/{main.95c1d652.js → main.af906659.js} +3 -3
  93. xinference/web/ui/build/static/js/main.af906659.js.map +1 -0
  94. xinference/web/ui/node_modules/.cache/babel-loader/2cd5e4279ad7e13a1f41d486e9fca7756295bfad5bd77d90992f4ac3e10b496d.json +1 -0
  95. {xinference-0.13.2.dist-info → xinference-0.13.4.dist-info}/METADATA +39 -11
  96. {xinference-0.13.2.dist-info → xinference-0.13.4.dist-info}/RECORD +101 -57
  97. xinference/web/ui/build/static/js/main.95c1d652.js.map +0 -1
  98. xinference/web/ui/node_modules/.cache/babel-loader/709711edada3f1596b309d571285fd31f1c364d66f4425bc28723d0088cc351a.json +0 -1
  99. /xinference/web/ui/build/static/js/{main.95c1d652.js.LICENSE.txt → main.af906659.js.LICENSE.txt} +0 -0
  100. {xinference-0.13.2.dist-info → xinference-0.13.4.dist-info}/LICENSE +0 -0
  101. {xinference-0.13.2.dist-info → xinference-0.13.4.dist-info}/WHEEL +0 -0
  102. {xinference-0.13.2.dist-info → xinference-0.13.4.dist-info}/entry_points.txt +0 -0
  103. {xinference-0.13.2.dist-info → xinference-0.13.4.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,222 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import torch
15
+ import torch.nn as nn
16
+ from einops import pack, rearrange, repeat
17
+ from matcha.models.components.decoder import SinusoidalPosEmb, Block1D, ResnetBlock1D, Downsample1D, TimestepEmbedding, Upsample1D
18
+ from matcha.models.components.transformer import BasicTransformerBlock
19
+
20
+
21
+ class ConditionalDecoder(nn.Module):
22
+ def __init__(
23
+ self,
24
+ in_channels,
25
+ out_channels,
26
+ channels=(256, 256),
27
+ dropout=0.05,
28
+ attention_head_dim=64,
29
+ n_blocks=1,
30
+ num_mid_blocks=2,
31
+ num_heads=4,
32
+ act_fn="snake",
33
+ ):
34
+ """
35
+ This decoder requires an input with the same shape of the target. So, if your text content
36
+ is shorter or longer than the outputs, please re-sampling it before feeding to the decoder.
37
+ """
38
+ super().__init__()
39
+ channels = tuple(channels)
40
+ self.in_channels = in_channels
41
+ self.out_channels = out_channels
42
+
43
+ self.time_embeddings = SinusoidalPosEmb(in_channels)
44
+ time_embed_dim = channels[0] * 4
45
+ self.time_mlp = TimestepEmbedding(
46
+ in_channels=in_channels,
47
+ time_embed_dim=time_embed_dim,
48
+ act_fn="silu",
49
+ )
50
+ self.down_blocks = nn.ModuleList([])
51
+ self.mid_blocks = nn.ModuleList([])
52
+ self.up_blocks = nn.ModuleList([])
53
+
54
+ output_channel = in_channels
55
+ for i in range(len(channels)): # pylint: disable=consider-using-enumerate
56
+ input_channel = output_channel
57
+ output_channel = channels[i]
58
+ is_last = i == len(channels) - 1
59
+ resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
60
+ transformer_blocks = nn.ModuleList(
61
+ [
62
+ BasicTransformerBlock(
63
+ dim=output_channel,
64
+ num_attention_heads=num_heads,
65
+ attention_head_dim=attention_head_dim,
66
+ dropout=dropout,
67
+ activation_fn=act_fn,
68
+ )
69
+ for _ in range(n_blocks)
70
+ ]
71
+ )
72
+ downsample = (
73
+ Downsample1D(output_channel) if not is_last else nn.Conv1d(output_channel, output_channel, 3, padding=1)
74
+ )
75
+ self.down_blocks.append(nn.ModuleList([resnet, transformer_blocks, downsample]))
76
+
77
+ for i in range(num_mid_blocks):
78
+ input_channel = channels[-1]
79
+ out_channels = channels[-1]
80
+ resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
81
+
82
+ transformer_blocks = nn.ModuleList(
83
+ [
84
+ BasicTransformerBlock(
85
+ dim=output_channel,
86
+ num_attention_heads=num_heads,
87
+ attention_head_dim=attention_head_dim,
88
+ dropout=dropout,
89
+ activation_fn=act_fn,
90
+ )
91
+ for _ in range(n_blocks)
92
+ ]
93
+ )
94
+
95
+ self.mid_blocks.append(nn.ModuleList([resnet, transformer_blocks]))
96
+
97
+ channels = channels[::-1] + (channels[0],)
98
+ for i in range(len(channels) - 1):
99
+ input_channel = channels[i] * 2
100
+ output_channel = channels[i + 1]
101
+ is_last = i == len(channels) - 2
102
+ resnet = ResnetBlock1D(
103
+ dim=input_channel,
104
+ dim_out=output_channel,
105
+ time_emb_dim=time_embed_dim,
106
+ )
107
+ transformer_blocks = nn.ModuleList(
108
+ [
109
+ BasicTransformerBlock(
110
+ dim=output_channel,
111
+ num_attention_heads=num_heads,
112
+ attention_head_dim=attention_head_dim,
113
+ dropout=dropout,
114
+ activation_fn=act_fn,
115
+ )
116
+ for _ in range(n_blocks)
117
+ ]
118
+ )
119
+ upsample = (
120
+ Upsample1D(output_channel, use_conv_transpose=True)
121
+ if not is_last
122
+ else nn.Conv1d(output_channel, output_channel, 3, padding=1)
123
+ )
124
+ self.up_blocks.append(nn.ModuleList([resnet, transformer_blocks, upsample]))
125
+ self.final_block = Block1D(channels[-1], channels[-1])
126
+ self.final_proj = nn.Conv1d(channels[-1], self.out_channels, 1)
127
+ self.initialize_weights()
128
+
129
+
130
+ def initialize_weights(self):
131
+ for m in self.modules():
132
+ if isinstance(m, nn.Conv1d):
133
+ nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
134
+ if m.bias is not None:
135
+ nn.init.constant_(m.bias, 0)
136
+ elif isinstance(m, nn.GroupNorm):
137
+ nn.init.constant_(m.weight, 1)
138
+ nn.init.constant_(m.bias, 0)
139
+ elif isinstance(m, nn.Linear):
140
+ nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
141
+ if m.bias is not None:
142
+ nn.init.constant_(m.bias, 0)
143
+
144
+ def forward(self, x, mask, mu, t, spks=None, cond=None):
145
+ """Forward pass of the UNet1DConditional model.
146
+
147
+ Args:
148
+ x (torch.Tensor): shape (batch_size, in_channels, time)
149
+ mask (_type_): shape (batch_size, 1, time)
150
+ t (_type_): shape (batch_size)
151
+ spks (_type_, optional): shape: (batch_size, condition_channels). Defaults to None.
152
+ cond (_type_, optional): placeholder for future use. Defaults to None.
153
+
154
+ Raises:
155
+ ValueError: _description_
156
+ ValueError: _description_
157
+
158
+ Returns:
159
+ _type_: _description_
160
+ """
161
+
162
+ t = self.time_embeddings(t)
163
+ t = self.time_mlp(t)
164
+
165
+ x = pack([x, mu], "b * t")[0]
166
+
167
+ if spks is not None:
168
+ spks = repeat(spks, "b c -> b c t", t=x.shape[-1])
169
+ x = pack([x, spks], "b * t")[0]
170
+ if cond is not None:
171
+ x = pack([x, cond], "b * t")[0]
172
+
173
+ hiddens = []
174
+ masks = [mask]
175
+ for resnet, transformer_blocks, downsample in self.down_blocks:
176
+ mask_down = masks[-1]
177
+ x = resnet(x, mask_down, t)
178
+ x = rearrange(x, "b c t -> b t c").contiguous()
179
+ attn_mask = torch.matmul(mask_down.transpose(1, 2).contiguous(), mask_down)
180
+ for transformer_block in transformer_blocks:
181
+ x = transformer_block(
182
+ hidden_states=x,
183
+ attention_mask=attn_mask,
184
+ timestep=t,
185
+ )
186
+ x = rearrange(x, "b t c -> b c t").contiguous()
187
+ hiddens.append(x) # Save hidden states for skip connections
188
+ x = downsample(x * mask_down)
189
+ masks.append(mask_down[:, :, ::2])
190
+ masks = masks[:-1]
191
+ mask_mid = masks[-1]
192
+
193
+ for resnet, transformer_blocks in self.mid_blocks:
194
+ x = resnet(x, mask_mid, t)
195
+ x = rearrange(x, "b c t -> b t c").contiguous()
196
+ attn_mask = torch.matmul(mask_mid.transpose(1, 2).contiguous(), mask_mid)
197
+ for transformer_block in transformer_blocks:
198
+ x = transformer_block(
199
+ hidden_states=x,
200
+ attention_mask=attn_mask,
201
+ timestep=t,
202
+ )
203
+ x = rearrange(x, "b t c -> b c t").contiguous()
204
+
205
+ for resnet, transformer_blocks, upsample in self.up_blocks:
206
+ mask_up = masks.pop()
207
+ skip = hiddens.pop()
208
+ x = pack([x[:, :, :skip.shape[-1]], skip], "b * t")[0]
209
+ x = resnet(x, mask_up, t)
210
+ x = rearrange(x, "b c t -> b t c").contiguous()
211
+ attn_mask = torch.matmul(mask_up.transpose(1, 2).contiguous(), mask_up)
212
+ for transformer_block in transformer_blocks:
213
+ x = transformer_block(
214
+ hidden_states=x,
215
+ attention_mask=attn_mask,
216
+ timestep=t,
217
+ )
218
+ x = rearrange(x, "b t c -> b c t").contiguous()
219
+ x = upsample(x * mask_up)
220
+ x = self.final_block(x, mask_up)
221
+ output = self.final_proj(x * mask_up)
222
+ return output * mask
@@ -0,0 +1,135 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import logging
15
+ from typing import Dict, Optional
16
+ import torch
17
+ import torch.nn as nn
18
+ from torch.nn import functional as F
19
+ from omegaconf import DictConfig
20
+ from cosyvoice.utils.mask import make_pad_mask
21
+
22
+
23
+ class MaskedDiffWithXvec(torch.nn.Module):
24
+ def __init__(self,
25
+ input_size: int = 512,
26
+ output_size: int = 80,
27
+ spk_embed_dim: int = 192,
28
+ output_type: str = "mel",
29
+ vocab_size: int = 4096,
30
+ input_frame_rate: int = 50,
31
+ only_mask_loss: bool = True,
32
+ encoder: torch.nn.Module = None,
33
+ length_regulator: torch.nn.Module = None,
34
+ decoder: torch.nn.Module = None,
35
+ decoder_conf: Dict = {'in_channels': 240, 'out_channel': 80, 'spk_emb_dim': 80, 'n_spks': 1, 'cfm_params': DictConfig({'sigma_min': 1e-06, 'solver': 'euler', 't_scheduler': 'cosine', 'training_cfg_rate': 0.2, 'inference_cfg_rate': 0.7, 'reg_loss_type': 'l1'}), 'decoder_params': {'channels': [256, 256], 'dropout': 0.0, 'attention_head_dim': 64, 'n_blocks': 4, 'num_mid_blocks': 12, 'num_heads': 8, 'act_fn': 'gelu'}},
36
+ mel_feat_conf: Dict = {'n_fft': 1024, 'num_mels': 80, 'sampling_rate': 22050, 'hop_size': 256, 'win_size': 1024, 'fmin': 0, 'fmax': 8000}):
37
+ super().__init__()
38
+ self.input_size = input_size
39
+ self.output_size = output_size
40
+ self.decoder_conf = decoder_conf
41
+ self.mel_feat_conf = mel_feat_conf
42
+ self.vocab_size = vocab_size
43
+ self.output_type = output_type
44
+ self.input_frame_rate = input_frame_rate
45
+ logging.info(f"input frame rate={self.input_frame_rate}")
46
+ self.input_embedding = nn.Embedding(vocab_size, input_size)
47
+ self.spk_embed_affine_layer = torch.nn.Linear(spk_embed_dim, output_size)
48
+ self.encoder = encoder
49
+ self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), output_size)
50
+ self.decoder = decoder
51
+ self.length_regulator = length_regulator
52
+ self.only_mask_loss = only_mask_loss
53
+
54
+ def forward(
55
+ self,
56
+ batch: dict,
57
+ device: torch.device,
58
+ ) -> Dict[str, Optional[torch.Tensor]]:
59
+ token = batch['speech_token'].to(device)
60
+ token_len = batch['speech_token_len'].to(device)
61
+ feat = batch['speech_feat'].to(device)
62
+ feat_len = batch['speech_feat_len'].to(device)
63
+ embedding = batch['embedding'].to(device)
64
+
65
+ # xvec projection
66
+ embedding = F.normalize(embedding, dim=1)
67
+ embedding = self.spk_embed_affine_layer(embedding)
68
+
69
+ # concat text and prompt_text
70
+ mask = (~make_pad_mask(token_len)).float().unsqueeze(-1).to(device)
71
+ token = self.input_embedding(torch.clamp(token, min=0)) * mask
72
+
73
+ # text encode
74
+ h, h_lengths = self.encoder(token, token_len)
75
+ h = self.encoder_proj(h)
76
+ h, h_lengths = self.length_regulator(h, feat_len)
77
+
78
+ # get conditions
79
+ conds = torch.zeros(feat.shape, device=token.device)
80
+ conds = conds.transpose(1, 2)
81
+
82
+ mask = (~make_pad_mask(feat_len)).to(h)
83
+ feat = F.interpolate(feat.unsqueeze(dim=1), size=h.shape[1:], mode="nearest").squeeze(dim=1)
84
+ loss, _ = self.decoder.compute_loss(
85
+ feat.transpose(1, 2).contiguous(),
86
+ mask.unsqueeze(1),
87
+ h.transpose(1, 2).contiguous(),
88
+ embedding,
89
+ cond=conds
90
+ )
91
+ return {'loss': loss}
92
+
93
+ @torch.inference_mode()
94
+ def inference(self,
95
+ token,
96
+ token_len,
97
+ prompt_token,
98
+ prompt_token_len,
99
+ prompt_feat,
100
+ prompt_feat_len,
101
+ embedding):
102
+ assert token.shape[0] == 1
103
+ # xvec projection
104
+ embedding = F.normalize(embedding, dim=1)
105
+ embedding = self.spk_embed_affine_layer(embedding)
106
+
107
+ # concat text and prompt_text
108
+ token, token_len = torch.concat([prompt_token, token], dim=1), prompt_token_len + token_len
109
+ mask = (~make_pad_mask(token_len)).float().unsqueeze(-1).to(embedding)
110
+ token = self.input_embedding(torch.clamp(token, min=0)) * mask
111
+
112
+ # text encode
113
+ h, h_lengths = self.encoder(token, token_len)
114
+ h = self.encoder_proj(h)
115
+ feat_len = (token_len / 50 * 22050 / 256).int()
116
+ h, h_lengths = self.length_regulator(h, feat_len)
117
+
118
+ # get conditions
119
+ conds = torch.zeros([1, feat_len.max().item(), self.output_size], device=token.device)
120
+ if prompt_feat.shape[1] != 0:
121
+ for i, j in enumerate(prompt_feat_len):
122
+ conds[i, :j] = prompt_feat[i]
123
+ conds = conds.transpose(1, 2)
124
+
125
+ mask = (~make_pad_mask(feat_len)).to(h)
126
+ feat = self.decoder(
127
+ mu=h.transpose(1, 2).contiguous(),
128
+ mask=mask.unsqueeze(1),
129
+ spks=embedding,
130
+ cond=conds,
131
+ n_timesteps=10
132
+ )
133
+ if prompt_feat.shape[1] != 0:
134
+ feat = feat[:, :, prompt_feat.shape[1]:]
135
+ return feat
@@ -0,0 +1,138 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import torch
15
+ import torch.nn.functional as F
16
+ from matcha.models.components.flow_matching import BASECFM
17
+
18
+ class ConditionalCFM(BASECFM):
19
+ def __init__(self, in_channels, cfm_params, n_spks=1, spk_emb_dim=64, estimator: torch.nn.Module = None):
20
+ super().__init__(
21
+ n_feats=in_channels,
22
+ cfm_params=cfm_params,
23
+ n_spks=n_spks,
24
+ spk_emb_dim=spk_emb_dim,
25
+ )
26
+ self.t_scheduler = cfm_params.t_scheduler
27
+ self.training_cfg_rate = cfm_params.training_cfg_rate
28
+ self.inference_cfg_rate = cfm_params.inference_cfg_rate
29
+ in_channels = in_channels + (spk_emb_dim if n_spks > 0 else 0)
30
+ # Just change the architecture of the estimator here
31
+ self.estimator = estimator
32
+
33
+ @torch.inference_mode()
34
+ def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None):
35
+ """Forward diffusion
36
+
37
+ Args:
38
+ mu (torch.Tensor): output of encoder
39
+ shape: (batch_size, n_feats, mel_timesteps)
40
+ mask (torch.Tensor): output_mask
41
+ shape: (batch_size, 1, mel_timesteps)
42
+ n_timesteps (int): number of diffusion steps
43
+ temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
44
+ spks (torch.Tensor, optional): speaker ids. Defaults to None.
45
+ shape: (batch_size, spk_emb_dim)
46
+ cond: Not used but kept for future purposes
47
+
48
+ Returns:
49
+ sample: generated mel-spectrogram
50
+ shape: (batch_size, n_feats, mel_timesteps)
51
+ """
52
+ z = torch.randn_like(mu) * temperature
53
+ t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device)
54
+ if self.t_scheduler == 'cosine':
55
+ t_span = 1 - torch.cos(t_span * 0.5 * torch.pi)
56
+ return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond)
57
+
58
+ def solve_euler(self, x, t_span, mu, mask, spks, cond):
59
+ """
60
+ Fixed euler solver for ODEs.
61
+ Args:
62
+ x (torch.Tensor): random noise
63
+ t_span (torch.Tensor): n_timesteps interpolated
64
+ shape: (n_timesteps + 1,)
65
+ mu (torch.Tensor): output of encoder
66
+ shape: (batch_size, n_feats, mel_timesteps)
67
+ mask (torch.Tensor): output_mask
68
+ shape: (batch_size, 1, mel_timesteps)
69
+ spks (torch.Tensor, optional): speaker ids. Defaults to None.
70
+ shape: (batch_size, spk_emb_dim)
71
+ cond: Not used but kept for future purposes
72
+ """
73
+ t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0]
74
+
75
+ # I am storing this because I can later plot it by putting a debugger here and saving it to a file
76
+ # Or in future might add like a return_all_steps flag
77
+ sol = []
78
+
79
+ for step in range(1, len(t_span)):
80
+ dphi_dt = self.estimator(x, mask, mu, t, spks, cond)
81
+ # Classifier-Free Guidance inference introduced in VoiceBox
82
+ if self.inference_cfg_rate > 0:
83
+ cfg_dphi_dt = self.estimator(
84
+ x, mask,
85
+ torch.zeros_like(mu), t,
86
+ torch.zeros_like(spks) if spks is not None else None,
87
+ torch.zeros_like(cond)
88
+ )
89
+ dphi_dt = ((1.0 + self.inference_cfg_rate) * dphi_dt -
90
+ self.inference_cfg_rate * cfg_dphi_dt)
91
+ x = x + dt * dphi_dt
92
+ t = t + dt
93
+ sol.append(x)
94
+ if step < len(t_span) - 1:
95
+ dt = t_span[step + 1] - t
96
+
97
+ return sol[-1]
98
+
99
+ def compute_loss(self, x1, mask, mu, spks=None, cond=None):
100
+ """Computes diffusion loss
101
+
102
+ Args:
103
+ x1 (torch.Tensor): Target
104
+ shape: (batch_size, n_feats, mel_timesteps)
105
+ mask (torch.Tensor): target mask
106
+ shape: (batch_size, 1, mel_timesteps)
107
+ mu (torch.Tensor): output of encoder
108
+ shape: (batch_size, n_feats, mel_timesteps)
109
+ spks (torch.Tensor, optional): speaker embedding. Defaults to None.
110
+ shape: (batch_size, spk_emb_dim)
111
+
112
+ Returns:
113
+ loss: conditional flow matching loss
114
+ y: conditional flow
115
+ shape: (batch_size, n_feats, mel_timesteps)
116
+ """
117
+ b, _, t = mu.shape
118
+
119
+ # random timestep
120
+ t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype)
121
+ if self.t_scheduler == 'cosine':
122
+ t = 1 - torch.cos(t * 0.5 * torch.pi)
123
+ # sample noise p(x_0)
124
+ z = torch.randn_like(x1)
125
+
126
+ y = (1 - (1 - self.sigma_min) * t) * z + t * x1
127
+ u = x1 - (1 - self.sigma_min) * z
128
+
129
+ # during training, we randomly drop condition to trade off mode coverage and sample fidelity
130
+ if self.training_cfg_rate > 0:
131
+ cfg_mask = torch.rand(b, device=x1.device) > self.training_cfg_rate
132
+ mu = mu * cfg_mask.view(-1, 1, 1)
133
+ spks = spks * cfg_mask.view(-1, 1)
134
+ cond = cond * cfg_mask.view(-1, 1, 1)
135
+
136
+ pred = self.estimator(y, mask, mu, t.squeeze(), spks, cond)
137
+ loss = F.mse_loss(pred * mask, u * mask, reduction="sum") / (torch.sum(mask) * u.shape[1])
138
+ return loss, y
@@ -0,0 +1,49 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import Tuple
15
+ import torch.nn as nn
16
+ from torch.nn import functional as F
17
+ from cosyvoice.utils.mask import make_pad_mask
18
+
19
+
20
+ class InterpolateRegulator(nn.Module):
21
+ def __init__(
22
+ self,
23
+ channels: int,
24
+ sampling_ratios: Tuple,
25
+ out_channels: int = None,
26
+ groups: int = 1,
27
+ ):
28
+ super().__init__()
29
+ self.sampling_ratios = sampling_ratios
30
+ out_channels = out_channels or channels
31
+ model = nn.ModuleList([])
32
+ if len(sampling_ratios) > 0:
33
+ for _ in sampling_ratios:
34
+ module = nn.Conv1d(channels, channels, 3, 1, 1)
35
+ norm = nn.GroupNorm(groups, channels)
36
+ act = nn.Mish()
37
+ model.extend([module, norm, act])
38
+ model.append(
39
+ nn.Conv1d(channels, out_channels, 1, 1)
40
+ )
41
+ self.model = nn.Sequential(*model)
42
+
43
+ def forward(self, x, ylens=None):
44
+ # x in (B, T, D)
45
+ mask = (~make_pad_mask(ylens)).to(x).unsqueeze(-1)
46
+ x = F.interpolate(x.transpose(1, 2).contiguous(), size=ylens.max(), mode='nearest')
47
+ out = self.model(x).transpose(1, 2).contiguous()
48
+ olens = ylens
49
+ return out * mask, olens
File without changes
@@ -0,0 +1,55 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Kai Hu)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import torch
15
+ import torch.nn as nn
16
+ from torch.nn.utils import weight_norm
17
+
18
+
19
+ class ConvRNNF0Predictor(nn.Module):
20
+ def __init__(self,
21
+ num_class: int = 1,
22
+ in_channels: int = 80,
23
+ cond_channels: int = 512
24
+ ):
25
+ super().__init__()
26
+
27
+ self.num_class = num_class
28
+ self.condnet = nn.Sequential(
29
+ weight_norm(
30
+ nn.Conv1d(in_channels, cond_channels, kernel_size=3, padding=1)
31
+ ),
32
+ nn.ELU(),
33
+ weight_norm(
34
+ nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
35
+ ),
36
+ nn.ELU(),
37
+ weight_norm(
38
+ nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
39
+ ),
40
+ nn.ELU(),
41
+ weight_norm(
42
+ nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
43
+ ),
44
+ nn.ELU(),
45
+ weight_norm(
46
+ nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
47
+ ),
48
+ nn.ELU(),
49
+ )
50
+ self.classifier = nn.Linear(in_features=cond_channels, out_features=self.num_class)
51
+
52
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
53
+ x = self.condnet(x)
54
+ x = x.transpose(1, 2)
55
+ return torch.abs(self.classifier(x).squeeze(-1))