xinference 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (87)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +5 -5
  3. xinference/core/model.py +6 -1
  4. xinference/deploy/cmdline.py +3 -1
  5. xinference/deploy/test/test_cmdline.py +56 -0
  6. xinference/isolation.py +24 -0
  7. xinference/model/audio/core.py +5 -0
  8. xinference/model/audio/f5tts.py +195 -0
  9. xinference/model/audio/fish_speech.py +2 -1
  10. xinference/model/audio/model_spec.json +8 -0
  11. xinference/model/audio/model_spec_modelscope.json +9 -0
  12. xinference/model/embedding/core.py +203 -142
  13. xinference/model/embedding/model_spec.json +7 -0
  14. xinference/model/embedding/model_spec_modelscope.json +8 -0
  15. xinference/model/llm/__init__.py +2 -2
  16. xinference/model/llm/llm_family.json +172 -53
  17. xinference/model/llm/llm_family_modelscope.json +118 -20
  18. xinference/model/llm/mlx/core.py +230 -49
  19. xinference/model/llm/sglang/core.py +1 -0
  20. xinference/model/llm/transformers/chatglm.py +9 -5
  21. xinference/model/llm/transformers/utils.py +16 -8
  22. xinference/model/llm/utils.py +4 -1
  23. xinference/model/llm/vllm/core.py +5 -0
  24. xinference/thirdparty/f5_tts/__init__.py +0 -0
  25. xinference/thirdparty/f5_tts/api.py +166 -0
  26. xinference/thirdparty/f5_tts/configs/E2TTS_Base_train.yaml +44 -0
  27. xinference/thirdparty/f5_tts/configs/E2TTS_Small_train.yaml +44 -0
  28. xinference/thirdparty/f5_tts/configs/F5TTS_Base_train.yaml +46 -0
  29. xinference/thirdparty/f5_tts/configs/F5TTS_Small_train.yaml +46 -0
  30. xinference/thirdparty/f5_tts/eval/README.md +49 -0
  31. xinference/thirdparty/f5_tts/eval/ecapa_tdnn.py +330 -0
  32. xinference/thirdparty/f5_tts/eval/eval_infer_batch.py +207 -0
  33. xinference/thirdparty/f5_tts/eval/eval_infer_batch.sh +13 -0
  34. xinference/thirdparty/f5_tts/eval/eval_librispeech_test_clean.py +84 -0
  35. xinference/thirdparty/f5_tts/eval/eval_seedtts_testset.py +84 -0
  36. xinference/thirdparty/f5_tts/eval/utils_eval.py +405 -0
  37. xinference/thirdparty/f5_tts/infer/README.md +191 -0
  38. xinference/thirdparty/f5_tts/infer/SHARED.md +74 -0
  39. xinference/thirdparty/f5_tts/infer/examples/basic/basic.toml +11 -0
  40. xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_en.wav +0 -0
  41. xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_zh.wav +0 -0
  42. xinference/thirdparty/f5_tts/infer/examples/multi/country.flac +0 -0
  43. xinference/thirdparty/f5_tts/infer/examples/multi/main.flac +0 -0
  44. xinference/thirdparty/f5_tts/infer/examples/multi/story.toml +19 -0
  45. xinference/thirdparty/f5_tts/infer/examples/multi/story.txt +1 -0
  46. xinference/thirdparty/f5_tts/infer/examples/multi/town.flac +0 -0
  47. xinference/thirdparty/f5_tts/infer/examples/vocab.txt +2545 -0
  48. xinference/thirdparty/f5_tts/infer/infer_cli.py +226 -0
  49. xinference/thirdparty/f5_tts/infer/infer_gradio.py +851 -0
  50. xinference/thirdparty/f5_tts/infer/speech_edit.py +193 -0
  51. xinference/thirdparty/f5_tts/infer/utils_infer.py +538 -0
  52. xinference/thirdparty/f5_tts/model/__init__.py +10 -0
  53. xinference/thirdparty/f5_tts/model/backbones/README.md +20 -0
  54. xinference/thirdparty/f5_tts/model/backbones/dit.py +163 -0
  55. xinference/thirdparty/f5_tts/model/backbones/mmdit.py +146 -0
  56. xinference/thirdparty/f5_tts/model/backbones/unett.py +219 -0
  57. xinference/thirdparty/f5_tts/model/cfm.py +285 -0
  58. xinference/thirdparty/f5_tts/model/dataset.py +319 -0
  59. xinference/thirdparty/f5_tts/model/modules.py +658 -0
  60. xinference/thirdparty/f5_tts/model/trainer.py +366 -0
  61. xinference/thirdparty/f5_tts/model/utils.py +185 -0
  62. xinference/thirdparty/f5_tts/scripts/count_max_epoch.py +33 -0
  63. xinference/thirdparty/f5_tts/scripts/count_params_gflops.py +39 -0
  64. xinference/thirdparty/f5_tts/socket_server.py +159 -0
  65. xinference/thirdparty/f5_tts/train/README.md +77 -0
  66. xinference/thirdparty/f5_tts/train/datasets/prepare_csv_wavs.py +139 -0
  67. xinference/thirdparty/f5_tts/train/datasets/prepare_emilia.py +230 -0
  68. xinference/thirdparty/f5_tts/train/datasets/prepare_libritts.py +92 -0
  69. xinference/thirdparty/f5_tts/train/datasets/prepare_ljspeech.py +65 -0
  70. xinference/thirdparty/f5_tts/train/datasets/prepare_wenetspeech4tts.py +125 -0
  71. xinference/thirdparty/f5_tts/train/finetune_cli.py +174 -0
  72. xinference/thirdparty/f5_tts/train/finetune_gradio.py +1846 -0
  73. xinference/thirdparty/f5_tts/train/train.py +75 -0
  74. xinference/web/ui/build/asset-manifest.json +3 -3
  75. xinference/web/ui/build/index.html +1 -1
  76. xinference/web/ui/build/static/js/{main.2f269bb3.js → main.4eb4ee80.js} +3 -3
  77. xinference/web/ui/build/static/js/main.4eb4ee80.js.map +1 -0
  78. xinference/web/ui/node_modules/.cache/babel-loader/8c5eeb02f772d02cbe8b89c05428d0dd41a97866f75f7dc1c2164a67f5a1cf98.json +1 -0
  79. {xinference-1.0.1.dist-info → xinference-1.1.0.dist-info}/METADATA +33 -14
  80. {xinference-1.0.1.dist-info → xinference-1.1.0.dist-info}/RECORD +85 -34
  81. xinference/web/ui/build/static/js/main.2f269bb3.js.map +0 -1
  82. xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +0 -1
  83. /xinference/web/ui/build/static/js/{main.2f269bb3.js.LICENSE.txt → main.4eb4ee80.js.LICENSE.txt} +0 -0
  84. {xinference-1.0.1.dist-info → xinference-1.1.0.dist-info}/LICENSE +0 -0
  85. {xinference-1.0.1.dist-info → xinference-1.1.0.dist-info}/WHEEL +0 -0
  86. {xinference-1.0.1.dist-info → xinference-1.1.0.dist-info}/entry_points.txt +0 -0
  87. {xinference-1.0.1.dist-info → xinference-1.1.0.dist-info}/top_level.txt +0 -0
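The headline addition in this release is F5-TTS text-to-speech support: a new xinference/model/audio/f5tts.py wrapper plus the vendored xinference/thirdparty/f5_tts tree. As a hedged sketch (not taken from the diff itself), the new model might be served through the standard xinference client as below; the model name "F5-TTS" and the prompt_speech/prompt_text keyword names are assumptions inferred from the added files, not confirmed API.

from xinference.client import Client

client = Client("http://localhost:9997")  # a running xinference server

# Launch the audio model added by this release (model name assumed).
model_uid = client.launch_model(model_name="F5-TTS", model_type="audio")
model = client.get_model(model_uid)

# F5-TTS clones a voice from a short reference clip;
# prompt_speech / prompt_text are assumed keyword names.
with open("ref.wav", "rb") as f:
    audio = model.speech(
        "Hello from xinference 1.1.0!",
        prompt_speech=f.read(),
        prompt_text="Transcript of the reference clip.",
    )
with open("out.wav", "wb") as f:
    f.write(audio)

The hunks below show a few of the larger vendored F5-TTS files.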
xinference/thirdparty/f5_tts/eval/ecapa_tdnn.py (new file)
@@ -0,0 +1,330 @@
+ # just for speaker similarity evaluation, third-party code
+
+ # From https://github.com/microsoft/UniSpeech/blob/main/downstreams/speaker_verification/models/
+ # part of the code is borrowed from https://github.com/lawlict/ECAPA-TDNN
+
+ import os
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+
+ """ Res2Conv1d + BatchNorm1d + ReLU
+ """
+
+
+ class Res2Conv1dReluBn(nn.Module):
+     """
+     in_channels == out_channels == channels
+     """
+
+     def __init__(self, channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=True, scale=4):
+         super().__init__()
+         assert channels % scale == 0, "{} % {} != 0".format(channels, scale)
+         self.scale = scale
+         self.width = channels // scale
+         self.nums = scale if scale == 1 else scale - 1
+
+         self.convs = []
+         self.bns = []
+         for i in range(self.nums):
+             self.convs.append(nn.Conv1d(self.width, self.width, kernel_size, stride, padding, dilation, bias=bias))
+             self.bns.append(nn.BatchNorm1d(self.width))
+         self.convs = nn.ModuleList(self.convs)
+         self.bns = nn.ModuleList(self.bns)
+
+     def forward(self, x):
+         out = []
+         spx = torch.split(x, self.width, 1)
+         for i in range(self.nums):
+             if i == 0:
+                 sp = spx[i]
+             else:
+                 sp = sp + spx[i]
+             # Order: conv -> relu -> bn
+             sp = self.convs[i](sp)
+             sp = self.bns[i](F.relu(sp))
+             out.append(sp)
+         if self.scale != 1:
+             out.append(spx[self.nums])
+         out = torch.cat(out, dim=1)
+
+         return out
+
+
+ """ Conv1d + BatchNorm1d + ReLU
+ """
+
+
+ class Conv1dReluBn(nn.Module):
+     def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=True):
+         super().__init__()
+         self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias)
+         self.bn = nn.BatchNorm1d(out_channels)
+
+     def forward(self, x):
+         return self.bn(F.relu(self.conv(x)))
+
+
+ """ The SE connection of 1D case.
+ """
+
+
+ class SE_Connect(nn.Module):
+     def __init__(self, channels, se_bottleneck_dim=128):
+         super().__init__()
+         self.linear1 = nn.Linear(channels, se_bottleneck_dim)
+         self.linear2 = nn.Linear(se_bottleneck_dim, channels)
+
+     def forward(self, x):
+         out = x.mean(dim=2)
+         out = F.relu(self.linear1(out))
+         out = torch.sigmoid(self.linear2(out))
+         out = x * out.unsqueeze(2)
+
+         return out
+
+
+ """ SE-Res2Block of the ECAPA-TDNN architecture.
+ """
+
+ # def SE_Res2Block(channels, kernel_size, stride, padding, dilation, scale):
+ #     return nn.Sequential(
+ #         Conv1dReluBn(channels, 512, kernel_size=1, stride=1, padding=0),
+ #         Res2Conv1dReluBn(512, kernel_size, stride, padding, dilation, scale=scale),
+ #         Conv1dReluBn(512, channels, kernel_size=1, stride=1, padding=0),
+ #         SE_Connect(channels)
+ #     )
+
+
+ class SE_Res2Block(nn.Module):
+     def __init__(self, in_channels, out_channels, kernel_size, stride, padding, dilation, scale, se_bottleneck_dim):
+         super().__init__()
+         self.Conv1dReluBn1 = Conv1dReluBn(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
+         self.Res2Conv1dReluBn = Res2Conv1dReluBn(out_channels, kernel_size, stride, padding, dilation, scale=scale)
+         self.Conv1dReluBn2 = Conv1dReluBn(out_channels, out_channels, kernel_size=1, stride=1, padding=0)
+         self.SE_Connect = SE_Connect(out_channels, se_bottleneck_dim)
+
+         self.shortcut = None
+         if in_channels != out_channels:
+             self.shortcut = nn.Conv1d(
+                 in_channels=in_channels,
+                 out_channels=out_channels,
+                 kernel_size=1,
+             )
+
+     def forward(self, x):
+         residual = x
+         if self.shortcut:
+             residual = self.shortcut(x)
+
+         x = self.Conv1dReluBn1(x)
+         x = self.Res2Conv1dReluBn(x)
+         x = self.Conv1dReluBn2(x)
+         x = self.SE_Connect(x)
+
+         return x + residual
+
+
+ """ Attentive weighted mean and standard deviation pooling.
+ """
+
+
+ class AttentiveStatsPool(nn.Module):
+     def __init__(self, in_dim, attention_channels=128, global_context_att=False):
+         super().__init__()
+         self.global_context_att = global_context_att
+
+         # Use Conv1d with stride == 1 rather than Linear, then we don't need to transpose inputs.
+         if global_context_att:
+             self.linear1 = nn.Conv1d(in_dim * 3, attention_channels, kernel_size=1)  # equals W and b in the paper
+         else:
+             self.linear1 = nn.Conv1d(in_dim, attention_channels, kernel_size=1)  # equals W and b in the paper
+         self.linear2 = nn.Conv1d(attention_channels, in_dim, kernel_size=1)  # equals V and k in the paper
+
+     def forward(self, x):
+         if self.global_context_att:
+             context_mean = torch.mean(x, dim=-1, keepdim=True).expand_as(x)
+             context_std = torch.sqrt(torch.var(x, dim=-1, keepdim=True) + 1e-10).expand_as(x)
+             x_in = torch.cat((x, context_mean, context_std), dim=1)
+         else:
+             x_in = x
+
+         # DON'T use ReLU here! In experiments, I find ReLU hard to converge.
+         alpha = torch.tanh(self.linear1(x_in))
+         # alpha = F.relu(self.linear1(x_in))
+         alpha = torch.softmax(self.linear2(alpha), dim=2)
+         mean = torch.sum(alpha * x, dim=2)
+         residuals = torch.sum(alpha * (x**2), dim=2) - mean**2
+         std = torch.sqrt(residuals.clamp(min=1e-9))
+         return torch.cat([mean, std], dim=1)
+
+
+ class ECAPA_TDNN(nn.Module):
+     def __init__(
+         self,
+         feat_dim=80,
+         channels=512,
+         emb_dim=192,
+         global_context_att=False,
+         feat_type="wavlm_large",
+         sr=16000,
+         feature_selection="hidden_states",
+         update_extract=False,
+         config_path=None,
+     ):
+         super().__init__()
+
+         self.feat_type = feat_type
+         self.feature_selection = feature_selection
+         self.update_extract = update_extract
+         self.sr = sr
+
+         torch.hub._validate_not_a_forked_repo = lambda a, b, c: True
+         try:
+             local_s3prl_path = os.path.expanduser("~/.cache/torch/hub/s3prl_s3prl_main")
+             self.feature_extract = torch.hub.load(local_s3prl_path, feat_type, source="local", config_path=config_path)
+         except:  # noqa: E722
+             self.feature_extract = torch.hub.load("s3prl/s3prl", feat_type)
+
+         if len(self.feature_extract.model.encoder.layers) == 24 and hasattr(
+             self.feature_extract.model.encoder.layers[23].self_attn, "fp32_attention"
+         ):
+             self.feature_extract.model.encoder.layers[23].self_attn.fp32_attention = False
+         if len(self.feature_extract.model.encoder.layers) == 24 and hasattr(
+             self.feature_extract.model.encoder.layers[11].self_attn, "fp32_attention"
+         ):
+             self.feature_extract.model.encoder.layers[11].self_attn.fp32_attention = False
+
+         self.feat_num = self.get_feat_num()
+         self.feature_weight = nn.Parameter(torch.zeros(self.feat_num))
+
+         if feat_type != "fbank" and feat_type != "mfcc":
+             freeze_list = ["final_proj", "label_embs_concat", "mask_emb", "project_q", "quantizer"]
+             for name, param in self.feature_extract.named_parameters():
+                 for freeze_val in freeze_list:
+                     if freeze_val in name:
+                         param.requires_grad = False
+                         break
+
+         if not self.update_extract:
+             for param in self.feature_extract.parameters():
+                 param.requires_grad = False
+
+         self.instance_norm = nn.InstanceNorm1d(feat_dim)
+         # self.channels = [channels] * 4 + [channels * 3]
+         self.channels = [channels] * 4 + [1536]
+
+         self.layer1 = Conv1dReluBn(feat_dim, self.channels[0], kernel_size=5, padding=2)
+         self.layer2 = SE_Res2Block(
+             self.channels[0],
+             self.channels[1],
+             kernel_size=3,
+             stride=1,
+             padding=2,
+             dilation=2,
+             scale=8,
+             se_bottleneck_dim=128,
+         )
+         self.layer3 = SE_Res2Block(
+             self.channels[1],
+             self.channels[2],
+             kernel_size=3,
+             stride=1,
+             padding=3,
+             dilation=3,
+             scale=8,
+             se_bottleneck_dim=128,
+         )
+         self.layer4 = SE_Res2Block(
+             self.channels[2],
+             self.channels[3],
+             kernel_size=3,
+             stride=1,
+             padding=4,
+             dilation=4,
+             scale=8,
+             se_bottleneck_dim=128,
+         )
+
+         # self.conv = nn.Conv1d(self.channels[-1], self.channels[-1], kernel_size=1)
+         cat_channels = channels * 3
+         self.conv = nn.Conv1d(cat_channels, self.channels[-1], kernel_size=1)
+         self.pooling = AttentiveStatsPool(
+             self.channels[-1], attention_channels=128, global_context_att=global_context_att
+         )
+         self.bn = nn.BatchNorm1d(self.channels[-1] * 2)
+         self.linear = nn.Linear(self.channels[-1] * 2, emb_dim)
+
+     def get_feat_num(self):
+         self.feature_extract.eval()
+         wav = [torch.randn(self.sr).to(next(self.feature_extract.parameters()).device)]
+         with torch.no_grad():
+             features = self.feature_extract(wav)
+         select_feature = features[self.feature_selection]
+         if isinstance(select_feature, (list, tuple)):
+             return len(select_feature)
+         else:
+             return 1
+
+     def get_feat(self, x):
+         if self.update_extract:
+             x = self.feature_extract([sample for sample in x])
+         else:
+             with torch.no_grad():
+                 if self.feat_type == "fbank" or self.feat_type == "mfcc":
+                     x = self.feature_extract(x) + 1e-6  # B x feat_dim x time_len
+                 else:
+                     x = self.feature_extract([sample for sample in x])
+
+         if self.feat_type == "fbank":
+             x = x.log()
+
+         if self.feat_type != "fbank" and self.feat_type != "mfcc":
+             x = x[self.feature_selection]
+             if isinstance(x, (list, tuple)):
+                 x = torch.stack(x, dim=0)
+             else:
+                 x = x.unsqueeze(0)
+             norm_weights = F.softmax(self.feature_weight, dim=-1).unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)
+             x = (norm_weights * x).sum(dim=0)
+             x = torch.transpose(x, 1, 2) + 1e-6
+
+         x = self.instance_norm(x)
+         return x
+
+     def forward(self, x):
+         x = self.get_feat(x)
+
+         out1 = self.layer1(x)
+         out2 = self.layer2(out1)
+         out3 = self.layer3(out2)
+         out4 = self.layer4(out3)
+
+         out = torch.cat([out2, out3, out4], dim=1)
+         out = F.relu(self.conv(out))
+         out = self.bn(self.pooling(out))
+         out = self.linear(out)
+
+         return out
+
+
+ def ECAPA_TDNN_SMALL(
+     feat_dim,
+     emb_dim=256,
+     feat_type="wavlm_large",
+     sr=16000,
+     feature_selection="hidden_states",
+     update_extract=False,
+     config_path=None,
+ ):
+     return ECAPA_TDNN(
+         feat_dim=feat_dim,
+         channels=512,
+         emb_dim=emb_dim,
+         feat_type=feat_type,
+         sr=sr,
+         feature_selection=feature_selection,
+         update_extract=update_extract,
+         config_path=config_path,
+     )
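The encoder above is used for speaker-similarity (SIM) scoring. Below is a minimal hedged sketch of exercising it, assuming the vendored package is importable as f5_tts; feat_dim=1024 matches the WavLM-large hidden size, the waveforms are random stand-ins, and the first call fetches s3prl/WavLM weights via torch.hub (the evaluation pipeline additionally loads a fine-tuned WavLM checkpoint, omitted here).

import torch
import torch.nn.functional as F

from f5_tts.eval.ecapa_tdnn import ECAPA_TDNN_SMALL

model = ECAPA_TDNN_SMALL(feat_dim=1024, emb_dim=256, feat_type="wavlm_large")
model.eval()

# Two 1-second, 16 kHz waveforms; random stand-ins for real utterances.
wav_a = torch.randn(1, 16000)
wav_b = torch.randn(1, 16000)
with torch.no_grad():
    emb_a = model(wav_a)  # (1, 256) speaker embedding
    emb_b = model(wav_b)
print(f"speaker similarity: {F.cosine_similarity(emb_a, emb_b).item():.3f}")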
xinference/thirdparty/f5_tts/eval/eval_infer_batch.py (new file)
@@ -0,0 +1,207 @@
+ import os
+ import sys
+
+ sys.path.append(os.getcwd())
+
+ import argparse
+ import time
+ from importlib.resources import files
+
+ import torch
+ import torchaudio
+ from accelerate import Accelerator
+ from tqdm import tqdm
+
+ from f5_tts.eval.utils_eval import (
+     get_inference_prompt,
+     get_librispeech_test_clean_metainfo,
+     get_seedtts_testset_metainfo,
+ )
+ from f5_tts.infer.utils_infer import load_checkpoint, load_vocoder
+ from f5_tts.model import CFM, DiT, UNetT
+ from f5_tts.model.utils import get_tokenizer
+
+ accelerator = Accelerator()
+ device = f"cuda:{accelerator.process_index}"
+
+
+ # --------------------- Dataset Settings -------------------- #
+
+ target_sample_rate = 24000
+ n_mel_channels = 100
+ hop_length = 256
+ win_length = 1024
+ n_fft = 1024
+ target_rms = 0.1
+
+ rel_path = str(files("f5_tts").joinpath("../../"))
+
+
+ def main():
+     # ---------------------- infer setting ---------------------- #
+
+     parser = argparse.ArgumentParser(description="batch inference")
+
+     parser.add_argument("-s", "--seed", default=None, type=int)
+     parser.add_argument("-d", "--dataset", default="Emilia_ZH_EN")
+     parser.add_argument("-n", "--expname", required=True)
+     parser.add_argument("-c", "--ckptstep", default=1200000, type=int)
+     parser.add_argument("-m", "--mel_spec_type", default="vocos", type=str, choices=["bigvgan", "vocos"])
+     parser.add_argument("-to", "--tokenizer", default="pinyin", type=str, choices=["pinyin", "char"])
+
+     parser.add_argument("-nfe", "--nfestep", default=32, type=int)
+     parser.add_argument("-o", "--odemethod", default="euler")
+     parser.add_argument("-ss", "--swaysampling", default=-1, type=float)
+
+     parser.add_argument("-t", "--testset", required=True)
+
+     args = parser.parse_args()
+
+     seed = args.seed
+     dataset_name = args.dataset
+     exp_name = args.expname
+     ckpt_step = args.ckptstep
+     ckpt_path = rel_path + f"/ckpts/{exp_name}/model_{ckpt_step}.pt"
+     mel_spec_type = args.mel_spec_type
+     tokenizer = args.tokenizer
+
+     nfe_step = args.nfestep
+     ode_method = args.odemethod
+     sway_sampling_coef = args.swaysampling
+
+     testset = args.testset
+
+     infer_batch_size = 1  # max frames. 1 for ddp single inference (recommended)
+     cfg_strength = 2.0
+     speed = 1.0
+     use_truth_duration = False
+     no_ref_audio = False
+
+     if exp_name == "F5TTS_Base":
+         model_cls = DiT
+         model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
+
+     elif exp_name == "E2TTS_Base":
+         model_cls = UNetT
+         model_cfg = dict(dim=1024, depth=24, heads=16, ff_mult=4)
+
+     if testset == "ls_pc_test_clean":
+         metalst = rel_path + "/data/librispeech_pc_test_clean_cross_sentence.lst"
+         librispeech_test_clean_path = "<SOME_PATH>/LibriSpeech/test-clean"  # test-clean path
+         metainfo = get_librispeech_test_clean_metainfo(metalst, librispeech_test_clean_path)
+
+     elif testset == "seedtts_test_zh":
+         metalst = rel_path + "/data/seedtts_testset/zh/meta.lst"
+         metainfo = get_seedtts_testset_metainfo(metalst)
+
+     elif testset == "seedtts_test_en":
+         metalst = rel_path + "/data/seedtts_testset/en/meta.lst"
+         metainfo = get_seedtts_testset_metainfo(metalst)
+
+     # path to save generated wavs
+     output_dir = (
+         f"{rel_path}/"
+         f"results/{exp_name}_{ckpt_step}/{testset}/"
+         f"seed{seed}_{ode_method}_nfe{nfe_step}_{mel_spec_type}"
+         f"{f'_ss{sway_sampling_coef}' if sway_sampling_coef else ''}"
+         f"_cfg{cfg_strength}_speed{speed}"
+         f"{'_gt-dur' if use_truth_duration else ''}"
+         f"{'_no-ref-audio' if no_ref_audio else ''}"
+     )
+
+     # -------------------------------------------------#
+
+     use_ema = True
+
+     prompts_all = get_inference_prompt(
+         metainfo,
+         speed=speed,
+         tokenizer=tokenizer,
+         target_sample_rate=target_sample_rate,
+         n_mel_channels=n_mel_channels,
+         hop_length=hop_length,
+         mel_spec_type=mel_spec_type,
+         target_rms=target_rms,
+         use_truth_duration=use_truth_duration,
+         infer_batch_size=infer_batch_size,
+     )
+
+     # Vocoder model
+     local = False
+     if mel_spec_type == "vocos":
+         vocoder_local_path = "../checkpoints/charactr/vocos-mel-24khz"
+     elif mel_spec_type == "bigvgan":
+         vocoder_local_path = "../checkpoints/bigvgan_v2_24khz_100band_256x"
+     vocoder = load_vocoder(vocoder_name=mel_spec_type, is_local=local, local_path=vocoder_local_path)
+
+     # Tokenizer
+     vocab_char_map, vocab_size = get_tokenizer(dataset_name, tokenizer)
+
+     # Model
+     model = CFM(
+         transformer=model_cls(**model_cfg, text_num_embeds=vocab_size, mel_dim=n_mel_channels),
+         mel_spec_kwargs=dict(
+             n_fft=n_fft,
+             hop_length=hop_length,
+             win_length=win_length,
+             n_mel_channels=n_mel_channels,
+             target_sample_rate=target_sample_rate,
+             mel_spec_type=mel_spec_type,
+         ),
+         odeint_kwargs=dict(
+             method=ode_method,
+         ),
+         vocab_char_map=vocab_char_map,
+     ).to(device)
+
+     dtype = torch.float32 if mel_spec_type == "bigvgan" else None
+     model = load_checkpoint(model, ckpt_path, device, dtype=dtype, use_ema=use_ema)
+
+     if not os.path.exists(output_dir) and accelerator.is_main_process:
+         os.makedirs(output_dir)
+
+     # start batch inference
+     accelerator.wait_for_everyone()
+     start = time.time()
+
+     with accelerator.split_between_processes(prompts_all) as prompts:
+         for prompt in tqdm(prompts, disable=not accelerator.is_local_main_process):
+             utts, ref_rms_list, ref_mels, ref_mel_lens, total_mel_lens, final_text_list = prompt
+             ref_mels = ref_mels.to(device)
+             ref_mel_lens = torch.tensor(ref_mel_lens, dtype=torch.long).to(device)
+             total_mel_lens = torch.tensor(total_mel_lens, dtype=torch.long).to(device)
+
+             # Inference
+             with torch.inference_mode():
+                 generated, _ = model.sample(
+                     cond=ref_mels,
+                     text=final_text_list,
+                     duration=total_mel_lens,
+                     lens=ref_mel_lens,
+                     steps=nfe_step,
+                     cfg_strength=cfg_strength,
+                     sway_sampling_coef=sway_sampling_coef,
+                     no_ref_audio=no_ref_audio,
+                     seed=seed,
+                 )
+                 # Final result
+                 for i, gen in enumerate(generated):
+                     gen = gen[ref_mel_lens[i] : total_mel_lens[i], :].unsqueeze(0)
+                     gen_mel_spec = gen.permute(0, 2, 1).to(torch.float32)
+                     if mel_spec_type == "vocos":
+                         generated_wave = vocoder.decode(gen_mel_spec).cpu()
+                     elif mel_spec_type == "bigvgan":
+                         generated_wave = vocoder(gen_mel_spec).squeeze(0).cpu()
+
+                     if ref_rms_list[i] < target_rms:
+                         generated_wave = generated_wave * ref_rms_list[i] / target_rms
+                     torchaudio.save(f"{output_dir}/{utts[i]}.wav", generated_wave, target_sample_rate)
+
+     accelerator.wait_for_everyone()
+     if accelerator.is_main_process:
+         timediff = time.time() - start
+         print(f"Done batch inference in {timediff / 60 :.2f} minutes.")
+
+
+ if __name__ == "__main__":
+     main()
xinference/thirdparty/f5_tts/eval/eval_infer_batch.sh (new file)
@@ -0,0 +1,13 @@
+ #!/bin/bash
+
+ # e.g. F5-TTS, 16 NFE
+ accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "F5TTS_Base" -t "seedtts_test_zh" -nfe 16
+ accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "F5TTS_Base" -t "seedtts_test_en" -nfe 16
+ accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "F5TTS_Base" -t "ls_pc_test_clean" -nfe 16
+
+ # e.g. Vanilla E2 TTS, 32 NFE
+ accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "E2TTS_Base" -t "seedtts_test_zh" -o "midpoint" -ss 0
+ accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "E2TTS_Base" -t "seedtts_test_en" -o "midpoint" -ss 0
+ accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "E2TTS_Base" -t "ls_pc_test_clean" -o "midpoint" -ss 0
+
+ # etc.
xinference/thirdparty/f5_tts/eval/eval_librispeech_test_clean.py (new file)
@@ -0,0 +1,84 @@
+ # Evaluate with Librispeech test-clean, ~3s prompt to generate 4-10s audio (the way of valle/voicebox evaluation)
+
+ import sys
+ import os
+ import argparse
+
+ sys.path.append(os.getcwd())
+
+ import multiprocessing as mp
+ from importlib.resources import files
+
+ import numpy as np
+
+ from f5_tts.eval.utils_eval import (
+     get_librispeech_test,
+     run_asr_wer,
+     run_sim,
+ )
+
+ rel_path = str(files("f5_tts").joinpath("../../"))
+
+
+ def get_args():
+     parser = argparse.ArgumentParser()
+     parser.add_argument("-e", "--eval_task", type=str, default="wer", choices=["sim", "wer"])
+     parser.add_argument("-l", "--lang", type=str, default="en")
+     parser.add_argument("-g", "--gen_wav_dir", type=str, required=True)
+     parser.add_argument("-p", "--librispeech_test_clean_path", type=str, required=True)
+     parser.add_argument("-n", "--gpu_nums", type=int, default=8, help="Number of GPUs to use")
+     parser.add_argument("--local", action="store_true", help="Use local custom checkpoint directory")
+     return parser.parse_args()
+
+
+ def main():
+     args = get_args()
+     eval_task = args.eval_task
+     lang = args.lang
+     librispeech_test_clean_path = args.librispeech_test_clean_path  # test-clean path
+     gen_wav_dir = args.gen_wav_dir
+     metalst = rel_path + "/data/librispeech_pc_test_clean_cross_sentence.lst"
+
+     gpus = list(range(args.gpu_nums))
+     test_set = get_librispeech_test(metalst, gen_wav_dir, gpus, librispeech_test_clean_path)
+
+     ## In LibriSpeech, some speakers utilized varying voice characteristics for different characters in the book,
+     ## leading to a low similarity for the ground truth in some cases.
+     # test_set = get_librispeech_test(metalst, gen_wav_dir, gpus, librispeech_test_clean_path, eval_ground_truth = True)  # eval ground truth
+
+     local = args.local
+     if local:  # use local custom checkpoint dir
+         asr_ckpt_dir = "../checkpoints/Systran/faster-whisper-large-v3"
+     else:
+         asr_ckpt_dir = ""  # auto download to cache dir
+     wavlm_ckpt_dir = "../checkpoints/UniSpeech/wavlm_large_finetune.pth"
+
+     # --------------------------- WER ---------------------------
+     if eval_task == "wer":
+         wers = []
+         with mp.Pool(processes=len(gpus)) as pool:
+             args = [(rank, lang, sub_test_set, asr_ckpt_dir) for (rank, sub_test_set) in test_set]
+             results = pool.map(run_asr_wer, args)
+             for wers_ in results:
+                 wers.extend(wers_)
+
+         wer = round(np.mean(wers) * 100, 3)
+         print(f"\nTotal {len(wers)} samples")
+         print(f"WER : {wer}%")
+
+     # --------------------------- SIM ---------------------------
+     if eval_task == "sim":
+         sim_list = []
+         with mp.Pool(processes=len(gpus)) as pool:
+             args = [(rank, sub_test_set, wavlm_ckpt_dir) for (rank, sub_test_set) in test_set]
+             results = pool.map(run_sim, args)
+             for sim_ in results:
+                 sim_list.extend(sim_)
+
+         sim = round(sum(sim_list) / len(sim_list), 3)
+         print(f"\nTotal {len(sim_list)} samples")
+         print(f"SIM : {sim}")
+
+
+ if __name__ == "__main__":
+     main()
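Finally, a hedged sketch of driving the evaluation script above from Python: the flag names mirror get_args(), and both paths are placeholders in the spirit of the script's own <SOME_PATH> convention.

import subprocess

subprocess.run(
    [
        "python", "src/f5_tts/eval/eval_librispeech_test_clean.py",
        "--eval_task", "wer",  # or "sim" for speaker similarity
        "--gen_wav_dir", "<RESULTS_DIR>/F5TTS_Base_1200000/ls_pc_test_clean/<RUN_NAME>",
        "--librispeech_test_clean_path", "<SOME_PATH>/LibriSpeech/test-clean",
        "--gpu_nums", "4",
    ],
    check=True,
)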