xinference 1.5.1__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (96)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +97 -8
  3. xinference/client/restful/restful_client.py +51 -11
  4. xinference/core/media_interface.py +758 -0
  5. xinference/core/model.py +49 -9
  6. xinference/core/worker.py +31 -37
  7. xinference/deploy/utils.py +0 -3
  8. xinference/model/audio/__init__.py +16 -27
  9. xinference/model/audio/core.py +1 -0
  10. xinference/model/audio/cosyvoice.py +4 -2
  11. xinference/model/audio/model_spec.json +20 -3
  12. xinference/model/audio/model_spec_modelscope.json +18 -1
  13. xinference/model/embedding/__init__.py +16 -24
  14. xinference/model/image/__init__.py +15 -25
  15. xinference/model/llm/__init__.py +37 -110
  16. xinference/model/llm/core.py +15 -6
  17. xinference/model/llm/llama_cpp/core.py +25 -353
  18. xinference/model/llm/llm_family.json +613 -89
  19. xinference/model/llm/llm_family.py +9 -1
  20. xinference/model/llm/llm_family_modelscope.json +540 -90
  21. xinference/model/llm/mlx/core.py +6 -3
  22. xinference/model/llm/reasoning_parser.py +281 -5
  23. xinference/model/llm/sglang/core.py +16 -3
  24. xinference/model/llm/transformers/chatglm.py +2 -2
  25. xinference/model/llm/transformers/cogagent.py +1 -1
  26. xinference/model/llm/transformers/cogvlm2.py +1 -1
  27. xinference/model/llm/transformers/core.py +9 -3
  28. xinference/model/llm/transformers/glm4v.py +1 -1
  29. xinference/model/llm/transformers/minicpmv26.py +1 -1
  30. xinference/model/llm/transformers/qwen-omni.py +6 -0
  31. xinference/model/llm/transformers/qwen_vl.py +1 -1
  32. xinference/model/llm/utils.py +68 -45
  33. xinference/model/llm/vllm/core.py +38 -18
  34. xinference/model/llm/vllm/xavier/test/test_xavier.py +1 -10
  35. xinference/model/rerank/__init__.py +13 -24
  36. xinference/model/video/__init__.py +15 -25
  37. xinference/model/video/core.py +3 -3
  38. xinference/model/video/diffusers.py +133 -16
  39. xinference/model/video/model_spec.json +54 -0
  40. xinference/model/video/model_spec_modelscope.json +56 -0
  41. xinference/thirdparty/cosyvoice/bin/average_model.py +5 -4
  42. xinference/thirdparty/cosyvoice/bin/export_jit.py +50 -20
  43. xinference/thirdparty/cosyvoice/bin/export_onnx.py +136 -51
  44. xinference/thirdparty/cosyvoice/bin/inference.py +15 -5
  45. xinference/thirdparty/cosyvoice/bin/train.py +7 -2
  46. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +72 -52
  47. xinference/thirdparty/cosyvoice/cli/frontend.py +58 -58
  48. xinference/thirdparty/cosyvoice/cli/model.py +140 -155
  49. xinference/thirdparty/cosyvoice/dataset/processor.py +9 -5
  50. xinference/thirdparty/cosyvoice/flow/decoder.py +656 -54
  51. xinference/thirdparty/cosyvoice/flow/flow.py +69 -11
  52. xinference/thirdparty/cosyvoice/flow/flow_matching.py +167 -63
  53. xinference/thirdparty/cosyvoice/flow/length_regulator.py +1 -0
  54. xinference/thirdparty/cosyvoice/hifigan/discriminator.py +91 -1
  55. xinference/thirdparty/cosyvoice/hifigan/f0_predictor.py +4 -1
  56. xinference/thirdparty/cosyvoice/hifigan/generator.py +4 -1
  57. xinference/thirdparty/cosyvoice/hifigan/hifigan.py +2 -2
  58. xinference/thirdparty/cosyvoice/llm/llm.py +198 -18
  59. xinference/thirdparty/cosyvoice/transformer/embedding.py +12 -4
  60. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +124 -21
  61. xinference/thirdparty/cosyvoice/utils/class_utils.py +13 -0
  62. xinference/thirdparty/cosyvoice/utils/common.py +1 -1
  63. xinference/thirdparty/cosyvoice/utils/file_utils.py +40 -2
  64. xinference/thirdparty/cosyvoice/utils/frontend_utils.py +7 -0
  65. xinference/thirdparty/cosyvoice/utils/mask.py +4 -0
  66. xinference/thirdparty/cosyvoice/utils/train_utils.py +5 -1
  67. xinference/thirdparty/matcha/hifigan/xutils.py +3 -3
  68. xinference/types.py +0 -71
  69. xinference/web/ui/build/asset-manifest.json +3 -3
  70. xinference/web/ui/build/index.html +1 -1
  71. xinference/web/ui/build/static/js/main.ae579a97.js +3 -0
  72. xinference/web/ui/build/static/js/main.ae579a97.js.map +1 -0
  73. xinference/web/ui/node_modules/.cache/babel-loader/0196a4b09e3264614e54360d5f832c46b31d964ec58296765ebff191ace6adbf.json +1 -0
  74. xinference/web/ui/node_modules/.cache/babel-loader/12e02ee790dbf57ead09a241a93bb5f893393aa36628ca741d44390e836a103f.json +1 -0
  75. xinference/web/ui/node_modules/.cache/babel-loader/18fa271456b31cded36c05c4c71c6b2b1cf4e4128c1e32f0e45d8b9f21764397.json +1 -0
  76. xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +1 -0
  77. xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +1 -0
  78. xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +1 -0
  79. xinference/web/ui/src/locales/en.json +6 -4
  80. xinference/web/ui/src/locales/zh.json +6 -4
  81. {xinference-1.5.1.dist-info → xinference-1.6.0.dist-info}/METADATA +56 -36
  82. {xinference-1.5.1.dist-info → xinference-1.6.0.dist-info}/RECORD +87 -87
  83. {xinference-1.5.1.dist-info → xinference-1.6.0.dist-info}/WHEEL +1 -1
  84. xinference/core/image_interface.py +0 -377
  85. xinference/thirdparty/cosyvoice/bin/export_trt.sh +0 -9
  86. xinference/web/ui/build/static/js/main.91e77b5c.js +0 -3
  87. xinference/web/ui/build/static/js/main.91e77b5c.js.map +0 -1
  88. xinference/web/ui/node_modules/.cache/babel-loader/0f0adb2283a8f469d097a7a0ebb754624fa52414c83b83696c41f2e6a737ceda.json +0 -1
  89. xinference/web/ui/node_modules/.cache/babel-loader/5e6edb0fb87e3798f142e9abf8dd2dc46bab33a60d31dff525797c0c99887097.json +0 -1
  90. xinference/web/ui/node_modules/.cache/babel-loader/6087820be1bd5c02c42dff797e7df365448ef35ab26dd5d6bd33e967e05cbfd4.json +0 -1
  91. xinference/web/ui/node_modules/.cache/babel-loader/8157db83995c671eb57abc316c337f867d1dc63fb83520bb4ff351fee57dcce2.json +0 -1
  92. xinference/web/ui/node_modules/.cache/babel-loader/f04f666b77b44d7be3e16034d6b0074de2ba9c254f1fae15222b3148608fa8b3.json +0 -1
  93. /xinference/web/ui/build/static/js/{main.91e77b5c.js.LICENSE.txt → main.ae579a97.js.LICENSE.txt} +0 -0
  94. {xinference-1.5.1.dist-info → xinference-1.6.0.dist-info}/entry_points.txt +0 -0
  95. {xinference-1.5.1.dist-info → xinference-1.6.0.dist-info}/licenses/LICENSE +0 -0
  96. {xinference-1.5.1.dist-info → xinference-1.6.0.dist-info}/top_level.txt +0 -0
xinference/thirdparty/cosyvoice/cli/model.py

@@ -11,6 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import os
+from typing import Generator
 import torch
 import numpy as np
 import threading
@@ -19,6 +21,7 @@ from torch.nn import functional as F
 from contextlib import nullcontext
 import uuid
 from cosyvoice.utils.common import fade_in_out
+from cosyvoice.utils.file_utils import convert_onnx_to_trt


 class CosyVoiceModel:
@@ -27,17 +30,18 @@ class CosyVoiceModel:
                  llm: torch.nn.Module,
                  flow: torch.nn.Module,
                  hift: torch.nn.Module,
-                 fp16: bool):
+                 fp16: bool = False):
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         self.llm = llm
         self.flow = flow
         self.hift = hift
         self.fp16 = fp16
+        if self.fp16 is True:
+            self.llm.half()
+            self.flow.half()
         self.token_min_hop_len = 2 * self.flow.input_frame_rate
         self.token_max_hop_len = 4 * self.flow.input_frame_rate
         self.token_overlap_len = 20
-        # here we fix set flow.decoder.estimator.static_chunk_size = 0 for compatibability
-        self.flow.decoder.estimator.static_chunk_size = 0
         # mel fade in out
         self.mel_overlap_len = int(self.token_overlap_len / self.flow.input_frame_rate * 22050 / 256)
         self.mel_window = np.hamming(2 * self.mel_overlap_len)
@@ -61,8 +65,6 @@ class CosyVoiceModel:
     def load(self, llm_model, flow_model, hift_model):
         self.llm.load_state_dict(torch.load(llm_model, map_location=self.device), strict=True)
         self.llm.to(self.device).eval()
-        if self.fp16 is True:
-            self.llm.half()
         self.flow.load_state_dict(torch.load(flow_model, map_location=self.device), strict=True)
         self.flow.to(self.device).eval()
         # in case hift_model is a hifigan model
@@ -71,7 +73,6 @@ class CosyVoiceModel:
         self.hift.to(self.device).eval()

     def load_jit(self, llm_text_encoder_model, llm_llm_model, flow_encoder_model):
-        assert self.fp16 is True, "we only provide fp16 jit model, set fp16=True if you want to use jit model"
         llm_text_encoder = torch.jit.load(llm_text_encoder_model, map_location=self.device)
         self.llm.text_encoder = llm_text_encoder
         llm_llm = torch.jit.load(llm_llm_model, map_location=self.device)
@@ -79,39 +80,62 @@ class CosyVoiceModel:
         flow_encoder = torch.jit.load(flow_encoder_model, map_location=self.device)
         self.flow.encoder = flow_encoder

-    def load_onnx(self, flow_decoder_estimator_model):
-        import onnxruntime
-        option = onnxruntime.SessionOptions()
-        option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
-        option.intra_op_num_threads = 1
-        providers = ['CUDAExecutionProvider' if torch.cuda.is_available() else 'CPUExecutionProvider']
+    def load_trt(self, flow_decoder_estimator_model, flow_decoder_onnx_model, fp16):
+        assert torch.cuda.is_available(), 'tensorrt only supports gpu!'
+        if not os.path.exists(flow_decoder_estimator_model):
+            convert_onnx_to_trt(flow_decoder_estimator_model, self.get_trt_kwargs(), flow_decoder_onnx_model, fp16)
+        if os.path.getsize(flow_decoder_estimator_model) == 0:
+            raise ValueError('{} is empty file, delete it and export again!'.format(flow_decoder_estimator_model))
         del self.flow.decoder.estimator
-        self.flow.decoder.estimator = onnxruntime.InferenceSession(flow_decoder_estimator_model, sess_options=option, providers=providers)
+        import tensorrt as trt
+        with open(flow_decoder_estimator_model, 'rb') as f:
+            self.flow.decoder.estimator_engine = trt.Runtime(trt.Logger(trt.Logger.INFO)).deserialize_cuda_engine(f.read())
+        assert self.flow.decoder.estimator_engine is not None, 'failed to load trt {}'.format(flow_decoder_estimator_model)
+        self.flow.decoder.estimator = self.flow.decoder.estimator_engine.create_execution_context()
+
+    def get_trt_kwargs(self):
+        min_shape = [(2, 80, 4), (2, 1, 4), (2, 80, 4), (2, 80, 4)]
+        opt_shape = [(2, 80, 200), (2, 1, 200), (2, 80, 200), (2, 80, 200)]
+        max_shape = [(2, 80, 3000), (2, 1, 3000), (2, 80, 3000), (2, 80, 3000)]
+        input_names = ["x", "mask", "mu", "cond"]
+        return {'min_shape': min_shape, 'opt_shape': opt_shape, 'max_shape': max_shape, 'input_names': input_names}

     def llm_job(self, text, prompt_text, llm_prompt_speech_token, llm_embedding, uuid):
-        if self.fp16 is True:
-            llm_embedding = llm_embedding.half()
-        with self.llm_context:
-            for i in self.llm.inference(text=text.to(self.device),
-                                        text_len=torch.tensor([text.shape[1]], dtype=torch.int32).to(self.device),
-                                        prompt_text=prompt_text.to(self.device),
-                                        prompt_text_len=torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(self.device),
-                                        prompt_speech_token=llm_prompt_speech_token.to(self.device),
-                                        prompt_speech_token_len=torch.tensor([llm_prompt_speech_token.shape[1]], dtype=torch.int32).to(self.device),
-                                        embedding=llm_embedding.to(self.device)):
-                self.tts_speech_token_dict[uuid].append(i)
+        with self.llm_context, torch.cuda.amp.autocast(self.fp16):
+            if isinstance(text, Generator):
+                assert isinstance(self, CosyVoice2Model), 'streaming input text is only implemented for CosyVoice2!'
+                for i in self.llm.inference_bistream(text=text,
+                                                     prompt_text=prompt_text.to(self.device),
+                                                     prompt_text_len=torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(self.device),
+                                                     prompt_speech_token=llm_prompt_speech_token.to(self.device),
+                                                     prompt_speech_token_len=torch.tensor([llm_prompt_speech_token.shape[1]], dtype=torch.int32).to(self.device),
+                                                     embedding=llm_embedding.to(self.device)):
+                    self.tts_speech_token_dict[uuid].append(i)
+            else:
+                for i in self.llm.inference(text=text.to(self.device),
+                                            text_len=torch.tensor([text.shape[1]], dtype=torch.int32).to(self.device),
+                                            prompt_text=prompt_text.to(self.device),
+                                            prompt_text_len=torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(self.device),
+                                            prompt_speech_token=llm_prompt_speech_token.to(self.device),
+                                            prompt_speech_token_len=torch.tensor([llm_prompt_speech_token.shape[1]], dtype=torch.int32).to(self.device),
+                                            embedding=llm_embedding.to(self.device)):
+                    self.tts_speech_token_dict[uuid].append(i)
+        self.llm_end_dict[uuid] = True
+
+    def vc_job(self, source_speech_token, uuid):
+        self.tts_speech_token_dict[uuid] = source_speech_token.flatten().tolist()
         self.llm_end_dict[uuid] = True

     def token2wav(self, token, prompt_token, prompt_feat, embedding, uuid, finalize=False, speed=1.0):
-        tts_mel, flow_cache = self.flow.inference(token=token.to(self.device),
-                                                  token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device),
-                                                  prompt_token=prompt_token.to(self.device),
-                                                  prompt_token_len=torch.tensor([prompt_token.shape[1]], dtype=torch.int32).to(self.device),
-                                                  prompt_feat=prompt_feat.to(self.device),
-                                                  prompt_feat_len=torch.tensor([prompt_feat.shape[1]], dtype=torch.int32).to(self.device),
-                                                  embedding=embedding.to(self.device),
-                                                  flow_cache=self.flow_cache_dict[uuid])
-        self.flow_cache_dict[uuid] = flow_cache
+        with torch.cuda.amp.autocast(self.fp16):
+            tts_mel, self.flow_cache_dict[uuid] = self.flow.inference(token=token.to(self.device),
+                                                                      token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device),
+                                                                      prompt_token=prompt_token.to(self.device),
+                                                                      prompt_token_len=torch.tensor([prompt_token.shape[1]], dtype=torch.int32).to(self.device),
+                                                                      prompt_feat=prompt_feat.to(self.device),
+                                                                      prompt_feat_len=torch.tensor([prompt_feat.shape[1]], dtype=torch.int32).to(self.device),
+                                                                      embedding=embedding.to(self.device),
+                                                                      flow_cache=self.flow_cache_dict[uuid])

         # mel overlap fade in out
         if self.mel_overlap_dict[uuid].shape[2] != 0:
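The ONNX Runtime path (load_onnx) is replaced by load_trt, which exports the engine on demand via convert_onnx_to_trt and then deserializes it; get_trt_kwargs supplies the dynamic-shape profile (one min/opt/max triple per input). As a rough, hedged sketch of what a converter has to do with those kwargs (build_trt_engine is a hypothetical name; the real helper in cosyvoice/utils/file_utils.py may take its arguments in a different order):

    import tensorrt as trt

    def build_trt_engine(onnx_path, engine_path, trt_kwargs, fp16):
        # hypothetical sketch, not the shipped convert_onnx_to_trt
        logger = trt.Logger(trt.Logger.INFO)
        builder = trt.Builder(logger)
        network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        parser = trt.OnnxParser(network, logger)
        with open(onnx_path, 'rb') as f:
            assert parser.parse(f.read()), 'failed to parse onnx'
        config = builder.create_builder_config()
        if fp16:
            config.set_flag(trt.BuilderFlag.FP16)
        # one (min, opt, max) shape triple per dynamic input: "x", "mask", "mu", "cond"
        profile = builder.create_optimization_profile()
        for name, mins, opts, maxs in zip(trt_kwargs['input_names'], trt_kwargs['min_shape'],
                                          trt_kwargs['opt_shape'], trt_kwargs['max_shape']):
            profile.set_shape(name, mins, opts, maxs)
        config.add_optimization_profile(profile)
        with open(engine_path, 'wb') as f:
            f.write(builder.build_serialized_network(network, config))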
@@ -142,11 +166,11 @@ class CosyVoiceModel:
         tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window)
         return tts_speech

-    def tts(self, text, flow_embedding, llm_embedding=torch.zeros(0, 192),
+    def tts(self, text=torch.zeros(1, 0, dtype=torch.int32), flow_embedding=torch.zeros(0, 192), llm_embedding=torch.zeros(0, 192),
             prompt_text=torch.zeros(1, 0, dtype=torch.int32),
             llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32),
             flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32),
-            prompt_speech_feat=torch.zeros(1, 0, 80), stream=False, speed=1.0, **kwargs):
+            prompt_speech_feat=torch.zeros(1, 0, 80), source_speech_token=torch.zeros(1, 0, dtype=torch.int32), stream=False, speed=1.0, **kwargs):
         # this_uuid is used to track variables related to this inference thread
         this_uuid = str(uuid.uuid1())
         with self.lock:
@@ -154,7 +178,10 @@
             self.hift_cache_dict[this_uuid] = None
             self.mel_overlap_dict[this_uuid] = torch.zeros(1, 80, 0)
             self.flow_cache_dict[this_uuid] = torch.zeros(1, 80, 0, 2)
-        p = threading.Thread(target=self.llm_job, args=(text, prompt_text, llm_prompt_speech_token, llm_embedding, this_uuid))
+        if source_speech_token.shape[1] == 0:
+            p = threading.Thread(target=self.llm_job, args=(text, prompt_text, llm_prompt_speech_token, llm_embedding, this_uuid))
+        else:
+            p = threading.Thread(target=self.vc_job, args=(source_speech_token, this_uuid))
         p.start()
         if stream is True:
             token_hop_len = self.token_min_hop_len
@@ -204,143 +231,89 @@ class CosyVoiceModel:
             self.mel_overlap_dict.pop(this_uuid)
             self.hift_cache_dict.pop(this_uuid)
             self.flow_cache_dict.pop(this_uuid)
+        torch.cuda.empty_cache()

-    def vc(self, source_speech_token, flow_prompt_speech_token, prompt_speech_feat, flow_embedding, stream=False, speed=1.0, **kwargs):
-        # this_uuid is used to track variables related to this inference thread
-        this_uuid = str(uuid.uuid1())
-        with self.lock:
-            self.tts_speech_token_dict[this_uuid], self.llm_end_dict[this_uuid] = source_speech_token.flatten().tolist(), True
-            self.hift_cache_dict[this_uuid] = None
-            self.mel_overlap_dict[this_uuid] = torch.zeros(1, 80, 0)
-            self.flow_cache_dict[this_uuid] = torch.zeros(1, 80, 0, 2)
-        if stream is True:
-            token_hop_len = self.token_min_hop_len
-            while True:
-                if len(self.tts_speech_token_dict[this_uuid]) >= token_hop_len + self.token_overlap_len:
-                    this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:token_hop_len + self.token_overlap_len]) \
-                        .unsqueeze(dim=0)
-                    this_tts_speech = self.token2wav(token=this_tts_speech_token,
-                                                     prompt_token=flow_prompt_speech_token,
-                                                     prompt_feat=prompt_speech_feat,
-                                                     embedding=flow_embedding,
-                                                     uuid=this_uuid,
-                                                     finalize=False)
-                    yield {'tts_speech': this_tts_speech.cpu()}
-                    with self.lock:
-                        self.tts_speech_token_dict[this_uuid] = self.tts_speech_token_dict[this_uuid][token_hop_len:]
-                    # increase token_hop_len for better speech quality
-                    token_hop_len = min(self.token_max_hop_len, int(token_hop_len * self.stream_scale_factor))
-                if self.llm_end_dict[this_uuid] is True and len(self.tts_speech_token_dict[this_uuid]) < token_hop_len + self.token_overlap_len:
-                    break
-            # deal with remain tokens, make sure inference remain token len equals token_hop_len when cache_speech is not None
-            this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
-            this_tts_speech = self.token2wav(token=this_tts_speech_token,
-                                             prompt_token=flow_prompt_speech_token,
-                                             prompt_feat=prompt_speech_feat,
-                                             embedding=flow_embedding,
-                                             uuid=this_uuid,
-                                             finalize=True)
-            yield {'tts_speech': this_tts_speech.cpu()}
-        else:
-            # deal with all tokens
-            this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
-            this_tts_speech = self.token2wav(token=this_tts_speech_token,
-                                             prompt_token=flow_prompt_speech_token,
-                                             prompt_feat=prompt_speech_feat,
-                                             embedding=flow_embedding,
-                                             uuid=this_uuid,
-                                             finalize=True,
-                                             speed=speed)
-            yield {'tts_speech': this_tts_speech.cpu()}
-        with self.lock:
-            self.tts_speech_token_dict.pop(this_uuid)
-            self.llm_end_dict.pop(this_uuid)
-            self.mel_overlap_dict.pop(this_uuid)
-            self.hift_cache_dict.pop(this_uuid)

-
-class CosyVoice2Model:
+class CosyVoice2Model(CosyVoiceModel):

     def __init__(self,
                  llm: torch.nn.Module,
                  flow: torch.nn.Module,
-                 hift: torch.nn.Module):
+                 hift: torch.nn.Module,
+                 fp16: bool = False,
+                 use_flow_cache: bool = False):
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         self.llm = llm
         self.flow = flow
         self.hift = hift
-        self.token_hop_len = 2 * self.flow.input_frame_rate
-        # here we fix flow encoder/decoder decoding_chunk_size, in the future we will send it as arguments, or use cache
-        self.flow.encoder.static_chunk_size = 2 * self.flow.input_frame_rate
-        self.flow.decoder.estimator.static_chunk_size = 2 * self.flow.input_frame_rate * self.flow.token_mel_ratio
+        self.fp16 = fp16
+        self.use_flow_cache = use_flow_cache
+        if self.fp16 is True:
+            self.llm.half()
+            self.flow.half()
+        # stream related params, check examples/libritts/cosyvoice2/conf/cosyvoice2.yaml
+        self.token_hop_len = 25
+        self.flow_decoder_required_cache_size = 0 if use_flow_cache is False else 1 * self.token_hop_len * self.flow.token_mel_ratio
         # hift cache
         self.mel_cache_len = 8
         self.source_cache_len = int(self.mel_cache_len * 480)
         # speech fade in out
         self.speech_window = np.hamming(2 * self.source_cache_len)
         # rtf and decoding related
-        self.stream_scale_factor = 1
         self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext()
         self.lock = threading.Lock()
         # dict used to store session related variable
         self.tts_speech_token_dict = {}
         self.llm_end_dict = {}
+        self.flow_cache_dict = {}
         self.hift_cache_dict = {}

-    def load(self, llm_model, flow_model, hift_model):
-        self.llm.load_state_dict(torch.load(llm_model, map_location=self.device), strict=True)
-        self.llm.to(self.device).eval()
-        self.flow.load_state_dict(torch.load(flow_model, map_location=self.device), strict=True)
-        self.flow.to(self.device).eval()
-        self.flow.decoder.fp16 = False
-        # in case hift_model is a hifigan model
-        hift_state_dict = {k.replace('generator.', ''): v for k, v in torch.load(hift_model, map_location=self.device).items()}
-        self.hift.load_state_dict(hift_state_dict, strict=True)
-        self.hift.to(self.device).eval()
+    def init_flow_cache(self):
+        encoder_cache = {'offset': 0,
+                         'pre_lookahead_layer_conv2_cache': torch.zeros(1, 512, 2).to(self.device),
+                         'encoders_kv_cache': torch.zeros(6, 1, 8, 0, 64 * 2).to(self.device),
+                         'upsample_offset': 0,
+                         'upsample_conv_cache': torch.zeros(1, 512, 4).to(self.device),
+                         'upsample_kv_cache': torch.zeros(4, 1, 8, 0, 64 * 2).to(self.device)}
+        decoder_cache = {'offset': 0,
+                         'down_blocks_conv_cache': torch.zeros(10, 1, 2, 832, 2).to(self.device),
+                         'down_blocks_kv_cache': torch.zeros(10, 1, 4, 2, self.flow_decoder_required_cache_size, 512, 2).to(self.device),
+                         'mid_blocks_conv_cache': torch.zeros(10, 12, 2, 512, 2).to(self.device),
+                         'mid_blocks_kv_cache': torch.zeros(10, 12, 4, 2, self.flow_decoder_required_cache_size, 512, 2).to(self.device),
+                         'up_blocks_conv_cache': torch.zeros(10, 1, 2, 1024, 2).to(self.device),
+                         'up_blocks_kv_cache': torch.zeros(10, 1, 4, 2, self.flow_decoder_required_cache_size, 512, 2).to(self.device),
+                         'final_blocks_conv_cache': torch.zeros(10, 2, 256, 2).to(self.device)}
+        if self.fp16 is True:
+            for cache in [encoder_cache, decoder_cache]:
+                for k, v in cache.items():
+                    if isinstance(v, torch.Tensor):
+                        cache[k] = v.half()
+        cache = {'encoder_cache': encoder_cache, 'decoder_cache': decoder_cache}
+        return cache

     def load_jit(self, flow_encoder_model):
         flow_encoder = torch.jit.load(flow_encoder_model, map_location=self.device)
         self.flow.encoder = flow_encoder

-    def load_onnx(self, flow_decoder_estimator_model):
-        import onnxruntime
-        option = onnxruntime.SessionOptions()
-        option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
-        option.intra_op_num_threads = 1
-        providers = ['CUDAExecutionProvider' if torch.cuda.is_available() else 'CPUExecutionProvider']
-        del self.flow.decoder.estimator
-        self.flow.decoder.estimator = onnxruntime.InferenceSession(flow_decoder_estimator_model, sess_options=option, providers=providers)
+    def get_trt_kwargs(self):
+        min_shape = [(2, 80, 4), (2, 1, 4), (2, 80, 4), (2, 80, 4), (1, 4, 2, 0, 512, 2), (12, 4, 2, 0, 512, 2), (1, 4, 2, 0, 512, 2)]
+        opt_shape = [(2, 80, 200), (2, 1, 200), (2, 80, 200), (2, 80, 200), (1, 4, 2, 100, 512, 2), (12, 4, 2, 100, 512, 2), (1, 4, 2, 100, 512, 2)]
+        max_shape = [(2, 80, 1500), (2, 1, 1500), (2, 80, 1500), (2, 80, 1500), (1, 4, 2, 200, 512, 2), (12, 4, 2, 200, 512, 2), (1, 4, 2, 200, 512, 2)]
+        input_names = ["x", "mask", "mu", "cond", 'down_blocks_kv_cache', 'mid_blocks_kv_cache', 'up_blocks_kv_cache']
+        assert self.use_flow_cache is True, "get_trt_kwargs is set for flow cache mode. If you want to use trt with use_flow_cache=False, please set higher max_shape"
+        return {'min_shape': min_shape, 'opt_shape': opt_shape, 'max_shape': max_shape, 'input_names': input_names}

-    def load_trt(self, flow_decoder_estimator_model):
-        del self.flow.decoder.estimator
-        import tensorrt as trt
-        with open(flow_decoder_estimator_model, 'rb') as f:
-            self.flow.decoder.estimator_engine = trt.Runtime(trt.Logger(trt.Logger.INFO)).deserialize_cuda_engine(f.read())
-        self.flow.decoder.estimator = self.flow.decoder.estimator_engine.create_execution_context()
-        self.flow.decoder.fp16 = True
-
-    def llm_job(self, text, prompt_text, llm_prompt_speech_token, llm_embedding, uuid):
-        with self.llm_context:
-            for i in self.llm.inference(text=text.to(self.device),
-                                        text_len=torch.tensor([text.shape[1]], dtype=torch.int32).to(self.device),
-                                        prompt_text=prompt_text.to(self.device),
-                                        prompt_text_len=torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(self.device),
-                                        prompt_speech_token=llm_prompt_speech_token.to(self.device),
-                                        prompt_speech_token_len=torch.tensor([llm_prompt_speech_token.shape[1]], dtype=torch.int32).to(self.device),
-                                        embedding=llm_embedding.to(self.device)):
-                self.tts_speech_token_dict[uuid].append(i)
-        self.llm_end_dict[uuid] = True
-
-    def token2wav(self, token, prompt_token, prompt_feat, embedding, uuid, token_offset, finalize=False, speed=1.0):
-        tts_mel, _ = self.flow.inference(token=token.to(self.device),
-                                         token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device),
-                                         prompt_token=prompt_token.to(self.device),
-                                         prompt_token_len=torch.tensor([prompt_token.shape[1]], dtype=torch.int32).to(self.device),
-                                         prompt_feat=prompt_feat.to(self.device),
-                                         prompt_feat_len=torch.tensor([prompt_feat.shape[1]], dtype=torch.int32).to(self.device),
-                                         embedding=embedding.to(self.device),
-                                         finalize=finalize)
-        tts_mel = tts_mel[:, :, token_offset * self.flow.token_mel_ratio:]
+    def token2wav(self, token, prompt_token, prompt_feat, embedding, uuid, finalize=False, speed=1.0):
+        with torch.cuda.amp.autocast(self.fp16):
+            tts_mel, self.flow_cache_dict[uuid] = self.flow.inference(token=token.to(self.device),
+                                                                      token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device),
+                                                                      prompt_token=prompt_token.to(self.device),
+                                                                      prompt_token_len=torch.tensor([prompt_token.shape[1]], dtype=torch.int32).to(self.device),
+                                                                      prompt_feat=prompt_feat.to(self.device),
+                                                                      prompt_feat_len=torch.tensor([prompt_feat.shape[1]], dtype=torch.int32).to(self.device),
+                                                                      embedding=embedding.to(self.device),
+                                                                      cache=self.flow_cache_dict[uuid],
+                                                                      finalize=finalize)
         # append hift cache
         if self.hift_cache_dict[uuid] is not None:
             hift_cache_mel, hift_cache_source = self.hift_cache_dict[uuid]['mel'], self.hift_cache_dict[uuid]['source']
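With vc() removed and CosyVoice2Model now subclassing CosyVoiceModel, voice conversion goes through the unified tts() entry point: a non-empty source_speech_token routes the request to vc_job, which seeds the token queue directly instead of running the LLM. A hedged usage sketch; the model instance and the prompt tensors are placeholders and the shapes are illustrative only:

    # voice conversion via tts() after this change (hypothetical inputs)
    for out in model.tts(source_speech_token=source_speech_token,       # (1, T) int32 tokens of the source utterance
                         flow_prompt_speech_token=prompt_speech_token,  # target-speaker prompt tokens
                         prompt_speech_feat=prompt_speech_feat,         # (1, T', 80) target-speaker mel prompt
                         flow_embedding=flow_embedding,                 # target-speaker embedding
                         stream=False):
        wav = out['tts_speech']  # (1, num_samples) waveform tensor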
@@ -365,34 +338,44 @@ class CosyVoice2Model:
         tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window)
         return tts_speech

-    def tts(self, text, flow_embedding, llm_embedding=torch.zeros(0, 192),
+    def tts(self, text=torch.zeros(1, 0, dtype=torch.int32), flow_embedding=torch.zeros(0, 192), llm_embedding=torch.zeros(0, 192),
             prompt_text=torch.zeros(1, 0, dtype=torch.int32),
             llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32),
             flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32),
-            prompt_speech_feat=torch.zeros(1, 0, 80), stream=False, speed=1.0, **kwargs):
+            prompt_speech_feat=torch.zeros(1, 0, 80), source_speech_token=torch.zeros(1, 0, dtype=torch.int32), stream=False, speed=1.0, **kwargs):
         # this_uuid is used to track variables related to this inference thread
         this_uuid = str(uuid.uuid1())
         with self.lock:
             self.tts_speech_token_dict[this_uuid], self.llm_end_dict[this_uuid] = [], False
             self.hift_cache_dict[this_uuid] = None
-        p = threading.Thread(target=self.llm_job, args=(text, prompt_text, llm_prompt_speech_token, llm_embedding, this_uuid))
+            self.flow_cache_dict[this_uuid] = self.init_flow_cache()
+        if source_speech_token.shape[1] == 0:
+            p = threading.Thread(target=self.llm_job, args=(text, prompt_text, llm_prompt_speech_token, llm_embedding, this_uuid))
+        else:
+            p = threading.Thread(target=self.vc_job, args=(source_speech_token, this_uuid))
         p.start()
         if stream is True:
-            token_offset = 0
+            assert self.use_flow_cache is True, "set use_flow_cache=True if you want to use stream inference to avoid OOM"
+            # NOTE in cache mode, trim flow_prompt to same size as flow_decoder_required_cache_size
+            flow_prompt_speech_token = flow_prompt_speech_token[:, -int(self.flow_decoder_required_cache_size / self.flow.token_mel_ratio):]
+            prompt_speech_feat = prompt_speech_feat[:, -self.flow_decoder_required_cache_size:]
             while True:
                 time.sleep(0.1)
-                if len(self.tts_speech_token_dict[this_uuid]) - token_offset >= self.token_hop_len + self.flow.pre_lookahead_len:
-                    this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:token_offset + self.token_hop_len + self.flow.pre_lookahead_len]).unsqueeze(dim=0)
+                if len(self.tts_speech_token_dict[this_uuid]) >= self.token_hop_len + self.flow.pre_lookahead_len:
+                    this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:self.token_hop_len + self.flow.pre_lookahead_len]).unsqueeze(dim=0)
                     this_tts_speech = self.token2wav(token=this_tts_speech_token,
                                                      prompt_token=flow_prompt_speech_token,
                                                      prompt_feat=prompt_speech_feat,
                                                      embedding=flow_embedding,
                                                      uuid=this_uuid,
-                                                     token_offset=token_offset,
                                                      finalize=False)
-                    token_offset += self.token_hop_len
+                    # NOTE in cache inference mode, we only use flow_prompt_speech_token/prompt_speech_feat in first chunk
+                    flow_prompt_speech_token = torch.zeros(1, 0, dtype=torch.int32).to(self.device)
+                    prompt_speech_feat = torch.zeros(1, 0, 80).to(self.device)
                     yield {'tts_speech': this_tts_speech.cpu()}
-                if self.llm_end_dict[this_uuid] is True and len(self.tts_speech_token_dict[this_uuid]) - token_offset < self.token_hop_len + self.flow.pre_lookahead_len:
+                    with self.lock:
+                        self.tts_speech_token_dict[this_uuid] = self.tts_speech_token_dict[this_uuid][self.token_hop_len:]
+                if self.llm_end_dict[this_uuid] is True and len(self.tts_speech_token_dict[this_uuid]) < self.token_hop_len + self.flow.pre_lookahead_len:
                     break
             p.join()
             # deal with remain tokens, make sure inference remain token len equals token_hop_len when cache_speech is not None
@@ -402,11 +385,11 @@ class CosyVoice2Model:
                                              prompt_feat=prompt_speech_feat,
                                              embedding=flow_embedding,
                                              uuid=this_uuid,
-                                             token_offset=token_offset,
                                              finalize=True)
             yield {'tts_speech': this_tts_speech.cpu()}
         else:
             # deal with all tokens
+            assert self.use_flow_cache is False, "set use_flow_cache=False for nonstream inference"
             p.join()
             this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
             this_tts_speech = self.token2wav(token=this_tts_speech_token,
@@ -414,10 +397,12 @@ class CosyVoice2Model:
                                              prompt_feat=prompt_speech_feat,
                                              embedding=flow_embedding,
                                              uuid=this_uuid,
-                                             token_offset=0,
                                              finalize=True,
                                              speed=speed)
             yield {'tts_speech': this_tts_speech.cpu()}
         with self.lock:
             self.tts_speech_token_dict.pop(this_uuid)
             self.llm_end_dict.pop(this_uuid)
+            self.hift_cache_dict.pop(this_uuid)
+            self.flow_cache_dict.pop(this_uuid)
+        torch.cuda.empty_cache()
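CosyVoice2Model now takes fp16 and use_flow_cache at construction time, inherits load()/llm_job()/vc_job() from CosyVoiceModel, and asserts on use_flow_cache in both branches of tts(). A hedged construction sketch, assuming llm/flow/hift are the usual torch modules and the checkpoint paths are placeholders:

    # streaming needs the flow cache; tts(..., stream=True) asserts use_flow_cache is True
    model = CosyVoice2Model(llm, flow, hift, fp16=True, use_flow_cache=True)
    model.load('llm.pt', 'flow.pt', 'hift.pt')
    chunks = [out['tts_speech'] for out in model.tts(text=text_tokens,
                                                     flow_embedding=flow_embedding,
                                                     stream=True)]
    # non-streaming is the opposite: construct with use_flow_cache=False and call tts(..., stream=False)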
xinference/thirdparty/cosyvoice/dataset/processor.py

@@ -20,8 +20,8 @@ import torch
 import torchaudio
 from torch.nn.utils.rnn import pad_sequence
 import torch.nn.functional as F
+import pyworld as pw

-torchaudio.set_audio_backend('soundfile')

 AUDIO_FORMAT_SETS = {'flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma'}

@@ -179,7 +179,7 @@ def compute_fbank(data,
         yield sample


-def compute_f0(data, pitch_extractor, mode='train'):
+def compute_f0(data, sample_rate, hop_size, mode='train'):
    """ Extract f0

        Args:
@@ -188,15 +188,19 @@ def compute_f0(data, pitch_extractor, mode='train'):
        Returns:
            Iterable[{key, feat, label}]
    """
+    frame_period = hop_size * 1000 / sample_rate
    for sample in data:
        assert 'sample_rate' in sample
        assert 'speech' in sample
        assert 'utt' in sample
        assert 'text_token' in sample
        waveform = sample['speech']
-        mat = pitch_extractor(waveform).transpose(1, 2)
-        mat = F.interpolate(mat, size=sample['speech_feat'].shape[0], mode='linear')
-        sample['pitch_feat'] = mat[0, 0]
+        _f0, t = pw.harvest(waveform.squeeze(dim=0).numpy().astype('double'), sample_rate, frame_period=frame_period)
+        if sum(_f0 != 0) < 5:  # this happens when the algorithm fails
+            _f0, t = pw.dio(waveform.squeeze(dim=0).numpy().astype('double'), sample_rate, frame_period=frame_period)  # if harvest fails, try dio
+        f0 = pw.stonemask(waveform.squeeze(dim=0).numpy().astype('double'), _f0, t, sample_rate)
+        f0 = F.interpolate(torch.from_numpy(f0).view(1, 1, -1), size=sample['speech_feat'].shape[0], mode='linear').view(-1)
+        sample['pitch_feat'] = f0
        yield sample

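compute_f0 drops the learned pitch_extractor in favour of pyworld: harvest first, dio as a fallback when harvest returns an (almost) all-zero track, then stonemask to refine, and a linear interpolation to align the f0 track with the mel frames. Pulled out of the pipeline, the same logic looks roughly like this (extract_f0 is a hypothetical helper, not part of the diff):

    import numpy as np
    import pyworld as pw
    import torch
    import torch.nn.functional as F

    def extract_f0(waveform, sample_rate, hop_size, num_frames):
        frame_period = hop_size * 1000 / sample_rate          # ms per analysis frame, matching the mel hop size
        x = waveform.squeeze(0).numpy().astype('double')
        f0, t = pw.harvest(x, sample_rate, frame_period=frame_period)
        if np.sum(f0 != 0) < 5:                               # harvest occasionally fails; fall back to dio
            f0, t = pw.dio(x, sample_rate, frame_period=frame_period)
        f0 = pw.stonemask(x, f0, t, sample_rate)              # refine the coarse estimate
        # resample the f0 track to the mel frame count, as compute_f0 does
        return F.interpolate(torch.from_numpy(f0).view(1, 1, -1), size=num_frames, mode='linear').view(-1)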