xinference 1.5.0.post2__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +107 -11
- xinference/client/restful/restful_client.py +51 -11
- xinference/constants.py +5 -1
- xinference/core/media_interface.py +758 -0
- xinference/core/model.py +49 -9
- xinference/core/supervisor.py +1 -1
- xinference/core/utils.py +1 -1
- xinference/core/worker.py +33 -39
- xinference/deploy/cmdline.py +17 -0
- xinference/deploy/utils.py +0 -3
- xinference/model/audio/__init__.py +16 -27
- xinference/model/audio/core.py +2 -1
- xinference/model/audio/cosyvoice.py +4 -2
- xinference/model/audio/model_spec.json +63 -46
- xinference/model/audio/model_spec_modelscope.json +31 -14
- xinference/model/embedding/__init__.py +16 -24
- xinference/model/image/__init__.py +15 -25
- xinference/model/llm/__init__.py +40 -115
- xinference/model/llm/core.py +29 -6
- xinference/model/llm/llama_cpp/core.py +30 -347
- xinference/model/llm/llm_family.json +1674 -2203
- xinference/model/llm/llm_family.py +71 -7
- xinference/model/llm/llm_family_csghub.json +0 -32
- xinference/model/llm/llm_family_modelscope.json +1838 -2016
- xinference/model/llm/llm_family_openmind_hub.json +19 -325
- xinference/model/llm/lmdeploy/core.py +7 -2
- xinference/model/llm/mlx/core.py +23 -7
- xinference/model/llm/reasoning_parser.py +281 -5
- xinference/model/llm/sglang/core.py +39 -11
- xinference/model/llm/transformers/chatglm.py +9 -2
- xinference/model/llm/transformers/cogagent.py +10 -12
- xinference/model/llm/transformers/cogvlm2.py +6 -3
- xinference/model/llm/transformers/cogvlm2_video.py +3 -6
- xinference/model/llm/transformers/core.py +58 -60
- xinference/model/llm/transformers/deepseek_v2.py +4 -2
- xinference/model/llm/transformers/deepseek_vl.py +10 -4
- xinference/model/llm/transformers/deepseek_vl2.py +9 -4
- xinference/model/llm/transformers/gemma3.py +4 -5
- xinference/model/llm/transformers/glm4v.py +3 -21
- xinference/model/llm/transformers/glm_edge_v.py +3 -20
- xinference/model/llm/transformers/intern_vl.py +3 -6
- xinference/model/llm/transformers/internlm2.py +1 -1
- xinference/model/llm/transformers/minicpmv25.py +4 -2
- xinference/model/llm/transformers/minicpmv26.py +5 -3
- xinference/model/llm/transformers/omnilmm.py +1 -1
- xinference/model/llm/transformers/opt.py +1 -1
- xinference/model/llm/transformers/ovis2.py +302 -0
- xinference/model/llm/transformers/qwen-omni.py +8 -1
- xinference/model/llm/transformers/qwen2_audio.py +3 -1
- xinference/model/llm/transformers/qwen2_vl.py +5 -1
- xinference/model/llm/transformers/qwen_vl.py +5 -2
- xinference/model/llm/utils.py +96 -45
- xinference/model/llm/vllm/core.py +108 -24
- xinference/model/llm/vllm/distributed_executor.py +8 -7
- xinference/model/llm/vllm/xavier/allocator.py +1 -1
- xinference/model/llm/vllm/xavier/block_manager.py +1 -1
- xinference/model/llm/vllm/xavier/block_tracker.py +3 -3
- xinference/model/llm/vllm/xavier/executor.py +1 -1
- xinference/model/llm/vllm/xavier/test/test_xavier.py +2 -11
- xinference/model/rerank/__init__.py +13 -24
- xinference/model/video/__init__.py +15 -25
- xinference/model/video/core.py +3 -3
- xinference/model/video/diffusers.py +157 -13
- xinference/model/video/model_spec.json +100 -0
- xinference/model/video/model_spec_modelscope.json +104 -0
- xinference/thirdparty/cosyvoice/bin/average_model.py +5 -4
- xinference/thirdparty/cosyvoice/bin/export_jit.py +50 -20
- xinference/thirdparty/cosyvoice/bin/export_onnx.py +136 -51
- xinference/thirdparty/cosyvoice/bin/inference.py +15 -5
- xinference/thirdparty/cosyvoice/bin/train.py +7 -2
- xinference/thirdparty/cosyvoice/cli/cosyvoice.py +72 -52
- xinference/thirdparty/cosyvoice/cli/frontend.py +58 -58
- xinference/thirdparty/cosyvoice/cli/model.py +140 -155
- xinference/thirdparty/cosyvoice/dataset/processor.py +9 -5
- xinference/thirdparty/cosyvoice/flow/decoder.py +656 -54
- xinference/thirdparty/cosyvoice/flow/flow.py +69 -11
- xinference/thirdparty/cosyvoice/flow/flow_matching.py +167 -63
- xinference/thirdparty/cosyvoice/flow/length_regulator.py +1 -0
- xinference/thirdparty/cosyvoice/hifigan/discriminator.py +91 -1
- xinference/thirdparty/cosyvoice/hifigan/f0_predictor.py +4 -1
- xinference/thirdparty/cosyvoice/hifigan/generator.py +4 -1
- xinference/thirdparty/cosyvoice/hifigan/hifigan.py +2 -2
- xinference/thirdparty/cosyvoice/llm/llm.py +198 -18
- xinference/thirdparty/cosyvoice/transformer/embedding.py +12 -4
- xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +124 -21
- xinference/thirdparty/cosyvoice/utils/class_utils.py +13 -0
- xinference/thirdparty/cosyvoice/utils/common.py +1 -1
- xinference/thirdparty/cosyvoice/utils/file_utils.py +40 -2
- xinference/thirdparty/cosyvoice/utils/frontend_utils.py +7 -0
- xinference/thirdparty/cosyvoice/utils/mask.py +4 -0
- xinference/thirdparty/cosyvoice/utils/train_utils.py +5 -1
- xinference/thirdparty/matcha/hifigan/xutils.py +3 -3
- xinference/types.py +2 -71
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/{main.0f6523be.css → main.337afe76.css} +2 -2
- xinference/web/ui/build/static/css/main.337afe76.css.map +1 -0
- xinference/web/ui/build/static/js/main.ae579a97.js +3 -0
- xinference/web/ui/build/static/js/main.ae579a97.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0196a4b09e3264614e54360d5f832c46b31d964ec58296765ebff191ace6adbf.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/12e02ee790dbf57ead09a241a93bb5f893393aa36628ca741d44390e836a103f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/18fa271456b31cded36c05c4c71c6b2b1cf4e4128c1e32f0e45d8b9f21764397.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5c08e2cd07809ed3e41486b16652253404cbb63a3ff8d0366ee50f57e2413cea.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/6798e126f3bc5f95a4c16a9c2ad52ffe77970c62406d83e20604dfda7ffd2247.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b617f7d21a95045fc57b26a9373551740f1978a826134cbf705c3a1bf8714a93.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/c1506cb142151366074975f30fa1ff9cd6e5e978b62a4b074dfc16fe08d70d75.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/c5c7c2cd1b863ce41adff2c4737bba06eef3a1acf28288cb83d992060f6b8923.json +1 -0
- xinference/web/ui/src/locales/en.json +7 -4
- xinference/web/ui/src/locales/zh.json +7 -4
- {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/METADATA +56 -36
- {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/RECORD +120 -121
- {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/WHEEL +1 -1
- xinference/core/image_interface.py +0 -377
- xinference/model/llm/transformers/compression.py +0 -258
- xinference/model/llm/transformers/yi_vl.py +0 -239
- xinference/thirdparty/cosyvoice/bin/export_trt.sh +0 -9
- xinference/web/ui/build/static/css/main.0f6523be.css.map +0 -1
- xinference/web/ui/build/static/js/main.4b67a723.js +0 -3
- xinference/web/ui/build/static/js/main.4b67a723.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0f0adb2283a8f469d097a7a0ebb754624fa52414c83b83696c41f2e6a737ceda.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/51709f5d3e53bcf19e613662ef9b91fb9174942c5518987a248348dd4e1e0e02.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8157db83995c671eb57abc316c337f867d1dc63fb83520bb4ff351fee57dcce2.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8f9af2979e45d4648f0cfae108363e58ee421c29a9d4e7329b6f06d9adfd4133.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/9c8b1a86e7c65b2b2599a205e30920652d6c2105f926508ef5bcf29a3ef4ce76.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/b8551e9775a01b28ae674125c688febe763732ea969ae344512e64ea01bf632e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e4ba658c6b3b0490910acdae0c535a892257efb61539a24adf8038fc653bd22f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/efe7cd132c27a8f9fd5352a394c491fd5fb0da0348cf9fcbd923164a32365eab.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f04f666b77b44d7be3e16034d6b0074de2ba9c254f1fae15222b3148608fa8b3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f199e8173f6409a5802ed44acb95f218388131136504b2e9132129e150c92f9a.json +0 -1
- /xinference/web/ui/build/static/js/{main.4b67a723.js.LICENSE.txt → main.ae579a97.js.LICENSE.txt} +0 -0
- {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/top_level.txt +0 -0
The hunks below are from xinference/thirdparty/cosyvoice/cli/model.py (CosyVoiceModel / CosyVoice2Model):

@@ -11,6 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import os
+from typing import Generator
 import torch
 import numpy as np
 import threading
@@ -19,6 +21,7 @@ from torch.nn import functional as F
 from contextlib import nullcontext
 import uuid
 from cosyvoice.utils.common import fade_in_out
+from cosyvoice.utils.file_utils import convert_onnx_to_trt
 
 
 class CosyVoiceModel:
@@ -27,17 +30,18 @@ class CosyVoiceModel:
                  llm: torch.nn.Module,
                  flow: torch.nn.Module,
                  hift: torch.nn.Module,
-                 fp16: bool):
+                 fp16: bool = False):
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         self.llm = llm
         self.flow = flow
         self.hift = hift
         self.fp16 = fp16
+        if self.fp16 is True:
+            self.llm.half()
+            self.flow.half()
         self.token_min_hop_len = 2 * self.flow.input_frame_rate
         self.token_max_hop_len = 4 * self.flow.input_frame_rate
         self.token_overlap_len = 20
-        # here we fix set flow.decoder.estimator.static_chunk_size = 0 for compatibability
-        self.flow.decoder.estimator.static_chunk_size = 0
         # mel fade in out
         self.mel_overlap_len = int(self.token_overlap_len / self.flow.input_frame_rate * 22050 / 256)
         self.mel_window = np.hamming(2 * self.mel_overlap_len)
@@ -61,8 +65,6 @@ class CosyVoiceModel:
     def load(self, llm_model, flow_model, hift_model):
         self.llm.load_state_dict(torch.load(llm_model, map_location=self.device), strict=True)
         self.llm.to(self.device).eval()
-        if self.fp16 is True:
-            self.llm.half()
         self.flow.load_state_dict(torch.load(flow_model, map_location=self.device), strict=True)
         self.flow.to(self.device).eval()
         # in case hift_model is a hifigan model
@@ -71,7 +73,6 @@
         self.hift.to(self.device).eval()
 
     def load_jit(self, llm_text_encoder_model, llm_llm_model, flow_encoder_model):
-        assert self.fp16 is True, "we only provide fp16 jit model, set fp16=True if you want to use jit model"
         llm_text_encoder = torch.jit.load(llm_text_encoder_model, map_location=self.device)
         self.llm.text_encoder = llm_text_encoder
         llm_llm = torch.jit.load(llm_llm_model, map_location=self.device)
@@ -79,39 +80,62 @@
         flow_encoder = torch.jit.load(flow_encoder_model, map_location=self.device)
         self.flow.encoder = flow_encoder
 
-    def …  [removed method, body truncated in the source view]
+    def load_trt(self, flow_decoder_estimator_model, flow_decoder_onnx_model, fp16):
+        assert torch.cuda.is_available(), 'tensorrt only supports gpu!'
+        if not os.path.exists(flow_decoder_estimator_model):
+            convert_onnx_to_trt(flow_decoder_estimator_model, self.get_trt_kwargs(), flow_decoder_onnx_model, fp16)
+        if os.path.getsize(flow_decoder_estimator_model) == 0:
+            raise ValueError('{} is empty file, delete it and export again!'.format(flow_decoder_estimator_model))
         del self.flow.decoder.estimator
-        …  [removed line truncated in the source view]
+        import tensorrt as trt
+        with open(flow_decoder_estimator_model, 'rb') as f:
+            self.flow.decoder.estimator_engine = trt.Runtime(trt.Logger(trt.Logger.INFO)).deserialize_cuda_engine(f.read())
+        assert self.flow.decoder.estimator_engine is not None, 'failed to load trt {}'.format(flow_decoder_estimator_model)
+        self.flow.decoder.estimator = self.flow.decoder.estimator_engine.create_execution_context()
+
+    def get_trt_kwargs(self):
+        min_shape = [(2, 80, 4), (2, 1, 4), (2, 80, 4), (2, 80, 4)]
+        opt_shape = [(2, 80, 200), (2, 1, 200), (2, 80, 200), (2, 80, 200)]
+        max_shape = [(2, 80, 3000), (2, 1, 3000), (2, 80, 3000), (2, 80, 3000)]
+        input_names = ["x", "mask", "mu", "cond"]
+        return {'min_shape': min_shape, 'opt_shape': opt_shape, 'max_shape': max_shape, 'input_names': input_names}
 
     def llm_job(self, text, prompt_text, llm_prompt_speech_token, llm_embedding, uuid):
-        …  [removed body truncated in the source view]
+        with self.llm_context, torch.cuda.amp.autocast(self.fp16):
+            if isinstance(text, Generator):
+                assert isinstance(self, CosyVoice2Model), 'streaming input text is only implemented for CosyVoice2!'
+                for i in self.llm.inference_bistream(text=text,
+                                                     prompt_text=prompt_text.to(self.device),
+                                                     prompt_text_len=torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(self.device),
+                                                     prompt_speech_token=llm_prompt_speech_token.to(self.device),
+                                                     prompt_speech_token_len=torch.tensor([llm_prompt_speech_token.shape[1]], dtype=torch.int32).to(self.device),
+                                                     embedding=llm_embedding.to(self.device)):
+                    self.tts_speech_token_dict[uuid].append(i)
+            else:
+                for i in self.llm.inference(text=text.to(self.device),
+                                            text_len=torch.tensor([text.shape[1]], dtype=torch.int32).to(self.device),
+                                            prompt_text=prompt_text.to(self.device),
+                                            prompt_text_len=torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(self.device),
+                                            prompt_speech_token=llm_prompt_speech_token.to(self.device),
+                                            prompt_speech_token_len=torch.tensor([llm_prompt_speech_token.shape[1]], dtype=torch.int32).to(self.device),
+                                            embedding=llm_embedding.to(self.device)):
+                    self.tts_speech_token_dict[uuid].append(i)
+        self.llm_end_dict[uuid] = True
+
+    def vc_job(self, source_speech_token, uuid):
+        self.tts_speech_token_dict[uuid] = source_speech_token.flatten().tolist()
         self.llm_end_dict[uuid] = True
 
     def token2wav(self, token, prompt_token, prompt_feat, embedding, uuid, finalize=False, speed=1.0):
-        …  [removed body truncated in the source view]
+        with torch.cuda.amp.autocast(self.fp16):
+            tts_mel, self.flow_cache_dict[uuid] = self.flow.inference(token=token.to(self.device),
+                                                                      token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device),
+                                                                      prompt_token=prompt_token.to(self.device),
+                                                                      prompt_token_len=torch.tensor([prompt_token.shape[1]], dtype=torch.int32).to(self.device),
+                                                                      prompt_feat=prompt_feat.to(self.device),
+                                                                      prompt_feat_len=torch.tensor([prompt_feat.shape[1]], dtype=torch.int32).to(self.device),
+                                                                      embedding=embedding.to(self.device),
+                                                                      flow_cache=self.flow_cache_dict[uuid])
 
         # mel overlap fade in out
         if self.mel_overlap_dict[uuid].shape[2] != 0:
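Taken together, these hunks swap the old ONNX estimator path for a TensorRT one: load_trt() builds the engine from the ONNX flow decoder on first use (via the newly imported convert_onnx_to_trt) and deserializes it into an execution context, while get_trt_kwargs() supplies the dynamic-shape profile. A minimal sketch of how the new API might be driven, assuming already-constructed llm/flow/hift modules and placeholder file paths (none of these paths come from this diff):

    # Sketch only: model construction is omitted; the paths below are placeholders.
    model = CosyVoiceModel(llm, flow, hift, fp16=True)
    model.load('llm.pt', 'flow.pt', 'hift.pt')
    # Converts the ONNX estimator to a TensorRT plan on first call, then loads it
    # into flow.decoder.estimator as an execution context.
    model.load_trt('flow.decoder.estimator.fp16.plan',
                   'flow.decoder.estimator.fp16.onnx',
                   fp16=True)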
@@ -142,11 +166,11 @@ class CosyVoiceModel:
         tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window)
         return tts_speech
 
-    def tts(self, text, flow_embedding, llm_embedding=torch.zeros(0, 192),
+    def tts(self, text=torch.zeros(1, 0, dtype=torch.int32), flow_embedding=torch.zeros(0, 192), llm_embedding=torch.zeros(0, 192),
             prompt_text=torch.zeros(1, 0, dtype=torch.int32),
             llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32),
             flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32),
-            prompt_speech_feat=torch.zeros(1, 0, 80), stream=False, speed=1.0, **kwargs):
+            prompt_speech_feat=torch.zeros(1, 0, 80), source_speech_token=torch.zeros(1, 0, dtype=torch.int32), stream=False, speed=1.0, **kwargs):
         # this_uuid is used to track variables related to this inference thread
         this_uuid = str(uuid.uuid1())
         with self.lock:
@@ -154,7 +178,10 @@
             self.hift_cache_dict[this_uuid] = None
             self.mel_overlap_dict[this_uuid] = torch.zeros(1, 80, 0)
             self.flow_cache_dict[this_uuid] = torch.zeros(1, 80, 0, 2)
-        …  [removed line truncated in the source view]
+        if source_speech_token.shape[1] == 0:
+            p = threading.Thread(target=self.llm_job, args=(text, prompt_text, llm_prompt_speech_token, llm_embedding, this_uuid))
+        else:
+            p = threading.Thread(target=self.vc_job, args=(source_speech_token, this_uuid))
         p.start()
         if stream is True:
             token_hop_len = self.token_min_hop_len
@@ -204,143 +231,89 @@
             self.mel_overlap_dict.pop(this_uuid)
             self.hift_cache_dict.pop(this_uuid)
             self.flow_cache_dict.pop(this_uuid)
+        torch.cuda.empty_cache()
 
-    def vc(self, source_speech_token, flow_prompt_speech_token, prompt_speech_feat, flow_embedding, stream=False, speed=1.0, **kwargs):
-        # this_uuid is used to track variables related to this inference thread
-        this_uuid = str(uuid.uuid1())
-        with self.lock:
-            self.tts_speech_token_dict[this_uuid], self.llm_end_dict[this_uuid] = source_speech_token.flatten().tolist(), True
-            self.hift_cache_dict[this_uuid] = None
-            self.mel_overlap_dict[this_uuid] = torch.zeros(1, 80, 0)
-            self.flow_cache_dict[this_uuid] = torch.zeros(1, 80, 0, 2)
-        if stream is True:
-            token_hop_len = self.token_min_hop_len
-            while True:
-                if len(self.tts_speech_token_dict[this_uuid]) >= token_hop_len + self.token_overlap_len:
-                    this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:token_hop_len + self.token_overlap_len]) \
-                        .unsqueeze(dim=0)
-                    this_tts_speech = self.token2wav(token=this_tts_speech_token,
-                                                     prompt_token=flow_prompt_speech_token,
-                                                     prompt_feat=prompt_speech_feat,
-                                                     embedding=flow_embedding,
-                                                     uuid=this_uuid,
-                                                     finalize=False)
-                    yield {'tts_speech': this_tts_speech.cpu()}
-                    with self.lock:
-                        self.tts_speech_token_dict[this_uuid] = self.tts_speech_token_dict[this_uuid][token_hop_len:]
-                    # increase token_hop_len for better speech quality
-                    token_hop_len = min(self.token_max_hop_len, int(token_hop_len * self.stream_scale_factor))
-                if self.llm_end_dict[this_uuid] is True and len(self.tts_speech_token_dict[this_uuid]) < token_hop_len + self.token_overlap_len:
-                    break
-            # deal with remain tokens, make sure inference remain token len equals token_hop_len when cache_speech is not None
-            this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
-            this_tts_speech = self.token2wav(token=this_tts_speech_token,
-                                             prompt_token=flow_prompt_speech_token,
-                                             prompt_feat=prompt_speech_feat,
-                                             embedding=flow_embedding,
-                                             uuid=this_uuid,
-                                             finalize=True)
-            yield {'tts_speech': this_tts_speech.cpu()}
-        else:
-            # deal with all tokens
-            this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
-            this_tts_speech = self.token2wav(token=this_tts_speech_token,
-                                             prompt_token=flow_prompt_speech_token,
-                                             prompt_feat=prompt_speech_feat,
-                                             embedding=flow_embedding,
-                                             uuid=this_uuid,
-                                             finalize=True,
-                                             speed=speed)
-            yield {'tts_speech': this_tts_speech.cpu()}
-        with self.lock:
-            self.tts_speech_token_dict.pop(this_uuid)
-            self.llm_end_dict.pop(this_uuid)
-            self.mel_overlap_dict.pop(this_uuid)
-            self.hift_cache_dict.pop(this_uuid)
 
-
-class CosyVoice2Model:
+class CosyVoice2Model(CosyVoiceModel):
 
     def __init__(self,
                  llm: torch.nn.Module,
                  flow: torch.nn.Module,
-                 hift: torch.nn.Module
+                 hift: torch.nn.Module,
+                 fp16: bool = False,
+                 use_flow_cache: bool = False):
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         self.llm = llm
         self.flow = flow
         self.hift = hift
-        self.…  [four removed lines truncated in the source view]
+        self.fp16 = fp16
+        self.use_flow_cache = use_flow_cache
+        if self.fp16 is True:
+            self.llm.half()
+            self.flow.half()
+        # stream related params, check examples/libritts/cosyvoice2/conf/cosyvoice2.yaml
+        self.token_hop_len = 25
+        self.flow_decoder_required_cache_size = 0 if use_flow_cache is False else 1 * self.token_hop_len * self.flow.token_mel_ratio
         # hift cache
         self.mel_cache_len = 8
         self.source_cache_len = int(self.mel_cache_len * 480)
         # speech fade in out
         self.speech_window = np.hamming(2 * self.source_cache_len)
         # rtf and decoding related
-        self.stream_scale_factor = 1
         self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext()
         self.lock = threading.Lock()
         # dict used to store session related variable
         self.tts_speech_token_dict = {}
         self.llm_end_dict = {}
+        self.flow_cache_dict = {}
         self.hift_cache_dict = {}
 
-    def …  [removed method, body truncated in the source view]
+    def init_flow_cache(self):
+        encoder_cache = {'offset': 0,
+                         'pre_lookahead_layer_conv2_cache': torch.zeros(1, 512, 2).to(self.device),
+                         'encoders_kv_cache': torch.zeros(6, 1, 8, 0, 64 * 2).to(self.device),
+                         'upsample_offset': 0,
+                         'upsample_conv_cache': torch.zeros(1, 512, 4).to(self.device),
+                         'upsample_kv_cache': torch.zeros(4, 1, 8, 0, 64 * 2).to(self.device)}
+        decoder_cache = {'offset': 0,
+                         'down_blocks_conv_cache': torch.zeros(10, 1, 2, 832, 2).to(self.device),
+                         'down_blocks_kv_cache': torch.zeros(10, 1, 4, 2, self.flow_decoder_required_cache_size, 512, 2).to(self.device),
+                         'mid_blocks_conv_cache': torch.zeros(10, 12, 2, 512, 2).to(self.device),
+                         'mid_blocks_kv_cache': torch.zeros(10, 12, 4, 2, self.flow_decoder_required_cache_size, 512, 2).to(self.device),
+                         'up_blocks_conv_cache': torch.zeros(10, 1, 2, 1024, 2).to(self.device),
+                         'up_blocks_kv_cache': torch.zeros(10, 1, 4, 2, self.flow_decoder_required_cache_size, 512, 2).to(self.device),
+                         'final_blocks_conv_cache': torch.zeros(10, 2, 256, 2).to(self.device)}
+        if self.fp16 is True:
+            for cache in [encoder_cache, decoder_cache]:
+                for k, v in cache.items():
+                    if isinstance(v, torch.Tensor):
+                        cache[k] = v.half()
+        cache = {'encoder_cache': encoder_cache, 'decoder_cache': decoder_cache}
+        return cache
 
     def load_jit(self, flow_encoder_model):
         flow_encoder = torch.jit.load(flow_encoder_model, map_location=self.device)
         self.flow.encoder = flow_encoder
 
-    def …  [removed method, body truncated in the source view]
-        self.flow.decoder.estimator = onnxruntime.InferenceSession(flow_decoder_estimator_model, sess_options=option, providers=providers)
+    def get_trt_kwargs(self):
+        min_shape = [(2, 80, 4), (2, 1, 4), (2, 80, 4), (2, 80, 4), (1, 4, 2, 0, 512, 2), (12, 4, 2, 0, 512, 2), (1, 4, 2, 0, 512, 2)]
+        opt_shape = [(2, 80, 200), (2, 1, 200), (2, 80, 200), (2, 80, 200), (1, 4, 2, 100, 512, 2), (12, 4, 2, 100, 512, 2), (1, 4, 2, 100, 512, 2)]
+        max_shape = [(2, 80, 1500), (2, 1, 1500), (2, 80, 1500), (2, 80, 1500), (1, 4, 2, 200, 512, 2), (12, 4, 2, 200, 512, 2), (1, 4, 2, 200, 512, 2)]
+        input_names = ["x", "mask", "mu", "cond", 'down_blocks_kv_cache', 'mid_blocks_kv_cache', 'up_blocks_kv_cache']
+        assert self.use_flow_cache is True, "get_trt_kwargs is set for flow cache mode. If you want to use trt with use_flow_cache=False, please set higher max_shape"
+        return {'min_shape': min_shape, 'opt_shape': opt_shape, 'max_shape': max_shape, 'input_names': input_names}
 
-    def …  [removed method, body truncated in the source view]
-                                    text_len=torch.tensor([text.shape[1]], dtype=torch.int32).to(self.device),
-                                    prompt_text=prompt_text.to(self.device),
-                                    prompt_text_len=torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(self.device),
-                                    prompt_speech_token=llm_prompt_speech_token.to(self.device),
-                                    prompt_speech_token_len=torch.tensor([llm_prompt_speech_token.shape[1]], dtype=torch.int32).to(self.device),
-                                    embedding=llm_embedding.to(self.device)):
-            self.tts_speech_token_dict[uuid].append(i)
-        self.llm_end_dict[uuid] = True
-
-    def token2wav(self, token, prompt_token, prompt_feat, embedding, uuid, token_offset, finalize=False, speed=1.0):
-        tts_mel, _ = self.flow.inference(token=token.to(self.device),
-                                         token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device),
-                                         prompt_token=prompt_token.to(self.device),
-                                         prompt_token_len=torch.tensor([prompt_token.shape[1]], dtype=torch.int32).to(self.device),
-                                         prompt_feat=prompt_feat.to(self.device),
-                                         prompt_feat_len=torch.tensor([prompt_feat.shape[1]], dtype=torch.int32).to(self.device),
-                                         embedding=embedding.to(self.device),
-                                         finalize=finalize)
-        tts_mel = tts_mel[:, :, token_offset * self.flow.token_mel_ratio:]
+    def token2wav(self, token, prompt_token, prompt_feat, embedding, uuid, finalize=False, speed=1.0):
+        with torch.cuda.amp.autocast(self.fp16):
+            tts_mel, self.flow_cache_dict[uuid] = self.flow.inference(token=token.to(self.device),
+                                                                      token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device),
+                                                                      prompt_token=prompt_token.to(self.device),
+                                                                      prompt_token_len=torch.tensor([prompt_token.shape[1]], dtype=torch.int32).to(self.device),
+                                                                      prompt_feat=prompt_feat.to(self.device),
+                                                                      prompt_feat_len=torch.tensor([prompt_feat.shape[1]], dtype=torch.int32).to(self.device),
+                                                                      embedding=embedding.to(self.device),
+                                                                      cache=self.flow_cache_dict[uuid],
+                                                                      finalize=finalize)
         # append hift cache
         if self.hift_cache_dict[uuid] is not None:
            hift_cache_mel, hift_cache_source = self.hift_cache_dict[uuid]['mel'], self.hift_cache_dict[uuid]['source']
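Note that the separate vc() generator is removed from CosyVoiceModel in this hunk; voice conversion now goes through the same tts() entry point, which dispatches to vc_job when a non-empty source_speech_token is supplied. A hedged sketch of the new call shape (the tensors are placeholders, not values from this diff):

    # Sketch only: source_speech_token, prompts and embeddings are placeholder tensors.
    for out in model.tts(source_speech_token=source_speech_token,
                         flow_prompt_speech_token=flow_prompt_speech_token,
                         prompt_speech_feat=prompt_speech_feat,
                         flow_embedding=flow_embedding,
                         stream=False):
        wav = out['tts_speech']  # same output dict as the synthesis path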
@@ -365,34 +338,44 @@ class CosyVoice2Model:
         tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window)
         return tts_speech
 
-    def tts(self, text, flow_embedding, llm_embedding=torch.zeros(0, 192),
+    def tts(self, text=torch.zeros(1, 0, dtype=torch.int32), flow_embedding=torch.zeros(0, 192), llm_embedding=torch.zeros(0, 192),
             prompt_text=torch.zeros(1, 0, dtype=torch.int32),
             llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32),
             flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32),
-            prompt_speech_feat=torch.zeros(1, 0, 80), stream=False, speed=1.0, **kwargs):
+            prompt_speech_feat=torch.zeros(1, 0, 80), source_speech_token=torch.zeros(1, 0, dtype=torch.int32), stream=False, speed=1.0, **kwargs):
         # this_uuid is used to track variables related to this inference thread
         this_uuid = str(uuid.uuid1())
         with self.lock:
             self.tts_speech_token_dict[this_uuid], self.llm_end_dict[this_uuid] = [], False
             self.hift_cache_dict[this_uuid] = None
-            …  [removed line truncated in the source view]
+            self.flow_cache_dict[this_uuid] = self.init_flow_cache()
+        if source_speech_token.shape[1] == 0:
+            p = threading.Thread(target=self.llm_job, args=(text, prompt_text, llm_prompt_speech_token, llm_embedding, this_uuid))
+        else:
+            p = threading.Thread(target=self.vc_job, args=(source_speech_token, this_uuid))
         p.start()
         if stream is True:
-            …  [removed line truncated in the source view]
+            assert self.use_flow_cache is True, "set use_flow_cache=True if you want to use stream inference to avoid OOM"
+            # NOTE in cache mode, trim flow_prompt to same size as flow_decoder_required_cache_size
+            flow_prompt_speech_token = flow_prompt_speech_token[:, -int(self.flow_decoder_required_cache_size / self.flow.token_mel_ratio):]
+            prompt_speech_feat = prompt_speech_feat[:, -self.flow_decoder_required_cache_size:]
             while True:
                 time.sleep(0.1)
-                if len(self.tts_speech_token_dict[this_uuid]) …  [truncated in the source view]
-                    this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][: …  [truncated in the source view]
+                if len(self.tts_speech_token_dict[this_uuid]) >= self.token_hop_len + self.flow.pre_lookahead_len:
+                    this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:self.token_hop_len + self.flow.pre_lookahead_len]).unsqueeze(dim=0)
                     this_tts_speech = self.token2wav(token=this_tts_speech_token,
                                                      prompt_token=flow_prompt_speech_token,
                                                      prompt_feat=prompt_speech_feat,
                                                      embedding=flow_embedding,
                                                      uuid=this_uuid,
-                                                     token_offset=token_offset,
                                                      finalize=False)
-                    …  [removed line truncated in the source view]
+                    # NOTE in cache inference mode, we only use flow_prompt_speech_token/prompt_speech_feat in first chunk
+                    flow_prompt_speech_token = torch.zeros(1, 0, dtype=torch.int32).to(self.device)
+                    prompt_speech_feat = torch.zeros(1, 0, 80).to(self.device)
                     yield {'tts_speech': this_tts_speech.cpu()}
-                …  [removed line truncated in the source view]
+                    with self.lock:
+                        self.tts_speech_token_dict[this_uuid] = self.tts_speech_token_dict[this_uuid][self.token_hop_len:]
+                if self.llm_end_dict[this_uuid] is True and len(self.tts_speech_token_dict[this_uuid]) < self.token_hop_len + self.flow.pre_lookahead_len:
                     break
             p.join()
             # deal with remain tokens, make sure inference remain token len equals token_hop_len when cache_speech is not None
@@ -402,11 +385,11 @@ class CosyVoice2Model:
                                              prompt_feat=prompt_speech_feat,
                                              embedding=flow_embedding,
                                              uuid=this_uuid,
-                                             token_offset=token_offset,
                                              finalize=True)
             yield {'tts_speech': this_tts_speech.cpu()}
         else:
             # deal with all tokens
+            assert self.use_flow_cache is False, "set use_flow_cache=False for nonstream inference"
             p.join()
             this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
             this_tts_speech = self.token2wav(token=this_tts_speech_token,
@@ -414,10 +397,12 @@ class CosyVoice2Model:
                                              prompt_feat=prompt_speech_feat,
                                              embedding=flow_embedding,
                                              uuid=this_uuid,
-                                             token_offset=0,
                                              finalize=True,
                                              speed=speed)
             yield {'tts_speech': this_tts_speech.cpu()}
         with self.lock:
             self.tts_speech_token_dict.pop(this_uuid)
             self.llm_end_dict.pop(this_uuid)
+            self.hift_cache_dict.pop(this_uuid)
+            self.flow_cache_dict.pop(this_uuid)
+        torch.cuda.empty_cache()
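With these changes, streaming in CosyVoice2Model requires the new flow-cache mode (the added assert rejects stream=True unless use_flow_cache=True) and advances by a fixed token_hop_len of 25 instead of the removed stream_scale_factor schedule. A rough usage sketch under those assumptions; the module and tensor inputs are placeholders, not values taken from this diff:

    # Sketch only: llm/flow/hift and all input tensors are placeholders.
    model = CosyVoice2Model(llm, flow, hift, fp16=False, use_flow_cache=True)
    chunks = []
    for out in model.tts(text=text_tokens,
                         prompt_text=prompt_text_tokens,
                         llm_prompt_speech_token=prompt_speech_tokens,
                         flow_prompt_speech_token=prompt_speech_tokens,
                         prompt_speech_feat=prompt_feat,
                         flow_embedding=spk_embedding,
                         llm_embedding=spk_embedding,
                         stream=True):
        chunks.append(out['tts_speech'])  # one waveform chunk per hop of speech tokens
    speech = torch.cat(chunks, dim=1)     # concatenate the streamed chunks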
The hunks below are from xinference/thirdparty/cosyvoice/dataset/processor.py (compute_fbank / compute_f0):

@@ -20,8 +20,8 @@ import torch
 import torchaudio
 from torch.nn.utils.rnn import pad_sequence
 import torch.nn.functional as F
+import pyworld as pw
 
-torchaudio.set_audio_backend('soundfile')
 
 AUDIO_FORMAT_SETS = {'flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma'}
 
@@ -179,7 +179,7 @@ def compute_fbank(data,
         yield sample
 
 
-def compute_f0(data, pitch_extractor, mode='train'):
+def compute_f0(data, sample_rate, hop_size, mode='train'):
     """ Extract f0
 
     Args:
@@ -188,15 +188,19 @@ def compute_f0(data, pitch_extractor, mode='train'):
     Returns:
         Iterable[{key, feat, label}]
     """
+    frame_period = hop_size * 1000 / sample_rate
     for sample in data:
         assert 'sample_rate' in sample
         assert 'speech' in sample
         assert 'utt' in sample
         assert 'text_token' in sample
         waveform = sample['speech']
-        …  [three removed lines truncated in the source view]
+        _f0, t = pw.harvest(waveform.squeeze(dim=0).numpy().astype('double'), sample_rate, frame_period=frame_period)
+        if sum(_f0 != 0) < 5: # this happens when the algorithm fails
+            _f0, t = pw.dio(waveform.squeeze(dim=0).numpy().astype('double'), sample_rate, frame_period=frame_period)  # if harvest fails, try dio
+        f0 = pw.stonemask(waveform.squeeze(dim=0).numpy().astype('double'), _f0, t, sample_rate)
+        f0 = F.interpolate(torch.from_numpy(f0).view(1, 1, -1), size=sample['speech_feat'].shape[0], mode='linear').view(-1)
+        sample['pitch_feat'] = f0
         yield sample
 
 
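The rewritten compute_f0 drops the injected pitch_extractor and computes F0 directly with pyworld: harvest first, dio as a fallback when harvest finds fewer than five voiced frames, then stonemask refinement and linear interpolation onto the mel-frame grid. A standalone sketch of that pipeline under assumed example values (the sample rate, hop size, and signal below are placeholders, not values from this diff):

    # Sketch only: sample_rate/hop_size/wav are illustrative placeholders.
    import numpy as np
    import pyworld as pw

    sample_rate, hop_size = 16000, 160                      # example values
    wav = np.random.randn(sample_rate).astype('double')     # placeholder 1-second signal

    frame_period = hop_size * 1000 / sample_rate            # ms per analysis frame, as in compute_f0
    _f0, t = pw.harvest(wav, sample_rate, frame_period=frame_period)
    if np.sum(_f0 != 0) < 5:                                # harvest found almost nothing voiced
        _f0, t = pw.dio(wav, sample_rate, frame_period=frame_period)  # fall back to dio
    f0 = pw.stonemask(wav, _f0, t, sample_rate)             # refine the coarse F0 estimate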