xinference 1.5.0.post2__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xinference might be problematic. Click here for more details.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +107 -11
- xinference/client/restful/restful_client.py +51 -11
- xinference/constants.py +5 -1
- xinference/core/media_interface.py +758 -0
- xinference/core/model.py +49 -9
- xinference/core/supervisor.py +1 -1
- xinference/core/utils.py +1 -1
- xinference/core/worker.py +33 -39
- xinference/deploy/cmdline.py +17 -0
- xinference/deploy/utils.py +0 -3
- xinference/model/audio/__init__.py +16 -27
- xinference/model/audio/core.py +2 -1
- xinference/model/audio/cosyvoice.py +4 -2
- xinference/model/audio/model_spec.json +63 -46
- xinference/model/audio/model_spec_modelscope.json +31 -14
- xinference/model/embedding/__init__.py +16 -24
- xinference/model/image/__init__.py +15 -25
- xinference/model/llm/__init__.py +40 -115
- xinference/model/llm/core.py +29 -6
- xinference/model/llm/llama_cpp/core.py +30 -347
- xinference/model/llm/llm_family.json +1674 -2203
- xinference/model/llm/llm_family.py +71 -7
- xinference/model/llm/llm_family_csghub.json +0 -32
- xinference/model/llm/llm_family_modelscope.json +1838 -2016
- xinference/model/llm/llm_family_openmind_hub.json +19 -325
- xinference/model/llm/lmdeploy/core.py +7 -2
- xinference/model/llm/mlx/core.py +23 -7
- xinference/model/llm/reasoning_parser.py +281 -5
- xinference/model/llm/sglang/core.py +39 -11
- xinference/model/llm/transformers/chatglm.py +9 -2
- xinference/model/llm/transformers/cogagent.py +10 -12
- xinference/model/llm/transformers/cogvlm2.py +6 -3
- xinference/model/llm/transformers/cogvlm2_video.py +3 -6
- xinference/model/llm/transformers/core.py +58 -60
- xinference/model/llm/transformers/deepseek_v2.py +4 -2
- xinference/model/llm/transformers/deepseek_vl.py +10 -4
- xinference/model/llm/transformers/deepseek_vl2.py +9 -4
- xinference/model/llm/transformers/gemma3.py +4 -5
- xinference/model/llm/transformers/glm4v.py +3 -21
- xinference/model/llm/transformers/glm_edge_v.py +3 -20
- xinference/model/llm/transformers/intern_vl.py +3 -6
- xinference/model/llm/transformers/internlm2.py +1 -1
- xinference/model/llm/transformers/minicpmv25.py +4 -2
- xinference/model/llm/transformers/minicpmv26.py +5 -3
- xinference/model/llm/transformers/omnilmm.py +1 -1
- xinference/model/llm/transformers/opt.py +1 -1
- xinference/model/llm/transformers/ovis2.py +302 -0
- xinference/model/llm/transformers/qwen-omni.py +8 -1
- xinference/model/llm/transformers/qwen2_audio.py +3 -1
- xinference/model/llm/transformers/qwen2_vl.py +5 -1
- xinference/model/llm/transformers/qwen_vl.py +5 -2
- xinference/model/llm/utils.py +96 -45
- xinference/model/llm/vllm/core.py +108 -24
- xinference/model/llm/vllm/distributed_executor.py +8 -7
- xinference/model/llm/vllm/xavier/allocator.py +1 -1
- xinference/model/llm/vllm/xavier/block_manager.py +1 -1
- xinference/model/llm/vllm/xavier/block_tracker.py +3 -3
- xinference/model/llm/vllm/xavier/executor.py +1 -1
- xinference/model/llm/vllm/xavier/test/test_xavier.py +2 -11
- xinference/model/rerank/__init__.py +13 -24
- xinference/model/video/__init__.py +15 -25
- xinference/model/video/core.py +3 -3
- xinference/model/video/diffusers.py +157 -13
- xinference/model/video/model_spec.json +100 -0
- xinference/model/video/model_spec_modelscope.json +104 -0
- xinference/thirdparty/cosyvoice/bin/average_model.py +5 -4
- xinference/thirdparty/cosyvoice/bin/export_jit.py +50 -20
- xinference/thirdparty/cosyvoice/bin/export_onnx.py +136 -51
- xinference/thirdparty/cosyvoice/bin/inference.py +15 -5
- xinference/thirdparty/cosyvoice/bin/train.py +7 -2
- xinference/thirdparty/cosyvoice/cli/cosyvoice.py +72 -52
- xinference/thirdparty/cosyvoice/cli/frontend.py +58 -58
- xinference/thirdparty/cosyvoice/cli/model.py +140 -155
- xinference/thirdparty/cosyvoice/dataset/processor.py +9 -5
- xinference/thirdparty/cosyvoice/flow/decoder.py +656 -54
- xinference/thirdparty/cosyvoice/flow/flow.py +69 -11
- xinference/thirdparty/cosyvoice/flow/flow_matching.py +167 -63
- xinference/thirdparty/cosyvoice/flow/length_regulator.py +1 -0
- xinference/thirdparty/cosyvoice/hifigan/discriminator.py +91 -1
- xinference/thirdparty/cosyvoice/hifigan/f0_predictor.py +4 -1
- xinference/thirdparty/cosyvoice/hifigan/generator.py +4 -1
- xinference/thirdparty/cosyvoice/hifigan/hifigan.py +2 -2
- xinference/thirdparty/cosyvoice/llm/llm.py +198 -18
- xinference/thirdparty/cosyvoice/transformer/embedding.py +12 -4
- xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +124 -21
- xinference/thirdparty/cosyvoice/utils/class_utils.py +13 -0
- xinference/thirdparty/cosyvoice/utils/common.py +1 -1
- xinference/thirdparty/cosyvoice/utils/file_utils.py +40 -2
- xinference/thirdparty/cosyvoice/utils/frontend_utils.py +7 -0
- xinference/thirdparty/cosyvoice/utils/mask.py +4 -0
- xinference/thirdparty/cosyvoice/utils/train_utils.py +5 -1
- xinference/thirdparty/matcha/hifigan/xutils.py +3 -3
- xinference/types.py +2 -71
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/{main.0f6523be.css → main.337afe76.css} +2 -2
- xinference/web/ui/build/static/css/main.337afe76.css.map +1 -0
- xinference/web/ui/build/static/js/main.ae579a97.js +3 -0
- xinference/web/ui/build/static/js/main.ae579a97.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0196a4b09e3264614e54360d5f832c46b31d964ec58296765ebff191ace6adbf.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/12e02ee790dbf57ead09a241a93bb5f893393aa36628ca741d44390e836a103f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/18fa271456b31cded36c05c4c71c6b2b1cf4e4128c1e32f0e45d8b9f21764397.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5c08e2cd07809ed3e41486b16652253404cbb63a3ff8d0366ee50f57e2413cea.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/6798e126f3bc5f95a4c16a9c2ad52ffe77970c62406d83e20604dfda7ffd2247.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b617f7d21a95045fc57b26a9373551740f1978a826134cbf705c3a1bf8714a93.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/c1506cb142151366074975f30fa1ff9cd6e5e978b62a4b074dfc16fe08d70d75.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/c5c7c2cd1b863ce41adff2c4737bba06eef3a1acf28288cb83d992060f6b8923.json +1 -0
- xinference/web/ui/src/locales/en.json +7 -4
- xinference/web/ui/src/locales/zh.json +7 -4
- {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/METADATA +56 -36
- {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/RECORD +120 -121
- {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/WHEEL +1 -1
- xinference/core/image_interface.py +0 -377
- xinference/model/llm/transformers/compression.py +0 -258
- xinference/model/llm/transformers/yi_vl.py +0 -239
- xinference/thirdparty/cosyvoice/bin/export_trt.sh +0 -9
- xinference/web/ui/build/static/css/main.0f6523be.css.map +0 -1
- xinference/web/ui/build/static/js/main.4b67a723.js +0 -3
- xinference/web/ui/build/static/js/main.4b67a723.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0f0adb2283a8f469d097a7a0ebb754624fa52414c83b83696c41f2e6a737ceda.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/51709f5d3e53bcf19e613662ef9b91fb9174942c5518987a248348dd4e1e0e02.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8157db83995c671eb57abc316c337f867d1dc63fb83520bb4ff351fee57dcce2.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8f9af2979e45d4648f0cfae108363e58ee421c29a9d4e7329b6f06d9adfd4133.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/9c8b1a86e7c65b2b2599a205e30920652d6c2105f926508ef5bcf29a3ef4ce76.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/b8551e9775a01b28ae674125c688febe763732ea969ae344512e64ea01bf632e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e4ba658c6b3b0490910acdae0c535a892257efb61539a24adf8038fc653bd22f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/efe7cd132c27a8f9fd5352a394c491fd5fb0da0348cf9fcbd923164a32365eab.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f04f666b77b44d7be3e16034d6b0074de2ba9c254f1fae15222b3148608fa8b3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f199e8173f6409a5802ed44acb95f218388131136504b2e9132129e150c92f9a.json +0 -1
- xinference/web/ui/build/static/js/{main.4b67a723.js.LICENSE.txt → main.ae579a97.js.LICENSE.txt} +0 -0
- {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/top_level.txt +0 -0
|
@@ -12,6 +12,7 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
from functools import partial
|
|
15
|
+
from typing import Generator
|
|
15
16
|
import json
|
|
16
17
|
import onnxruntime
|
|
17
18
|
import torch
|
|
@@ -31,7 +32,8 @@ except ImportError:
|
|
|
31
32
|
from tn.chinese.normalizer import Normalizer as ZhNormalizer
|
|
32
33
|
from tn.english.normalizer import Normalizer as EnNormalizer
|
|
33
34
|
use_ttsfrd = False
|
|
34
|
-
from cosyvoice.utils.
|
|
35
|
+
from cosyvoice.utils.file_utils import logging
|
|
36
|
+
from cosyvoice.utils.frontend_utils import contains_chinese, replace_blank, replace_corner_mark, remove_bracket, spell_out_number, split_paragraph, is_only_punctuation
|
|
35
37
|
|
|
36
38
|
|
|
37
39
|
class CosyVoiceFrontEnd:
|
|
@@ -42,7 +44,6 @@ class CosyVoiceFrontEnd:
|
|
|
42
44
|
campplus_model: str,
|
|
43
45
|
speech_tokenizer_model: str,
|
|
44
46
|
spk2info: str = '',
|
|
45
|
-
instruct: bool = False,
|
|
46
47
|
allowed_special: str = 'all'):
|
|
47
48
|
self.tokenizer = get_tokenizer()
|
|
48
49
|
self.feat_extractor = feat_extractor
|
|
@@ -58,9 +59,7 @@ class CosyVoiceFrontEnd:
|
|
|
58
59
|
self.spk2info = torch.load(spk2info, map_location=self.device)
|
|
59
60
|
else:
|
|
60
61
|
self.spk2info = {}
|
|
61
|
-
self.instruct = instruct
|
|
62
62
|
self.allowed_special = allowed_special
|
|
63
|
-
self.inflect_parser = inflect.engine()
|
|
64
63
|
self.use_ttsfrd = use_ttsfrd
|
|
65
64
|
if self.use_ttsfrd:
|
|
66
65
|
self.frd = ttsfrd.TtsFrontendEngine()
|
|
@@ -69,14 +68,26 @@ class CosyVoiceFrontEnd:
|
|
|
69
68
|
'failed to initialize ttsfrd resource'
|
|
70
69
|
self.frd.set_lang_type('pinyinvg')
|
|
71
70
|
else:
|
|
72
|
-
self.zh_tn_model = ZhNormalizer(remove_erhua=False, full_to_half=False)
|
|
71
|
+
self.zh_tn_model = ZhNormalizer(remove_erhua=False, full_to_half=False, overwrite_cache=True)
|
|
73
72
|
self.en_tn_model = EnNormalizer()
|
|
73
|
+
self.inflect_parser = inflect.engine()
|
|
74
74
|
|
|
75
75
|
def _extract_text_token(self, text):
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
76
|
+
if isinstance(text, Generator):
|
|
77
|
+
logging.info('get tts_text generator, will return _extract_text_token_generator!')
|
|
78
|
+
# NOTE add a dummy text_token_len for compatibility
|
|
79
|
+
return self._extract_text_token_generator(text), torch.tensor([0], dtype=torch.int32).to(self.device)
|
|
80
|
+
else:
|
|
81
|
+
text_token = self.tokenizer.encode(text, allowed_special=self.allowed_special)
|
|
82
|
+
text_token = torch.tensor([text_token], dtype=torch.int32).to(self.device)
|
|
83
|
+
text_token_len = torch.tensor([text_token.shape[1]], dtype=torch.int32).to(self.device)
|
|
84
|
+
return text_token, text_token_len
|
|
85
|
+
|
|
86
|
+
def _extract_text_token_generator(self, text_generator):
|
|
87
|
+
for text in text_generator:
|
|
88
|
+
text_token, _ = self._extract_text_token(text)
|
|
89
|
+
for i in range(text_token.shape[1]):
|
|
90
|
+
yield text_token[:, i: i + 1]
|
|
80
91
|
|
|
81
92
|
def _extract_speech_token(self, speech):
|
|
82
93
|
assert speech.shape[1] / 16000 <= 30, 'do not support extract speech token for audio longer than 30s'
|
|
@@ -108,14 +119,17 @@ class CosyVoiceFrontEnd:
|
|
|
108
119
|
return speech_feat, speech_feat_len
|
|
109
120
|
|
|
110
121
|
def text_normalize(self, text, split=True, text_frontend=True):
|
|
111
|
-
if
|
|
122
|
+
if isinstance(text, Generator):
|
|
123
|
+
logging.info('get tts_text generator, will skip text_normalize!')
|
|
124
|
+
return [text]
|
|
125
|
+
if text_frontend is False or text == '':
|
|
112
126
|
return [text] if split is True else text
|
|
113
127
|
text = text.strip()
|
|
114
|
-
if
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
128
|
+
if self.use_ttsfrd:
|
|
129
|
+
texts = [i["text"] for i in json.loads(self.frd.do_voicegen_frd(text))["sentences"]]
|
|
130
|
+
text = ''.join(texts)
|
|
131
|
+
else:
|
|
132
|
+
if contains_chinese(text):
|
|
119
133
|
text = self.zh_tn_model.normalize(text)
|
|
120
134
|
text = text.replace("\n", "")
|
|
121
135
|
text = replace_blank(text)
|
|
@@ -126,18 +140,13 @@ class CosyVoiceFrontEnd:
|
|
|
126
140
|
text = re.sub(r'[,,、]+$', '。', text)
|
|
127
141
|
texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "zh", token_max_n=80,
|
|
128
142
|
token_min_n=60, merge_len=20, comma_split=False))
|
|
129
|
-
else:
|
|
130
|
-
if self.use_ttsfrd:
|
|
131
|
-
texts = [i["text"] for i in json.loads(self.frd.do_voicegen_frd(text))["sentences"]]
|
|
132
|
-
text = ''.join(texts)
|
|
133
143
|
else:
|
|
134
144
|
text = self.en_tn_model.normalize(text)
|
|
135
145
|
text = spell_out_number(text, self.inflect_parser)
|
|
136
146
|
texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "en", token_max_n=80,
|
|
137
147
|
token_min_n=60, merge_len=20, comma_split=False))
|
|
138
|
-
if
|
|
139
|
-
|
|
140
|
-
return texts
|
|
148
|
+
texts = [i for i in texts if not is_only_punctuation(i)]
|
|
149
|
+
return texts if split is True else text
|
|
141
150
|
|
|
142
151
|
def frontend_sft(self, tts_text, spk_id):
|
|
143
152
|
tts_text_token, tts_text_token_len = self._extract_text_token(tts_text)
|
|
@@ -145,28 +154,32 @@ class CosyVoiceFrontEnd:
|
|
|
145
154
|
model_input = {'text': tts_text_token, 'text_len': tts_text_token_len, 'llm_embedding': embedding, 'flow_embedding': embedding}
|
|
146
155
|
return model_input
|
|
147
156
|
|
|
148
|
-
def frontend_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, resample_rate):
|
|
157
|
+
def frontend_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
|
|
149
158
|
tts_text_token, tts_text_token_len = self._extract_text_token(tts_text)
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
159
|
+
if zero_shot_spk_id == '':
|
|
160
|
+
prompt_text_token, prompt_text_token_len = self._extract_text_token(prompt_text)
|
|
161
|
+
prompt_speech_resample = torchaudio.transforms.Resample(orig_freq=16000, new_freq=resample_rate)(prompt_speech_16k)
|
|
162
|
+
speech_feat, speech_feat_len = self._extract_speech_feat(prompt_speech_resample)
|
|
163
|
+
speech_token, speech_token_len = self._extract_speech_token(prompt_speech_16k)
|
|
164
|
+
if resample_rate == 24000:
|
|
165
|
+
# cosyvoice2, force speech_feat % speech_token = 2
|
|
166
|
+
token_len = min(int(speech_feat.shape[1] / 2), speech_token.shape[1])
|
|
167
|
+
speech_feat, speech_feat_len[:] = speech_feat[:, :2 * token_len], 2 * token_len
|
|
168
|
+
speech_token, speech_token_len[:] = speech_token[:, :token_len], token_len
|
|
169
|
+
embedding = self._extract_spk_embedding(prompt_speech_16k)
|
|
170
|
+
model_input = {'prompt_text': prompt_text_token, 'prompt_text_len': prompt_text_token_len,
|
|
171
|
+
'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len,
|
|
172
|
+
'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
|
|
173
|
+
'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
|
|
174
|
+
'llm_embedding': embedding, 'flow_embedding': embedding}
|
|
175
|
+
else:
|
|
176
|
+
model_input = self.spk2info[zero_shot_spk_id]
|
|
177
|
+
model_input['text'] = tts_text_token
|
|
178
|
+
model_input['text_len'] = tts_text_token_len
|
|
166
179
|
return model_input
|
|
167
180
|
|
|
168
|
-
def frontend_cross_lingual(self, tts_text, prompt_speech_16k, resample_rate):
|
|
169
|
-
model_input = self.frontend_zero_shot(tts_text, '', prompt_speech_16k, resample_rate)
|
|
181
|
+
def frontend_cross_lingual(self, tts_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
|
|
182
|
+
model_input = self.frontend_zero_shot(tts_text, '', prompt_speech_16k, resample_rate, zero_shot_spk_id)
|
|
170
183
|
# in cross lingual mode, we remove prompt in llm
|
|
171
184
|
del model_input['prompt_text']
|
|
172
185
|
del model_input['prompt_text_len']
|
|
@@ -183,23 +196,10 @@ class CosyVoiceFrontEnd:
|
|
|
183
196
|
model_input['prompt_text_len'] = instruct_text_token_len
|
|
184
197
|
return model_input
|
|
185
198
|
|
|
186
|
-
def frontend_instruct2(self, tts_text, instruct_text, prompt_speech_16k, resample_rate):
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
speech_feat, speech_feat_len = self._extract_speech_feat(prompt_speech_resample)
|
|
191
|
-
speech_token, speech_token_len = self._extract_speech_token(prompt_speech_16k)
|
|
192
|
-
if resample_rate == 24000:
|
|
193
|
-
# cosyvoice2, force speech_feat % speech_token = 2
|
|
194
|
-
token_len = min(int(speech_feat.shape[1] / 2), speech_token.shape[1])
|
|
195
|
-
speech_feat, speech_feat_len[:] = speech_feat[:, :2 * token_len], 2 * token_len
|
|
196
|
-
speech_token, speech_token_len[:] = speech_token[:, :token_len], token_len
|
|
197
|
-
embedding = self._extract_spk_embedding(prompt_speech_16k)
|
|
198
|
-
model_input = {'text': tts_text_token, 'text_len': tts_text_token_len,
|
|
199
|
-
'prompt_text': prompt_text_token, 'prompt_text_len': prompt_text_token_len,
|
|
200
|
-
'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
|
|
201
|
-
'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
|
|
202
|
-
'llm_embedding': embedding, 'flow_embedding': embedding}
|
|
199
|
+
def frontend_instruct2(self, tts_text, instruct_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
|
|
200
|
+
model_input = self.frontend_zero_shot(tts_text, instruct_text + '<|endofprompt|>', prompt_speech_16k, resample_rate, zero_shot_spk_id)
|
|
201
|
+
del model_input['llm_prompt_speech_token']
|
|
202
|
+
del model_input['llm_prompt_speech_token_len']
|
|
203
203
|
return model_input
|
|
204
204
|
|
|
205
205
|
def frontend_vc(self, source_speech_16k, prompt_speech_16k, resample_rate):
|