xinference 1.5.0.post2__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xinference might be problematic. Click here for more details.

Files changed (137) hide show
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +107 -11
  3. xinference/client/restful/restful_client.py +51 -11
  4. xinference/constants.py +5 -1
  5. xinference/core/media_interface.py +758 -0
  6. xinference/core/model.py +49 -9
  7. xinference/core/supervisor.py +1 -1
  8. xinference/core/utils.py +1 -1
  9. xinference/core/worker.py +33 -39
  10. xinference/deploy/cmdline.py +17 -0
  11. xinference/deploy/utils.py +0 -3
  12. xinference/model/audio/__init__.py +16 -27
  13. xinference/model/audio/core.py +2 -1
  14. xinference/model/audio/cosyvoice.py +4 -2
  15. xinference/model/audio/model_spec.json +63 -46
  16. xinference/model/audio/model_spec_modelscope.json +31 -14
  17. xinference/model/embedding/__init__.py +16 -24
  18. xinference/model/image/__init__.py +15 -25
  19. xinference/model/llm/__init__.py +40 -115
  20. xinference/model/llm/core.py +29 -6
  21. xinference/model/llm/llama_cpp/core.py +30 -347
  22. xinference/model/llm/llm_family.json +1674 -2203
  23. xinference/model/llm/llm_family.py +71 -7
  24. xinference/model/llm/llm_family_csghub.json +0 -32
  25. xinference/model/llm/llm_family_modelscope.json +1838 -2016
  26. xinference/model/llm/llm_family_openmind_hub.json +19 -325
  27. xinference/model/llm/lmdeploy/core.py +7 -2
  28. xinference/model/llm/mlx/core.py +23 -7
  29. xinference/model/llm/reasoning_parser.py +281 -5
  30. xinference/model/llm/sglang/core.py +39 -11
  31. xinference/model/llm/transformers/chatglm.py +9 -2
  32. xinference/model/llm/transformers/cogagent.py +10 -12
  33. xinference/model/llm/transformers/cogvlm2.py +6 -3
  34. xinference/model/llm/transformers/cogvlm2_video.py +3 -6
  35. xinference/model/llm/transformers/core.py +58 -60
  36. xinference/model/llm/transformers/deepseek_v2.py +4 -2
  37. xinference/model/llm/transformers/deepseek_vl.py +10 -4
  38. xinference/model/llm/transformers/deepseek_vl2.py +9 -4
  39. xinference/model/llm/transformers/gemma3.py +4 -5
  40. xinference/model/llm/transformers/glm4v.py +3 -21
  41. xinference/model/llm/transformers/glm_edge_v.py +3 -20
  42. xinference/model/llm/transformers/intern_vl.py +3 -6
  43. xinference/model/llm/transformers/internlm2.py +1 -1
  44. xinference/model/llm/transformers/minicpmv25.py +4 -2
  45. xinference/model/llm/transformers/minicpmv26.py +5 -3
  46. xinference/model/llm/transformers/omnilmm.py +1 -1
  47. xinference/model/llm/transformers/opt.py +1 -1
  48. xinference/model/llm/transformers/ovis2.py +302 -0
  49. xinference/model/llm/transformers/qwen-omni.py +8 -1
  50. xinference/model/llm/transformers/qwen2_audio.py +3 -1
  51. xinference/model/llm/transformers/qwen2_vl.py +5 -1
  52. xinference/model/llm/transformers/qwen_vl.py +5 -2
  53. xinference/model/llm/utils.py +96 -45
  54. xinference/model/llm/vllm/core.py +108 -24
  55. xinference/model/llm/vllm/distributed_executor.py +8 -7
  56. xinference/model/llm/vllm/xavier/allocator.py +1 -1
  57. xinference/model/llm/vllm/xavier/block_manager.py +1 -1
  58. xinference/model/llm/vllm/xavier/block_tracker.py +3 -3
  59. xinference/model/llm/vllm/xavier/executor.py +1 -1
  60. xinference/model/llm/vllm/xavier/test/test_xavier.py +2 -11
  61. xinference/model/rerank/__init__.py +13 -24
  62. xinference/model/video/__init__.py +15 -25
  63. xinference/model/video/core.py +3 -3
  64. xinference/model/video/diffusers.py +157 -13
  65. xinference/model/video/model_spec.json +100 -0
  66. xinference/model/video/model_spec_modelscope.json +104 -0
  67. xinference/thirdparty/cosyvoice/bin/average_model.py +5 -4
  68. xinference/thirdparty/cosyvoice/bin/export_jit.py +50 -20
  69. xinference/thirdparty/cosyvoice/bin/export_onnx.py +136 -51
  70. xinference/thirdparty/cosyvoice/bin/inference.py +15 -5
  71. xinference/thirdparty/cosyvoice/bin/train.py +7 -2
  72. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +72 -52
  73. xinference/thirdparty/cosyvoice/cli/frontend.py +58 -58
  74. xinference/thirdparty/cosyvoice/cli/model.py +140 -155
  75. xinference/thirdparty/cosyvoice/dataset/processor.py +9 -5
  76. xinference/thirdparty/cosyvoice/flow/decoder.py +656 -54
  77. xinference/thirdparty/cosyvoice/flow/flow.py +69 -11
  78. xinference/thirdparty/cosyvoice/flow/flow_matching.py +167 -63
  79. xinference/thirdparty/cosyvoice/flow/length_regulator.py +1 -0
  80. xinference/thirdparty/cosyvoice/hifigan/discriminator.py +91 -1
  81. xinference/thirdparty/cosyvoice/hifigan/f0_predictor.py +4 -1
  82. xinference/thirdparty/cosyvoice/hifigan/generator.py +4 -1
  83. xinference/thirdparty/cosyvoice/hifigan/hifigan.py +2 -2
  84. xinference/thirdparty/cosyvoice/llm/llm.py +198 -18
  85. xinference/thirdparty/cosyvoice/transformer/embedding.py +12 -4
  86. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +124 -21
  87. xinference/thirdparty/cosyvoice/utils/class_utils.py +13 -0
  88. xinference/thirdparty/cosyvoice/utils/common.py +1 -1
  89. xinference/thirdparty/cosyvoice/utils/file_utils.py +40 -2
  90. xinference/thirdparty/cosyvoice/utils/frontend_utils.py +7 -0
  91. xinference/thirdparty/cosyvoice/utils/mask.py +4 -0
  92. xinference/thirdparty/cosyvoice/utils/train_utils.py +5 -1
  93. xinference/thirdparty/matcha/hifigan/xutils.py +3 -3
  94. xinference/types.py +2 -71
  95. xinference/web/ui/build/asset-manifest.json +6 -6
  96. xinference/web/ui/build/index.html +1 -1
  97. xinference/web/ui/build/static/css/{main.0f6523be.css → main.337afe76.css} +2 -2
  98. xinference/web/ui/build/static/css/main.337afe76.css.map +1 -0
  99. xinference/web/ui/build/static/js/main.ae579a97.js +3 -0
  100. xinference/web/ui/build/static/js/main.ae579a97.js.map +1 -0
  101. xinference/web/ui/node_modules/.cache/babel-loader/0196a4b09e3264614e54360d5f832c46b31d964ec58296765ebff191ace6adbf.json +1 -0
  102. xinference/web/ui/node_modules/.cache/babel-loader/12e02ee790dbf57ead09a241a93bb5f893393aa36628ca741d44390e836a103f.json +1 -0
  103. xinference/web/ui/node_modules/.cache/babel-loader/18fa271456b31cded36c05c4c71c6b2b1cf4e4128c1e32f0e45d8b9f21764397.json +1 -0
  104. xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +1 -0
  105. xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +1 -0
  106. xinference/web/ui/node_modules/.cache/babel-loader/5c08e2cd07809ed3e41486b16652253404cbb63a3ff8d0366ee50f57e2413cea.json +1 -0
  107. xinference/web/ui/node_modules/.cache/babel-loader/6798e126f3bc5f95a4c16a9c2ad52ffe77970c62406d83e20604dfda7ffd2247.json +1 -0
  108. xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +1 -0
  109. xinference/web/ui/node_modules/.cache/babel-loader/b617f7d21a95045fc57b26a9373551740f1978a826134cbf705c3a1bf8714a93.json +1 -0
  110. xinference/web/ui/node_modules/.cache/babel-loader/c1506cb142151366074975f30fa1ff9cd6e5e978b62a4b074dfc16fe08d70d75.json +1 -0
  111. xinference/web/ui/node_modules/.cache/babel-loader/c5c7c2cd1b863ce41adff2c4737bba06eef3a1acf28288cb83d992060f6b8923.json +1 -0
  112. xinference/web/ui/src/locales/en.json +7 -4
  113. xinference/web/ui/src/locales/zh.json +7 -4
  114. {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/METADATA +56 -36
  115. {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/RECORD +120 -121
  116. {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/WHEEL +1 -1
  117. xinference/core/image_interface.py +0 -377
  118. xinference/model/llm/transformers/compression.py +0 -258
  119. xinference/model/llm/transformers/yi_vl.py +0 -239
  120. xinference/thirdparty/cosyvoice/bin/export_trt.sh +0 -9
  121. xinference/web/ui/build/static/css/main.0f6523be.css.map +0 -1
  122. xinference/web/ui/build/static/js/main.4b67a723.js +0 -3
  123. xinference/web/ui/build/static/js/main.4b67a723.js.map +0 -1
  124. xinference/web/ui/node_modules/.cache/babel-loader/0f0adb2283a8f469d097a7a0ebb754624fa52414c83b83696c41f2e6a737ceda.json +0 -1
  125. xinference/web/ui/node_modules/.cache/babel-loader/51709f5d3e53bcf19e613662ef9b91fb9174942c5518987a248348dd4e1e0e02.json +0 -1
  126. xinference/web/ui/node_modules/.cache/babel-loader/8157db83995c671eb57abc316c337f867d1dc63fb83520bb4ff351fee57dcce2.json +0 -1
  127. xinference/web/ui/node_modules/.cache/babel-loader/8f9af2979e45d4648f0cfae108363e58ee421c29a9d4e7329b6f06d9adfd4133.json +0 -1
  128. xinference/web/ui/node_modules/.cache/babel-loader/9c8b1a86e7c65b2b2599a205e30920652d6c2105f926508ef5bcf29a3ef4ce76.json +0 -1
  129. xinference/web/ui/node_modules/.cache/babel-loader/b8551e9775a01b28ae674125c688febe763732ea969ae344512e64ea01bf632e.json +0 -1
  130. xinference/web/ui/node_modules/.cache/babel-loader/e4ba658c6b3b0490910acdae0c535a892257efb61539a24adf8038fc653bd22f.json +0 -1
  131. xinference/web/ui/node_modules/.cache/babel-loader/efe7cd132c27a8f9fd5352a394c491fd5fb0da0348cf9fcbd923164a32365eab.json +0 -1
  132. xinference/web/ui/node_modules/.cache/babel-loader/f04f666b77b44d7be3e16034d6b0074de2ba9c254f1fae15222b3148608fa8b3.json +0 -1
  133. xinference/web/ui/node_modules/.cache/babel-loader/f199e8173f6409a5802ed44acb95f218388131136504b2e9132129e150c92f9a.json +0 -1
  134. /xinference/web/ui/build/static/js/{main.4b67a723.js.LICENSE.txt → main.ae579a97.js.LICENSE.txt} +0 -0
  135. {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/entry_points.txt +0 -0
  136. {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/licenses/LICENSE +0 -0
  137. {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/top_level.txt +0 -0
@@ -12,6 +12,7 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
  from functools import partial
15
+ from typing import Generator
15
16
  import json
16
17
  import onnxruntime
17
18
  import torch
@@ -31,7 +32,8 @@ except ImportError:
31
32
  from tn.chinese.normalizer import Normalizer as ZhNormalizer
32
33
  from tn.english.normalizer import Normalizer as EnNormalizer
33
34
  use_ttsfrd = False
34
- from cosyvoice.utils.frontend_utils import contains_chinese, replace_blank, replace_corner_mark, remove_bracket, spell_out_number, split_paragraph
35
+ from cosyvoice.utils.file_utils import logging
36
+ from cosyvoice.utils.frontend_utils import contains_chinese, replace_blank, replace_corner_mark, remove_bracket, spell_out_number, split_paragraph, is_only_punctuation
35
37
 
36
38
 
37
39
  class CosyVoiceFrontEnd:
@@ -42,7 +44,6 @@ class CosyVoiceFrontEnd:
42
44
  campplus_model: str,
43
45
  speech_tokenizer_model: str,
44
46
  spk2info: str = '',
45
- instruct: bool = False,
46
47
  allowed_special: str = 'all'):
47
48
  self.tokenizer = get_tokenizer()
48
49
  self.feat_extractor = feat_extractor
@@ -58,9 +59,7 @@ class CosyVoiceFrontEnd:
58
59
  self.spk2info = torch.load(spk2info, map_location=self.device)
59
60
  else:
60
61
  self.spk2info = {}
61
- self.instruct = instruct
62
62
  self.allowed_special = allowed_special
63
- self.inflect_parser = inflect.engine()
64
63
  self.use_ttsfrd = use_ttsfrd
65
64
  if self.use_ttsfrd:
66
65
  self.frd = ttsfrd.TtsFrontendEngine()
@@ -69,14 +68,26 @@ class CosyVoiceFrontEnd:
69
68
  'failed to initialize ttsfrd resource'
70
69
  self.frd.set_lang_type('pinyinvg')
71
70
  else:
72
- self.zh_tn_model = ZhNormalizer(remove_erhua=False, full_to_half=False)
71
+ self.zh_tn_model = ZhNormalizer(remove_erhua=False, full_to_half=False, overwrite_cache=True)
73
72
  self.en_tn_model = EnNormalizer()
73
+ self.inflect_parser = inflect.engine()
74
74
 
75
75
  def _extract_text_token(self, text):
76
- text_token = self.tokenizer.encode(text, allowed_special=self.allowed_special)
77
- text_token = torch.tensor([text_token], dtype=torch.int32).to(self.device)
78
- text_token_len = torch.tensor([text_token.shape[1]], dtype=torch.int32).to(self.device)
79
- return text_token, text_token_len
76
+ if isinstance(text, Generator):
77
+ logging.info('get tts_text generator, will return _extract_text_token_generator!')
78
+ # NOTE add a dummy text_token_len for compatibility
79
+ return self._extract_text_token_generator(text), torch.tensor([0], dtype=torch.int32).to(self.device)
80
+ else:
81
+ text_token = self.tokenizer.encode(text, allowed_special=self.allowed_special)
82
+ text_token = torch.tensor([text_token], dtype=torch.int32).to(self.device)
83
+ text_token_len = torch.tensor([text_token.shape[1]], dtype=torch.int32).to(self.device)
84
+ return text_token, text_token_len
85
+
86
+ def _extract_text_token_generator(self, text_generator):
87
+ for text in text_generator:
88
+ text_token, _ = self._extract_text_token(text)
89
+ for i in range(text_token.shape[1]):
90
+ yield text_token[:, i: i + 1]
80
91
 
81
92
  def _extract_speech_token(self, speech):
82
93
  assert speech.shape[1] / 16000 <= 30, 'do not support extract speech token for audio longer than 30s'
@@ -108,14 +119,17 @@ class CosyVoiceFrontEnd:
108
119
  return speech_feat, speech_feat_len
109
120
 
110
121
  def text_normalize(self, text, split=True, text_frontend=True):
111
- if text_frontend is False:
122
+ if isinstance(text, Generator):
123
+ logging.info('get tts_text generator, will skip text_normalize!')
124
+ return [text]
125
+ if text_frontend is False or text == '':
112
126
  return [text] if split is True else text
113
127
  text = text.strip()
114
- if contains_chinese(text):
115
- if self.use_ttsfrd:
116
- texts = [i["text"] for i in json.loads(self.frd.do_voicegen_frd(text))["sentences"]]
117
- text = ''.join(texts)
118
- else:
128
+ if self.use_ttsfrd:
129
+ texts = [i["text"] for i in json.loads(self.frd.do_voicegen_frd(text))["sentences"]]
130
+ text = ''.join(texts)
131
+ else:
132
+ if contains_chinese(text):
119
133
  text = self.zh_tn_model.normalize(text)
120
134
  text = text.replace("\n", "")
121
135
  text = replace_blank(text)
@@ -126,18 +140,13 @@ class CosyVoiceFrontEnd:
126
140
  text = re.sub(r'[,,、]+$', '。', text)
127
141
  texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "zh", token_max_n=80,
128
142
  token_min_n=60, merge_len=20, comma_split=False))
129
- else:
130
- if self.use_ttsfrd:
131
- texts = [i["text"] for i in json.loads(self.frd.do_voicegen_frd(text))["sentences"]]
132
- text = ''.join(texts)
133
143
  else:
134
144
  text = self.en_tn_model.normalize(text)
135
145
  text = spell_out_number(text, self.inflect_parser)
136
146
  texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "en", token_max_n=80,
137
147
  token_min_n=60, merge_len=20, comma_split=False))
138
- if split is False:
139
- return text
140
- return texts
148
+ texts = [i for i in texts if not is_only_punctuation(i)]
149
+ return texts if split is True else text
141
150
 
142
151
  def frontend_sft(self, tts_text, spk_id):
143
152
  tts_text_token, tts_text_token_len = self._extract_text_token(tts_text)
@@ -145,28 +154,32 @@ class CosyVoiceFrontEnd:
145
154
  model_input = {'text': tts_text_token, 'text_len': tts_text_token_len, 'llm_embedding': embedding, 'flow_embedding': embedding}
146
155
  return model_input
147
156
 
148
- def frontend_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, resample_rate):
157
+ def frontend_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
149
158
  tts_text_token, tts_text_token_len = self._extract_text_token(tts_text)
150
- prompt_text_token, prompt_text_token_len = self._extract_text_token(prompt_text)
151
- prompt_speech_resample = torchaudio.transforms.Resample(orig_freq=16000, new_freq=resample_rate)(prompt_speech_16k)
152
- speech_feat, speech_feat_len = self._extract_speech_feat(prompt_speech_resample)
153
- speech_token, speech_token_len = self._extract_speech_token(prompt_speech_16k)
154
- if resample_rate == 24000:
155
- # cosyvoice2, force speech_feat % speech_token = 2
156
- token_len = min(int(speech_feat.shape[1] / 2), speech_token.shape[1])
157
- speech_feat, speech_feat_len[:] = speech_feat[:, :2 * token_len], 2 * token_len
158
- speech_token, speech_token_len[:] = speech_token[:, :token_len], token_len
159
- embedding = self._extract_spk_embedding(prompt_speech_16k)
160
- model_input = {'text': tts_text_token, 'text_len': tts_text_token_len,
161
- 'prompt_text': prompt_text_token, 'prompt_text_len': prompt_text_token_len,
162
- 'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len,
163
- 'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
164
- 'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
165
- 'llm_embedding': embedding, 'flow_embedding': embedding}
159
+ if zero_shot_spk_id == '':
160
+ prompt_text_token, prompt_text_token_len = self._extract_text_token(prompt_text)
161
+ prompt_speech_resample = torchaudio.transforms.Resample(orig_freq=16000, new_freq=resample_rate)(prompt_speech_16k)
162
+ speech_feat, speech_feat_len = self._extract_speech_feat(prompt_speech_resample)
163
+ speech_token, speech_token_len = self._extract_speech_token(prompt_speech_16k)
164
+ if resample_rate == 24000:
165
+ # cosyvoice2, force speech_feat % speech_token = 2
166
+ token_len = min(int(speech_feat.shape[1] / 2), speech_token.shape[1])
167
+ speech_feat, speech_feat_len[:] = speech_feat[:, :2 * token_len], 2 * token_len
168
+ speech_token, speech_token_len[:] = speech_token[:, :token_len], token_len
169
+ embedding = self._extract_spk_embedding(prompt_speech_16k)
170
+ model_input = {'prompt_text': prompt_text_token, 'prompt_text_len': prompt_text_token_len,
171
+ 'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len,
172
+ 'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
173
+ 'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
174
+ 'llm_embedding': embedding, 'flow_embedding': embedding}
175
+ else:
176
+ model_input = self.spk2info[zero_shot_spk_id]
177
+ model_input['text'] = tts_text_token
178
+ model_input['text_len'] = tts_text_token_len
166
179
  return model_input
167
180
 
168
- def frontend_cross_lingual(self, tts_text, prompt_speech_16k, resample_rate):
169
- model_input = self.frontend_zero_shot(tts_text, '', prompt_speech_16k, resample_rate)
181
+ def frontend_cross_lingual(self, tts_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
182
+ model_input = self.frontend_zero_shot(tts_text, '', prompt_speech_16k, resample_rate, zero_shot_spk_id)
170
183
  # in cross lingual mode, we remove prompt in llm
171
184
  del model_input['prompt_text']
172
185
  del model_input['prompt_text_len']
@@ -183,23 +196,10 @@ class CosyVoiceFrontEnd:
183
196
  model_input['prompt_text_len'] = instruct_text_token_len
184
197
  return model_input
185
198
 
186
- def frontend_instruct2(self, tts_text, instruct_text, prompt_speech_16k, resample_rate):
187
- tts_text_token, tts_text_token_len = self._extract_text_token(tts_text)
188
- prompt_text_token, prompt_text_token_len = self._extract_text_token(instruct_text + '<|endofprompt|>')
189
- prompt_speech_resample = torchaudio.transforms.Resample(orig_freq=16000, new_freq=resample_rate)(prompt_speech_16k)
190
- speech_feat, speech_feat_len = self._extract_speech_feat(prompt_speech_resample)
191
- speech_token, speech_token_len = self._extract_speech_token(prompt_speech_16k)
192
- if resample_rate == 24000:
193
- # cosyvoice2, force speech_feat % speech_token = 2
194
- token_len = min(int(speech_feat.shape[1] / 2), speech_token.shape[1])
195
- speech_feat, speech_feat_len[:] = speech_feat[:, :2 * token_len], 2 * token_len
196
- speech_token, speech_token_len[:] = speech_token[:, :token_len], token_len
197
- embedding = self._extract_spk_embedding(prompt_speech_16k)
198
- model_input = {'text': tts_text_token, 'text_len': tts_text_token_len,
199
- 'prompt_text': prompt_text_token, 'prompt_text_len': prompt_text_token_len,
200
- 'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
201
- 'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
202
- 'llm_embedding': embedding, 'flow_embedding': embedding}
199
+ def frontend_instruct2(self, tts_text, instruct_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
200
+ model_input = self.frontend_zero_shot(tts_text, instruct_text + '<|endofprompt|>', prompt_speech_16k, resample_rate, zero_shot_spk_id)
201
+ del model_input['llm_prompt_speech_token']
202
+ del model_input['llm_prompt_speech_token_len']
203
203
  return model_input
204
204
 
205
205
  def frontend_vc(self, source_speech_16k, prompt_speech_16k, resample_rate):