xinference 1.6.0.post1__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +79 -2
- xinference/client/restful/restful_client.py +65 -3
- xinference/conftest.py +0 -7
- xinference/core/media_interface.py +132 -8
- xinference/core/model.py +44 -6
- xinference/core/scheduler.py +1 -10
- xinference/core/supervisor.py +8 -17
- xinference/core/worker.py +5 -27
- xinference/deploy/cmdline.py +6 -2
- xinference/model/audio/chattts.py +24 -39
- xinference/model/audio/cosyvoice.py +18 -30
- xinference/model/audio/funasr.py +42 -0
- xinference/model/audio/model_spec.json +71 -1
- xinference/model/audio/model_spec_modelscope.json +76 -2
- xinference/model/audio/utils.py +75 -0
- xinference/model/core.py +1 -0
- xinference/model/embedding/__init__.py +74 -18
- xinference/model/embedding/core.py +98 -589
- xinference/model/embedding/embed_family.py +133 -0
- xinference/{thirdparty/omnilmm/train → model/embedding/flag}/__init__.py +1 -1
- xinference/model/embedding/flag/core.py +282 -0
- xinference/model/embedding/model_spec.json +24 -0
- xinference/model/embedding/model_spec_modelscope.json +24 -0
- xinference/model/embedding/sentence_transformers/__init__.py +13 -0
- xinference/model/embedding/sentence_transformers/core.py +399 -0
- xinference/model/embedding/vllm/core.py +95 -0
- xinference/model/image/model_spec.json +30 -3
- xinference/model/image/model_spec_modelscope.json +41 -2
- xinference/model/image/stable_diffusion/core.py +144 -53
- xinference/model/llm/__init__.py +6 -54
- xinference/model/llm/core.py +19 -5
- xinference/model/llm/llama_cpp/core.py +59 -3
- xinference/model/llm/llama_cpp/memory.py +457 -0
- xinference/model/llm/llm_family.json +247 -402
- xinference/model/llm/llm_family.py +88 -16
- xinference/model/llm/llm_family_modelscope.json +260 -421
- xinference/model/llm/llm_family_openmind_hub.json +0 -34
- xinference/model/llm/sglang/core.py +8 -0
- xinference/model/llm/transformers/__init__.py +27 -6
- xinference/model/llm/transformers/chatglm.py +4 -2
- xinference/model/llm/transformers/core.py +49 -28
- xinference/model/llm/transformers/deepseek_v2.py +6 -49
- xinference/model/llm/transformers/gemma3.py +119 -164
- xinference/model/llm/transformers/multimodal/__init__.py +13 -0
- xinference/model/llm/transformers/{cogagent.py → multimodal/cogagent.py} +58 -95
- xinference/model/llm/transformers/multimodal/core.py +205 -0
- xinference/model/llm/transformers/{deepseek_vl2.py → multimodal/deepseek_vl2.py} +59 -120
- xinference/model/llm/transformers/multimodal/gemma3.py +117 -0
- xinference/model/llm/transformers/{glm4v.py → multimodal/glm4v.py} +57 -93
- xinference/model/llm/transformers/multimodal/intern_vl.py +412 -0
- xinference/model/llm/transformers/{minicpmv26.py → multimodal/minicpmv26.py} +55 -102
- xinference/model/llm/transformers/{ovis2.py → multimodal/ovis2.py} +114 -175
- xinference/model/llm/transformers/{qwen-omni.py → multimodal/qwen-omni.py} +82 -167
- xinference/model/llm/transformers/multimodal/qwen2_audio.py +131 -0
- xinference/model/llm/transformers/{qwen2_vl.py → multimodal/qwen2_vl.py} +224 -256
- xinference/model/llm/transformers/opt.py +4 -2
- xinference/model/llm/transformers/utils.py +6 -37
- xinference/model/llm/utils.py +11 -0
- xinference/model/llm/vllm/core.py +7 -0
- xinference/model/rerank/core.py +91 -3
- xinference/model/rerank/model_spec.json +24 -0
- xinference/model/rerank/model_spec_modelscope.json +24 -0
- xinference/model/rerank/utils.py +20 -2
- xinference/model/utils.py +38 -1
- xinference/model/video/diffusers.py +65 -3
- xinference/model/video/model_spec.json +31 -4
- xinference/model/video/model_spec_modelscope.json +32 -4
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/main.013f296b.css +2 -0
- xinference/web/ui/build/static/css/main.013f296b.css.map +1 -0
- xinference/web/ui/build/static/js/main.8a9e3ba0.js +3 -0
- xinference/web/ui/build/static/js/main.8a9e3ba0.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/34cfbfb7836e136ba3261cfd411cc554bf99ba24b35dcceebeaa4f008cb3c9dc.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/55b9fb40b57fa926e8f05f31c2f96467e76e5ad62f033dca97c03f9e8c4eb4fe.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/567e49df411efb24425d289bb484758cb57067ca54f8b5c67fe4505f698deb96.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/6595880facebca7ceace6f17cf21c3a5a9219a2f52fb0ba9f3cf1131eddbcf6b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/aa998bc2d9c11853add6b8a2e08f50327f56d8824ccaaec92d6dde1b305f0d85.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/c748246b1d7bcebc16153be69f37e955bb2145526c47dd425aeeff70d3004dbc.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e31234e95d60a5a7883fbcd70de2475dc1c88c90705df1a530abb68f86f80a51.json +1 -0
- xinference/web/ui/src/locales/en.json +21 -8
- xinference/web/ui/src/locales/ja.json +224 -0
- xinference/web/ui/src/locales/ko.json +224 -0
- xinference/web/ui/src/locales/zh.json +21 -8
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/METADATA +14 -11
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/RECORD +93 -100
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/WHEEL +1 -1
- xinference/model/llm/transformers/cogvlm2.py +0 -442
- xinference/model/llm/transformers/cogvlm2_video.py +0 -333
- xinference/model/llm/transformers/deepseek_vl.py +0 -280
- xinference/model/llm/transformers/glm_edge_v.py +0 -213
- xinference/model/llm/transformers/intern_vl.py +0 -526
- xinference/model/llm/transformers/internlm2.py +0 -94
- xinference/model/llm/transformers/minicpmv25.py +0 -193
- xinference/model/llm/transformers/omnilmm.py +0 -132
- xinference/model/llm/transformers/qwen2_audio.py +0 -179
- xinference/model/llm/transformers/qwen_vl.py +0 -360
- xinference/thirdparty/omnilmm/LICENSE +0 -201
- xinference/thirdparty/omnilmm/chat.py +0 -218
- xinference/thirdparty/omnilmm/constants.py +0 -4
- xinference/thirdparty/omnilmm/conversation.py +0 -332
- xinference/thirdparty/omnilmm/model/__init__.py +0 -1
- xinference/thirdparty/omnilmm/model/omnilmm.py +0 -595
- xinference/thirdparty/omnilmm/model/resampler.py +0 -166
- xinference/thirdparty/omnilmm/model/utils.py +0 -578
- xinference/thirdparty/omnilmm/train/train_utils.py +0 -150
- xinference/thirdparty/omnilmm/utils.py +0 -134
- xinference/web/ui/build/static/css/main.337afe76.css +0 -2
- xinference/web/ui/build/static/css/main.337afe76.css.map +0 -1
- xinference/web/ui/build/static/js/main.ae579a97.js +0 -3
- xinference/web/ui/build/static/js/main.ae579a97.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/12e02ee790dbf57ead09a241a93bb5f893393aa36628ca741d44390e836a103f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5c08e2cd07809ed3e41486b16652253404cbb63a3ff8d0366ee50f57e2413cea.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +0 -1
- /xinference/{thirdparty/omnilmm → model/embedding/vllm}/__init__.py +0 -0
- /xinference/web/ui/build/static/js/{main.ae579a97.js.LICENSE.txt → main.8a9e3ba0.js.LICENSE.txt} +0 -0
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/top_level.txt +0 -0
xinference/model/audio/chattts.py
CHANGED

@@ -71,9 +71,10 @@ class ChatTTSModel:
         import ChatTTS
         import numpy as np
         import torch
-        import torchaudio
         import xxhash

+        from .utils import audio_stream_generator, audio_to_bytes
+
         rnd_spk_emb = None

         if len(voice) > 400:
@@ -105,44 +106,28 @@ class ChatTTSModel:
         )

         assert self._model is not None
+
+        output = self._model.infer(
+            [input], params_infer_code=params_infer_code, stream=stream
+        )
         if stream:
-            iter = self._model.infer(
-                [input], params_infer_code=params_infer_code, stream=True
-            )

-            def _generator():
-
-
-
-
-
-
-
-
-
-
-
-
-                        out.seek(last_pos)
-                        encoded_bytes = out.read()
-                        yield encoded_bytes
-                        last_pos = new_last_pos
-
-            return _generator()
+            def _gen_chunk():
+                for it in output:
+                    for chunk in it:
+                        yield chunk
+
+            return audio_stream_generator(
+                response_format=response_format,
+                sample_rate=24000,
+                output_generator=_gen_chunk(),
+                output_chunk_transformer=lambda c: torch.from_numpy(
+                    np.array([c]).transpose()
+                ),
+            )
         else:
-
-
-
-
-
-                torchaudio.save(
-                    out,
-                    torch.from_numpy(wavs[0]).unsqueeze(0),
-                    24000,
-                    format=response_format,
-                )
-            except:
-                torchaudio.save(
-                    out, torch.from_numpy(wavs[0]), 24000, format=response_format
-                )
-            return out.getvalue()
+            return audio_to_bytes(
+                response_format=response_format,
+                sample_rate=24000,
+                tensor=torch.from_numpy(output[0]).unsqueeze(0),
+            )
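Taken together, the two chattts.py hunks replace the hand-rolled torchaudio StreamWriter loop with a single self._model.infer(..., stream=stream) call whose result is handed to the shared helpers introduced in xinference/model/audio/utils.py later in this diff. The only non-obvious piece is _gen_chunk(): a streaming ChatTTS infer() yields batches that are themselves iterables of waveform chunks, and the helper flattens them. A minimal illustrative sketch (the function name below is a hypothetical stand-in, not part of the diff):

# Illustrative only: flatten the two-level structure produced by a streaming
# ChatTTS infer() call into one flat stream of waveform chunks, which is the
# shape audio_stream_generator() expects to consume.
def flatten_chunks(batches):
    for batch in batches:      # one item per streaming step
        for chunk in batch:    # one waveform chunk per prompt in the batch
            yield chunk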
xinference/model/audio/cosyvoice.py
CHANGED

@@ -13,7 +13,6 @@
 # limitations under the License.
 import io
 import logging
-from io import BytesIO
 from typing import TYPE_CHECKING, Optional

 from ..utils import set_all_random_seed
@@ -132,36 +131,25 @@ class CosyVoiceModel:
         output = self._model.inference_sft(input, voice, stream=stream)

         import torch
-        import torchaudio

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                        last_pos = new_last_pos
-
-        def _generator_block():
-            chunks = [o["tts_speech"] for o in output]
-            t = torch.cat(chunks, dim=1)
-            with BytesIO() as out:
-                torchaudio.save(out, t, self._model.sample_rate, format=response_format)
-                return out.getvalue()
-
-        return _generator_stream() if stream else _generator_block()
+        from .utils import audio_stream_generator, audio_to_bytes
+
+        return (
+            audio_stream_generator(
+                response_format=response_format,
+                sample_rate=self._model.sample_rate,
+                output_generator=output,
+                output_chunk_transformer=lambda c: torch.transpose(
+                    c["tts_speech"], 0, 1
+                ),
+            )
+            if stream
+            else audio_to_bytes(
+                response_format=response_format,
+                sample_rate=self._model.sample_rate,
+                tensor=torch.cat([o["tts_speech"] for o in output], dim=1),
+            )
+        )

     def speech(
         self,
xinference/model/audio/funasr.py
CHANGED
@@ -44,6 +44,44 @@ class FunASRModel:
     def model_ability(self):
         return self._model_spec.model_ability

+    def convert_to_openai_format(self, input_data):
+        if "timestamp" not in input_data:
+            return {"task": "transcribe", "text": input_data["text"]}
+        start_time = input_data["timestamp"][0][0] / 1000
+        end_time = input_data["timestamp"][-1][1] / 1000
+        duration = end_time - start_time
+        word_timestamps = []
+        for ts in input_data["timestamp"]:
+            word_timestamps.append({"start": ts[0] / 1000, "end": ts[1] / 1000})
+        if "sentence_info" not in input_data:
+            return {
+                "task": "transcribe",
+                "text": input_data["text"],
+                "words": word_timestamps,
+                "duration": duration,
+            }
+        output = {
+            "task": "transcribe",
+            "duration": duration,
+            "text": input_data["text"],
+            "words": word_timestamps,
+            "segments": [],
+        }
+        for sentence in input_data["sentence_info"]:
+            seg_start = sentence["start"] / 1000
+            seg_end = sentence["end"] / 1000
+            output["segments"].append(
+                {
+                    "id": len(output["segments"]),
+                    "start": seg_start,
+                    "end": seg_end,
+                    "text": sentence["text"],
+                    "speaker": sentence["spk"],
+                }
+            )
+
+        return output
+
     def load(self):
         try:
             from funasr import AutoModel

@@ -103,6 +141,10 @@ class FunASRModel:

         if response_format == "json":
             return {"text": text}
+        elif response_format == "verbose_json":
+            verbose = result[0]
+            verbose["text"] = text
+            return self.convert_to_openai_format(verbose)
         else:
             raise ValueError(f"Unsupported response format: {response_format}")

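The new verbose_json path follows OpenAI's verbose transcription schema: FunASR timestamps arrive in milliseconds and convert_to_openai_format rescales them to seconds, emitting word-level timings plus per-sentence segments (with a speaker id when a spk model is configured). A worked example with made-up values:

# Hypothetical FunASR result (times in milliseconds) passed to
# convert_to_openai_format() as `input_data`:
input_data = {
    "text": "你好 世界",
    "timestamp": [[100, 400], [450, 900]],
    "sentence_info": [{"start": 100, "end": 900, "text": "你好 世界", "spk": 0}],
}

# The method then returns (times in seconds):
expected = {
    "task": "transcribe",
    "duration": 0.8,  # last end (0.9) minus first start (0.1)
    "text": "你好 世界",
    "words": [{"start": 0.1, "end": 0.4}, {"start": 0.45, "end": 0.9}],
    "segments": [
        {"id": 0, "start": 0.1, "end": 0.9, "text": "你好 世界", "speaker": 0}
    ],
}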
xinference/model/audio/model_spec.json
CHANGED

@@ -218,13 +218,83 @@
       "batch_size_s": 300
     }
   },
+  {
+    "model_name": "paraformer-zh-hotword",
+    "model_family": "funasr",
+    "model_id": "JunHowie/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404",
+    "model_revision": "26d622993683d7b0c517ee5ec9c1c8bdde76e324",
+    "model_ability": ["audio2text"],
+    "multilingual": false,
+    "default_model_config": {
+      "vad_model": "fsmn-vad",
+      "punc_model": "ct-punc"
+    },
+    "default_transcription_config": {
+      "hotword": "",
+      "batch_size_s": 300
+    }
+  },
+  {
+    "model_name": "paraformer-zh-long",
+    "model_family": "funasr",
+    "model_id": "JunHowie/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
+    "model_revision": "b6d8cb81645e34056cd3dda41e5624a740587de3",
+    "model_ability": ["audio2text"],
+    "multilingual": false,
+    "default_model_config": {
+      "vad_model": "fsmn-vad",
+      "punc_model": "ct-punc"
+    },
+    "default_transcription_config": {
+      "batch_size_s": 300
+    }
+  },
+  {
+    "model_name": "paraformer-zh-spk",
+    "model_family": "funasr",
+    "model_id": "JunHowie/speech_paraformer-large-vad-punc-spk_asr_nat-zh-cn",
+    "model_revision": "36abd64af4392fe02bf76453bc86c081cf1ca6da",
+    "model_ability": ["audio2text"],
+    "multilingual": false,
+    "default_model_config": {
+      "vad_model": "fsmn-vad",
+      "punc_model": "ct-punc",
+      "spk_model": "cam++"
+    },
+    "default_transcription_config": {
+      "batch_size_s": 300
+    }
+  },
+  {
+    "model_name": "seaco-paraformer-zh",
+    "model_family": "funasr",
+    "model_id": "JunHowie/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
+    "model_revision": "42e6be00854cf8de0f40002794f99df2a444fa97",
+    "model_ability": ["audio2text"],
+    "multilingual": false,
+    "default_model_config": {
+      "vad_model": "fsmn-vad",
+      "punc_model": "ct-punc"
+    },
+    "default_transcription_config": {
+      "hotword": "",
+      "batch_size_s": 300
+    }
+  },
   {
     "model_name": "ChatTTS",
     "model_family": "ChatTTS",
     "model_id": "2Noise/ChatTTS",
     "model_revision": "1a3c04a8b0651689bd9242fbb55b1f4b5a9aef84",
     "model_ability": ["text2audio"],
-    "multilingual": true
+    "multilingual": true,
+    "virtualenv": {
+      "packages": [
+        "ChatTTS>=0.2.1",
+        "#system_torch#",
+        "#system_numpy#"
+      ]
+    }
   },
   {
     "model_name": "CosyVoice-300M",
xinference/model/audio/model_spec_modelscope.json
CHANGED

@@ -51,7 +51,7 @@
     "model_name": "paraformer-zh",
     "model_family": "funasr",
     "model_hub": "modelscope",
-    "model_id": "iic/speech_paraformer-
+    "model_id": "iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
     "model_revision": "master",
     "model_ability": ["audio2text"],
     "multilingual": false,

@@ -63,6 +63,73 @@
       "batch_size_s": 300
     }
   },
+  {
+    "model_name": "paraformer-zh-hotword",
+    "model_family": "funasr",
+    "model_hub": "modelscope",
+    "model_id": "iic/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404",
+    "model_revision": "master",
+    "model_ability": ["audio2text"],
+    "multilingual": false,
+    "default_model_config": {
+      "vad_model": "fsmn-vad",
+      "punc_model": "ct-punc"
+    },
+    "default_transcription_config": {
+      "hotword": "",
+      "batch_size_s": 300
+    }
+  },
+  {
+    "model_name": "paraformer-zh-long",
+    "model_family": "funasr",
+    "model_hub": "modelscope",
+    "model_id": "iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
+    "model_revision": "master",
+    "model_ability": ["audio2text"],
+    "multilingual": false,
+    "default_model_config": {
+      "vad_model": "fsmn-vad",
+      "punc_model": "ct-punc"
+    },
+    "default_transcription_config": {
+      "batch_size_s": 300
+    }
+  },
+  {
+    "model_name": "paraformer-zh-spk",
+    "model_family": "funasr",
+    "model_hub": "modelscope",
+    "model_id": "iic/speech_paraformer-large-vad-punc-spk_asr_nat-zh-cn",
+    "model_revision": "master",
+    "model_ability": ["audio2text"],
+    "multilingual": false,
+    "default_model_config": {
+      "vad_model": "fsmn-vad",
+      "punc_model": "ct-punc",
+      "spk_model": "cam++"
+    },
+    "default_transcription_config": {
+      "batch_size_s": 300
+    }
+  },
+  {
+    "model_name": "seaco-paraformer-zh",
+    "model_family": "funasr",
+    "model_hub": "modelscope",
+    "model_id": "iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
+    "model_revision": "master",
+    "model_ability": ["audio2text"],
+    "multilingual": false,
+    "default_model_config": {
+      "vad_model": "fsmn-vad",
+      "punc_model": "ct-punc"
+    },
+    "default_transcription_config": {
+      "hotword": "",
+      "batch_size_s": 300
+    }
+  },
   {
     "model_name": "ChatTTS",
     "model_family": "ChatTTS",

@@ -70,7 +137,14 @@
     "model_id": "AI-ModelScope/ChatTTS",
     "model_revision": "master",
     "model_ability": ["text2audio"],
-    "multilingual": true
+    "multilingual": true,
+    "virtualenv": {
+      "packages": [
+        "ChatTTS>=0.2.1",
+        "#system_torch#",
+        "#system_numpy#"
+      ]
+    }
   },
   {
     "model_name": "CosyVoice-300M",
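With these spec entries registered on both hubs, the new FunASR variants can be launched like any other built-in audio model, and the verbose_json support added in funasr.py above becomes reachable from the client. A hedged sketch of that flow; the endpoint, file name, and exact transcriptions() signature follow the existing RESTful client documentation and were not re-verified against this release:

from xinference.client import Client

client = Client("http://127.0.0.1:9997")
# paraformer-zh-spk configures "spk_model": "cam++", so verbose_json segments
# carry a per-sentence "speaker" field.
uid = client.launch_model(model_name="paraformer-zh-spk", model_type="audio")
model = client.get_model(uid)

with open("meeting.wav", "rb") as f:
    result = model.transcriptions(f.read(), response_format="verbose_json")
print(result["segments"][0])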
xinference/model/audio/utils.py
CHANGED
@@ -13,16 +13,30 @@
 # limitations under the License.

 import io
+import logging
+import types
+import wave
+from collections.abc import Callable

 import numpy as np
+import torch

 from .core import AudioModelFamilyV1

+logger = logging.getLogger(__name__)
+

 def get_model_version(audio_model: AudioModelFamilyV1) -> str:
     return audio_model.model_name


+def _extract_pcm_from_wav_bytes(wav_bytes):
+    with io.BytesIO(wav_bytes) as wav_io:
+        with wave.open(wav_io, "rb") as wav_file:
+            num_frames = wav_file.getnframes()
+            return wav_file.readframes(num_frames)
+
+
 def ensure_sample_rate(
     audio: np.ndarray, old_sample_rate: int, sample_rate: int
 ) -> np.ndarray:

@@ -48,3 +62,64 @@ def ensure_sample_rate(
     audio, sr = sf.read(buffer, dtype="float32")

     return audio
+
+
+def audio_stream_generator(
+    response_format: str,
+    sample_rate: int,
+    output_generator: types.GeneratorType,
+    output_chunk_transformer: Callable,
+):
+    import torch
+    import torchaudio
+
+    response_pcm = response_format.lower() == "pcm"
+    with io.BytesIO() as out:
+        if response_pcm:
+            logger.info(
+                f"PCM stream output, num_channels: 1, sample_rate: {sample_rate}"
+            )
+            writer = torchaudio.io.StreamWriter(out, format="wav")
+            writer.add_audio_stream(
+                sample_rate=sample_rate, num_channels=1, format="s16"
+            )
+        else:
+            writer = torchaudio.io.StreamWriter(out, format=response_format)
+            writer.add_audio_stream(sample_rate=sample_rate, num_channels=1)
+        strip_header = True
+        last_pos = 0
+        with writer.open():
+            for chunk in output_generator:
+                trans_chunk = output_chunk_transformer(chunk)
+                if response_pcm:
+                    trans_chunk = trans_chunk.to(torch.float32)
+                    trans_chunk = (
+                        (trans_chunk * 32767).clamp(-32768, 32767).to(torch.int16)
+                    )
+                writer.write_audio_chunk(0, trans_chunk)
+                new_last_pos = out.tell()
+                if new_last_pos != last_pos:
+                    out.seek(last_pos)
+                    encoded_bytes = out.read()
+                    if response_pcm and strip_header:
+                        # http://soundfile.sapp.org/doc/WaveFormat
+                        yield _extract_pcm_from_wav_bytes(encoded_bytes)
+                        strip_header = False
+                    else:
+                        yield encoded_bytes
+                    last_pos = new_last_pos
+
+
+def audio_to_bytes(response_format: str, sample_rate: int, tensor: "torch.Tensor"):
+    import torchaudio
+
+    response_pcm = response_format.lower() == "pcm"
+    with io.BytesIO() as out:
+        if response_pcm:
+            logger.info(f"PCM output, num_channels: 1, sample_rate: {sample_rate}")
+            torchaudio.save(out, tensor, sample_rate, format="wav", encoding="PCM_S")
+            # http://soundfile.sapp.org/doc/WaveFormat
+            return _extract_pcm_from_wav_bytes(out.getvalue())
+        else:
+            torchaudio.save(out, tensor, sample_rate, format=response_format)
+            return out.getvalue()
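These two helpers are what the ChatTTS and CosyVoice hunks above now delegate to. A minimal usage sketch, assuming a mono waveform and the (frames, channels) chunk layout implied by the transformers used in those call sites:

import torch

from xinference.model.audio.utils import audio_stream_generator, audio_to_bytes

# Blocking path: tensor is (channels, samples), as torchaudio.save expects.
wav = torch.zeros(1, 24000)  # one second of silence at 24 kHz, placeholder data
wav_bytes = audio_to_bytes(response_format="wav", sample_rate=24000, tensor=wav)

# Streaming path: each transformed chunk must be (frames, channels) for
# StreamWriter.write_audio_chunk; encoded bytes are yielded incrementally.
def chunks():
    for _ in range(10):
        yield torch.zeros(2400, 1)

for encoded_piece in audio_stream_generator(
    response_format="wav",
    sample_rate=24000,
    output_generator=chunks(),
    output_chunk_transformer=lambda c: c,  # chunks are already (frames, channels)
):
    pass  # forward each already-encoded byte chunk to the HTTP response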
xinference/model/embedding/__init__.py
CHANGED
@@ -16,7 +16,7 @@ import codecs
 import json
 import os
 import warnings
-from typing import Any, Dict
+from typing import Any, Dict, List

 from .core import (
     EMBEDDING_MODEL_DESCRIPTIONS,

@@ -32,9 +32,15 @@ from .custom import (
     register_embedding,
     unregister_embedding,
 )
-
-BUILTIN_EMBEDDING_MODELS
-
+from .embed_family import (
+    BUILTIN_EMBEDDING_MODELS,
+    EMBEDDING_ENGINES,
+    FLAG_EMBEDDER_CLASSES,
+    MODELSCOPE_EMBEDDING_MODELS,
+    SENTENCE_TRANSFORMER_CLASSES,
+    SUPPORTED_ENGINES,
+    VLLM_CLASSES,
+)


 def register_custom_model():

@@ -55,12 +61,56 @@ def register_custom_model():
             warnings.warn(f"{user_defined_embedding_dir}/{f} has error, {e}")


+def generate_engine_config_by_model_name(model_spec: "EmbeddingModelSpec"):
+    model_name = model_spec.model_name
+    engines: Dict[str, List[Dict[str, Any]]] = EMBEDDING_ENGINES.get(
+        model_name, {}
+    )  # structure for engine query
+    for engine in SUPPORTED_ENGINES:
+        CLASSES = SUPPORTED_ENGINES[engine]
+        for cls in CLASSES:
+            # Every engine needs to implement match method
+            if cls.match(model_spec):
+                # we only match the first class for an engine
+                engines[engine] = [
+                    {
+                        "model_name": model_name,
+                        "embedding_class": cls,
+                    }
+                ]
+                break
+    EMBEDDING_ENGINES[model_name] = engines
+
+
+# will be called in xinference/model/__init__.py
 def _install():
-
-
-        "model_spec_modelscope.json"
+    _model_spec_json = os.path.join(os.path.dirname(__file__), "model_spec.json")
+    _model_spec_modelscope_json = os.path.join(
+        os.path.dirname(__file__), "model_spec_modelscope.json"
+    )
+    ################### HuggingFace Model List Info Init ###################
+    BUILTIN_EMBEDDING_MODELS.update(
+        dict(
+            (spec["model_name"], EmbeddingModelSpec(**spec))
+            for spec in json.load(codecs.open(_model_spec_json, "r", encoding="utf-8"))
+        )
+    )
+    for model_name, model_spec in BUILTIN_EMBEDDING_MODELS.items():
+        MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
+
+    ################### ModelScope Model List Info Init ###################
+    MODELSCOPE_EMBEDDING_MODELS.update(
+        dict(
+            (spec["model_name"], EmbeddingModelSpec(**spec))
+            for spec in json.load(
+                codecs.open(_model_spec_modelscope_json, "r", encoding="utf-8")
+            )
+        )
     )
+    for model_name, model_spec in MODELSCOPE_EMBEDDING_MODELS.items():
+        MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)

+    # TODO: consider support more download hub in future...
     # register model description after recording model revision
     for model_spec_info in [BUILTIN_EMBEDDING_MODELS, MODELSCOPE_EMBEDDING_MODELS]:
         for model_name, model_spec in model_spec_info.items():

@@ -77,16 +127,22 @@ def _install():
                 generate_embedding_description(ud_embedding)
             )

+    from .flag.core import FlagEmbeddingModel
+    from .sentence_transformers.core import SentenceTransformerEmbeddingModel
+    from .vllm.core import VLLMEmbeddingModel

-
-
-
-
-
-
-
-
-
-
+    SENTENCE_TRANSFORMER_CLASSES.extend([SentenceTransformerEmbeddingModel])
+    FLAG_EMBEDDER_CLASSES.extend([FlagEmbeddingModel])
+    VLLM_CLASSES.extend([VLLMEmbeddingModel])
+
+    SUPPORTED_ENGINES["sentence_transformers"] = SENTENCE_TRANSFORMER_CLASSES
+    SUPPORTED_ENGINES["flag"] = FLAG_EMBEDDER_CLASSES
+    SUPPORTED_ENGINES["vllm"] = VLLM_CLASSES
+
+    # Init embedding engine
+    for model_infos in [BUILTIN_EMBEDDING_MODELS, MODELSCOPE_EMBEDDING_MODELS]:
+        for model_spec in model_infos.values():
+            generate_engine_config_by_model_name(model_spec)

-    del
+    del _model_spec_json
+    del _model_spec_modelscope_json
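For orientation, after _install() has run, EMBEDDING_ENGINES maps every built-in (and ModelScope) embedding model name to the engines whose match() accepted it, each entry holding the class to instantiate. A rough sketch of the resulting shape and a query helper; the model name below is a placeholder, and which engines actually match depends on each class's match() implementation:

from xinference.model.embedding.flag.core import FlagEmbeddingModel
from xinference.model.embedding.sentence_transformers.core import (
    SentenceTransformerEmbeddingModel,
)

# Illustrative shape only; real entries are produced by
# generate_engine_config_by_model_name() from the bundled spec JSON files.
example_engines = {
    "my-embedding-model": {
        "sentence_transformers": [
            {
                "model_name": "my-embedding-model",
                "embedding_class": SentenceTransformerEmbeddingModel,
            }
        ],
        "flag": [
            {"model_name": "my-embedding-model", "embedding_class": FlagEmbeddingModel}
        ],
    }
}


def pick_embedding_class(engines, model_name, engine):
    """Return the first embedding class registered for (model_name, engine)."""
    candidates = engines.get(model_name, {}).get(engine, [])
    return candidates[0]["embedding_class"] if candidates else None


cls = pick_embedding_class(example_engines, "my-embedding-model", "flag")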