xinference 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xinference might be problematic. Click here for more details.
- xinference/_compat.py +1 -0
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +54 -1
- xinference/client/restful/restful_client.py +82 -2
- xinference/constants.py +3 -0
- xinference/core/chat_interface.py +297 -83
- xinference/core/model.py +24 -3
- xinference/core/progress_tracker.py +16 -8
- xinference/core/supervisor.py +51 -1
- xinference/core/worker.py +315 -47
- xinference/deploy/cmdline.py +33 -1
- xinference/model/audio/core.py +11 -1
- xinference/model/audio/megatts.py +105 -0
- xinference/model/audio/model_spec.json +24 -1
- xinference/model/audio/model_spec_modelscope.json +26 -1
- xinference/model/core.py +14 -0
- xinference/model/embedding/core.py +6 -1
- xinference/model/flexible/core.py +6 -1
- xinference/model/image/core.py +6 -1
- xinference/model/image/model_spec.json +17 -1
- xinference/model/image/model_spec_modelscope.json +17 -1
- xinference/model/llm/__init__.py +4 -6
- xinference/model/llm/core.py +5 -0
- xinference/model/llm/llama_cpp/core.py +46 -17
- xinference/model/llm/llm_family.json +530 -85
- xinference/model/llm/llm_family.py +24 -1
- xinference/model/llm/llm_family_modelscope.json +572 -1
- xinference/model/llm/mlx/core.py +16 -2
- xinference/model/llm/reasoning_parser.py +3 -3
- xinference/model/llm/sglang/core.py +111 -13
- xinference/model/llm/transformers/__init__.py +14 -0
- xinference/model/llm/transformers/core.py +31 -6
- xinference/model/llm/transformers/deepseek_vl.py +1 -1
- xinference/model/llm/transformers/deepseek_vl2.py +287 -0
- xinference/model/llm/transformers/gemma3.py +17 -2
- xinference/model/llm/transformers/intern_vl.py +28 -18
- xinference/model/llm/transformers/minicpmv26.py +21 -2
- xinference/model/llm/transformers/qwen-omni.py +308 -0
- xinference/model/llm/transformers/qwen2_audio.py +1 -1
- xinference/model/llm/transformers/qwen2_vl.py +20 -4
- xinference/model/llm/utils.py +37 -15
- xinference/model/llm/vllm/core.py +184 -8
- xinference/model/llm/vllm/distributed_executor.py +320 -0
- xinference/model/rerank/core.py +22 -12
- xinference/model/utils.py +118 -1
- xinference/model/video/core.py +6 -1
- xinference/thirdparty/deepseek_vl2/__init__.py +31 -0
- xinference/thirdparty/deepseek_vl2/models/__init__.py +26 -0
- xinference/thirdparty/deepseek_vl2/models/configuration_deepseek.py +210 -0
- xinference/thirdparty/deepseek_vl2/models/conversation.py +310 -0
- xinference/thirdparty/deepseek_vl2/models/modeling_deepseek.py +1975 -0
- xinference/thirdparty/deepseek_vl2/models/modeling_deepseek_vl_v2.py +697 -0
- xinference/thirdparty/deepseek_vl2/models/processing_deepseek_vl_v2.py +675 -0
- xinference/thirdparty/deepseek_vl2/models/siglip_vit.py +661 -0
- xinference/thirdparty/deepseek_vl2/serve/__init__.py +0 -0
- xinference/thirdparty/deepseek_vl2/serve/app_modules/__init__.py +0 -0
- xinference/thirdparty/deepseek_vl2/serve/app_modules/gradio_utils.py +83 -0
- xinference/thirdparty/deepseek_vl2/serve/app_modules/overwrites.py +81 -0
- xinference/thirdparty/deepseek_vl2/serve/app_modules/presets.py +115 -0
- xinference/thirdparty/deepseek_vl2/serve/app_modules/utils.py +333 -0
- xinference/thirdparty/deepseek_vl2/serve/assets/Kelpy-Codos.js +100 -0
- xinference/thirdparty/deepseek_vl2/serve/assets/avatar.png +0 -0
- xinference/thirdparty/deepseek_vl2/serve/assets/custom.css +355 -0
- xinference/thirdparty/deepseek_vl2/serve/assets/custom.js +22 -0
- xinference/thirdparty/deepseek_vl2/serve/assets/favicon.ico +0 -0
- xinference/thirdparty/deepseek_vl2/serve/assets/simsun.ttc +0 -0
- xinference/thirdparty/deepseek_vl2/serve/inference.py +197 -0
- xinference/thirdparty/deepseek_vl2/utils/__init__.py +18 -0
- xinference/thirdparty/deepseek_vl2/utils/io.py +80 -0
- xinference/thirdparty/megatts3/__init__.py +0 -0
- xinference/thirdparty/megatts3/tts/frontend_function.py +175 -0
- xinference/thirdparty/megatts3/tts/gradio_api.py +93 -0
- xinference/thirdparty/megatts3/tts/infer_cli.py +277 -0
- xinference/thirdparty/megatts3/tts/modules/aligner/whisper_small.py +318 -0
- xinference/thirdparty/megatts3/tts/modules/ar_dur/ar_dur_predictor.py +362 -0
- xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/layers.py +64 -0
- xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/nar_tts_modules.py +73 -0
- xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/rel_transformer.py +403 -0
- xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/rot_transformer.py +649 -0
- xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/seq_utils.py +342 -0
- xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/transformer.py +767 -0
- xinference/thirdparty/megatts3/tts/modules/llm_dit/cfm.py +309 -0
- xinference/thirdparty/megatts3/tts/modules/llm_dit/dit.py +180 -0
- xinference/thirdparty/megatts3/tts/modules/llm_dit/time_embedding.py +44 -0
- xinference/thirdparty/megatts3/tts/modules/llm_dit/transformer.py +230 -0
- xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/diag_gaussian.py +67 -0
- xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/hifigan_modules.py +283 -0
- xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/seanet_encoder.py +38 -0
- xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/wavvae_v3.py +60 -0
- xinference/thirdparty/megatts3/tts/modules/wavvae/encoder/common_modules/conv.py +154 -0
- xinference/thirdparty/megatts3/tts/modules/wavvae/encoder/common_modules/lstm.py +51 -0
- xinference/thirdparty/megatts3/tts/modules/wavvae/encoder/common_modules/seanet.py +126 -0
- xinference/thirdparty/megatts3/tts/utils/audio_utils/align.py +36 -0
- xinference/thirdparty/megatts3/tts/utils/audio_utils/io.py +95 -0
- xinference/thirdparty/megatts3/tts/utils/audio_utils/plot.py +90 -0
- xinference/thirdparty/megatts3/tts/utils/commons/ckpt_utils.py +171 -0
- xinference/thirdparty/megatts3/tts/utils/commons/hparams.py +215 -0
- xinference/thirdparty/megatts3/tts/utils/text_utils/dict.json +1 -0
- xinference/thirdparty/megatts3/tts/utils/text_utils/ph_tone_convert.py +94 -0
- xinference/thirdparty/megatts3/tts/utils/text_utils/split_text.py +90 -0
- xinference/thirdparty/megatts3/tts/utils/text_utils/text_encoder.py +280 -0
- xinference/types.py +10 -0
- xinference/utils.py +54 -0
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/main.0f6523be.css +2 -0
- xinference/web/ui/build/static/css/main.0f6523be.css.map +1 -0
- xinference/web/ui/build/static/js/main.58bd483c.js +3 -0
- xinference/web/ui/build/static/js/main.58bd483c.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3bff8cbe9141f937f4d98879a9771b0f48e0e4e0dbee8e647adbfe23859e7048.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4500b1a622a031011f0a291701e306b87e08cbc749c50e285103536b85b6a914.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/51709f5d3e53bcf19e613662ef9b91fb9174942c5518987a248348dd4e1e0e02.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/69081049f0c7447544b7cfd73dd13d8846c02fe5febe4d81587e95c89a412d5b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b8551e9775a01b28ae674125c688febe763732ea969ae344512e64ea01bf632e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/bf2b211b0d1b6465eff512d64c869d748f803c5651a7c24e48de6ea3484a7bfe.json +1 -0
- xinference/web/ui/src/locales/en.json +2 -1
- xinference/web/ui/src/locales/zh.json +2 -1
- {xinference-1.4.0.dist-info → xinference-1.5.0.dist-info}/METADATA +128 -115
- {xinference-1.4.0.dist-info → xinference-1.5.0.dist-info}/RECORD +124 -63
- {xinference-1.4.0.dist-info → xinference-1.5.0.dist-info}/WHEEL +1 -1
- xinference/web/ui/build/static/css/main.b494ae7e.css +0 -2
- xinference/web/ui/build/static/css/main.b494ae7e.css.map +0 -1
- xinference/web/ui/build/static/js/main.3cea968e.js +0 -3
- xinference/web/ui/build/static/js/main.3cea968e.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/27bcada3ee8f89d21184b359f022fc965f350ffaca52c9814c29f1fc37121173.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/7f59e45e3f268ab8a4788b6fb024cf8dab088736dff22f5a3a39c122a83ab930.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/dcd60488509450bfff37bfff56de2c096d51de17dd00ec60d4db49c8b483ada1.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e547bbb18abb4a474b675a8d5782d25617566bea0af8caa9b836ce5649e2250a.json +0 -1
- xinference/web/ui/build/static/js/{main.3cea968e.js.LICENSE.txt → main.58bd483c.js.LICENSE.txt} +0 -0
- {xinference-1.4.0.dist-info → xinference-1.5.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.4.0.dist-info → xinference-1.5.0.dist-info/licenses}/LICENSE +0 -0
- {xinference-1.4.0.dist-info → xinference-1.5.0.dist-info}/top_level.txt +0 -0
xinference/model/audio/core.py
CHANGED
|
@@ -17,7 +17,7 @@ from collections import defaultdict
|
|
|
17
17
|
from typing import Any, Dict, List, Literal, Optional, Tuple, Union
|
|
18
18
|
|
|
19
19
|
from ...constants import XINFERENCE_CACHE_DIR
|
|
20
|
-
from ..core import CacheableModelSpec, ModelDescription
|
|
20
|
+
from ..core import CacheableModelSpec, ModelDescription, VirtualEnvSettings
|
|
21
21
|
from ..utils import valid_model_revision
|
|
22
22
|
from .chattts import ChatTTSModel
|
|
23
23
|
from .cosyvoice import CosyVoiceModel
|
|
@@ -26,6 +26,7 @@ from .f5tts_mlx import F5TTSMLXModel
|
|
|
26
26
|
from .fish_speech import FishSpeechModel
|
|
27
27
|
from .funasr import FunASRModel
|
|
28
28
|
from .kokoro import KokoroModel
|
|
29
|
+
from .megatts import MegaTTSModel
|
|
29
30
|
from .melotts import MeloTTSModel
|
|
30
31
|
from .whisper import WhisperModel
|
|
31
32
|
from .whisper_mlx import WhisperMLXModel
|
|
@@ -55,6 +56,7 @@ class AudioModelFamilyV1(CacheableModelSpec):
|
|
|
55
56
|
default_model_config: Optional[Dict[str, Any]]
|
|
56
57
|
default_transcription_config: Optional[Dict[str, Any]]
|
|
57
58
|
engine: Optional[str]
|
|
59
|
+
virtualenv: Optional[VirtualEnvSettings]
|
|
58
60
|
|
|
59
61
|
|
|
60
62
|
class AudioModelDescription(ModelDescription):
|
|
@@ -68,6 +70,10 @@ class AudioModelDescription(ModelDescription):
|
|
|
68
70
|
super().__init__(address, devices, model_path=model_path)
|
|
69
71
|
self._model_spec = model_spec
|
|
70
72
|
|
|
73
|
+
@property
|
|
74
|
+
def spec(self):
|
|
75
|
+
return self._model_spec
|
|
76
|
+
|
|
71
77
|
def to_dict(self):
|
|
72
78
|
return {
|
|
73
79
|
"model_type": "audio",
|
|
@@ -178,6 +184,7 @@ def create_audio_model_instance(
|
|
|
178
184
|
F5TTSMLXModel,
|
|
179
185
|
MeloTTSModel,
|
|
180
186
|
KokoroModel,
|
|
187
|
+
MegaTTSModel,
|
|
181
188
|
],
|
|
182
189
|
AudioModelDescription,
|
|
183
190
|
]:
|
|
@@ -195,6 +202,7 @@ def create_audio_model_instance(
|
|
|
195
202
|
F5TTSMLXModel,
|
|
196
203
|
MeloTTSModel,
|
|
197
204
|
KokoroModel,
|
|
205
|
+
MegaTTSModel,
|
|
198
206
|
]
|
|
199
207
|
if model_spec.model_family == "whisper":
|
|
200
208
|
if not model_spec.engine:
|
|
@@ -217,6 +225,8 @@ def create_audio_model_instance(
|
|
|
217
225
|
model = MeloTTSModel(model_uid, model_path, model_spec, **kwargs)
|
|
218
226
|
elif model_spec.model_family == "Kokoro":
|
|
219
227
|
model = KokoroModel(model_uid, model_path, model_spec, **kwargs)
|
|
228
|
+
elif model_spec.model_family == "MegaTTS":
|
|
229
|
+
model = MegaTTSModel(model_uid, model_path, model_spec, **kwargs)
|
|
220
230
|
else:
|
|
221
231
|
raise Exception(f"Unsupported audio model family: {model_spec.model_family}")
|
|
222
232
|
model_description = AudioModelDescription(
|
|
xinference/model/audio/megatts.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
# Copyright 2022-2023 XProbe Inc.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
import io
|
|
15
|
+
import logging
|
|
16
|
+
from io import BytesIO
|
|
17
|
+
from typing import TYPE_CHECKING, Optional
|
|
18
|
+
|
|
19
|
+
if TYPE_CHECKING:
|
|
20
|
+
from .core import AudioModelFamilyV1
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class MegaTTSModel:
|
|
26
|
+
def __init__(
|
|
27
|
+
self,
|
|
28
|
+
model_uid: str,
|
|
29
|
+
model_path: str,
|
|
30
|
+
model_spec: "AudioModelFamilyV1",
|
|
31
|
+
device: Optional[str] = None,
|
|
32
|
+
**kwargs,
|
|
33
|
+
):
|
|
34
|
+
self._model_uid = model_uid
|
|
35
|
+
self._model_path = model_path
|
|
36
|
+
self._model_spec = model_spec
|
|
37
|
+
self._device = device
|
|
38
|
+
self._model = None
|
|
39
|
+
self._vocoder = None
|
|
40
|
+
self._kwargs = kwargs
|
|
41
|
+
|
|
42
|
+
@property
|
|
43
|
+
def model_ability(self):
|
|
44
|
+
return self._model_spec.model_ability
|
|
45
|
+
|
|
46
|
+
def load(self):
|
|
47
|
+
import os
|
|
48
|
+
import sys
|
|
49
|
+
|
|
50
|
+
# The yaml config loaded from model has hard-coded the import paths. please refer to: load_hyperpyyaml
|
|
51
|
+
sys.path.insert(
|
|
52
|
+
0, os.path.join(os.path.dirname(__file__), "../../thirdparty/megatts3")
|
|
53
|
+
)
|
|
54
|
+
# For whisper
|
|
55
|
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../thirdparty"))
|
|
56
|
+
|
|
57
|
+
from tts.infer_cli import MegaTTS3DiTInfer
|
|
58
|
+
|
|
59
|
+
self._model = MegaTTS3DiTInfer(ckpt_root=self._model_path)
|
|
60
|
+
|
|
61
|
+
def speech(
|
|
62
|
+
self,
|
|
63
|
+
input: str,
|
|
64
|
+
voice: str,
|
|
65
|
+
response_format: str = "mp3",
|
|
66
|
+
speed: float = 1.0,
|
|
67
|
+
stream: bool = False,
|
|
68
|
+
**kwargs,
|
|
69
|
+
):
|
|
70
|
+
import soundfile
|
|
71
|
+
|
|
72
|
+
if stream:
|
|
73
|
+
raise Exception("MegaTTS3 does not support stream generation.")
|
|
74
|
+
if voice:
|
|
75
|
+
raise Exception(
|
|
76
|
+
"MegaTTS3 does not support voice, please specify prompt_speech and prompt_latent."
|
|
77
|
+
)
|
|
78
|
+
|
|
79
|
+
prompt_speech: Optional[bytes] = kwargs.pop("prompt_speech", None)
|
|
80
|
+
prompt_latent: Optional[bytes] = kwargs.pop("prompt_latent", None)
|
|
81
|
+
if not prompt_speech:
|
|
82
|
+
raise Exception("Please set prompt_speech for MegaTTS3.")
|
|
83
|
+
if not prompt_latent:
|
|
84
|
+
raise Exception("Please set prompt_latent for MegaTTS3.")
|
|
85
|
+
|
|
86
|
+
assert self._model is not None
|
|
87
|
+
with io.BytesIO(prompt_latent) as prompt_latent_io:
|
|
88
|
+
resource_context = self._model.preprocess(
|
|
89
|
+
prompt_speech, latent_file=prompt_latent_io
|
|
90
|
+
)
|
|
91
|
+
wav_bytes = self._model.forward(
|
|
92
|
+
resource_context,
|
|
93
|
+
input,
|
|
94
|
+
time_step=kwargs.get("time_step", 32),
|
|
95
|
+
p_w=kwargs.get("p_w", 1.6),
|
|
96
|
+
t_w=kwargs.get("t_w", 2.5),
|
|
97
|
+
)
|
|
98
|
+
|
|
99
|
+
# Save the generated audio
|
|
100
|
+
with BytesIO() as out:
|
|
101
|
+
with soundfile.SoundFile(
|
|
102
|
+
out, "w", self._model.sr, 1, format=response_format.upper()
|
|
103
|
+
) as f:
|
|
104
|
+
f.write(wav_bytes)
|
|
105
|
+
return out.getvalue()
|
|
xinference/model/audio/model_spec.json
CHANGED
|
@@ -203,6 +203,21 @@
|
|
|
203
203
|
"merge_length_s": 15
|
|
204
204
|
}
|
|
205
205
|
},
|
|
206
|
+
{
|
|
207
|
+
"model_name": "paraformer-zh",
|
|
208
|
+
"model_family": "funasr",
|
|
209
|
+
"model_id": "funasr/paraformer-zh",
|
|
210
|
+
"model_revision": "5ed094cdfc8f6a9b6b022bd08bc904ef862bc79e",
|
|
211
|
+
"model_ability": "audio-to-text",
|
|
212
|
+
"multilingual": false,
|
|
213
|
+
"default_model_config": {
|
|
214
|
+
"vad_model": "fsmn-vad",
|
|
215
|
+
"punc_model": "ct-punc"
|
|
216
|
+
},
|
|
217
|
+
"default_transcription_config": {
|
|
218
|
+
"batch_size_s": 300
|
|
219
|
+
}
|
|
220
|
+
},
|
|
206
221
|
{
|
|
207
222
|
"model_name": "ChatTTS",
|
|
208
223
|
"model_family": "ChatTTS",
|
|
@@ -216,7 +231,7 @@
|
|
|
216
231
|
"model_family": "CosyVoice",
|
|
217
232
|
"model_id": "FunAudioLLM/CosyVoice-300M",
|
|
218
233
|
"model_revision": "39c4e13d46bd4dfb840d214547623e5fcd2428e2",
|
|
219
|
-
"model_ability": "
|
|
234
|
+
"model_ability": "text-to-audio",
|
|
220
235
|
"multilingual": true
|
|
221
236
|
},
|
|
222
237
|
{
|
|
@@ -346,5 +361,13 @@
|
|
|
346
361
|
"model_revision": "7884269d6fd3f9beabc271b6f1308e5699281fa9",
|
|
347
362
|
"model_ability": "text-to-audio",
|
|
348
363
|
"multilingual": true
|
|
364
|
+
},
|
|
365
|
+
{
|
|
366
|
+
"model_name": "MegaTTS3",
|
|
367
|
+
"model_family": "MegaTTS",
|
|
368
|
+
"model_id": "ByteDance/MegaTTS3",
|
|
369
|
+
"model_revision": "409a7002b006d80f0730fca6f80441b08c10e738",
|
|
370
|
+
"model_ability": "text-to-audio",
|
|
371
|
+
"multilingual": true
|
|
349
372
|
}
|
|
350
373
|
]
|
|
xinference/model/audio/model_spec_modelscope.json
CHANGED
|
@@ -47,6 +47,22 @@
|
|
|
47
47
|
"merge_length_s": 15
|
|
48
48
|
}
|
|
49
49
|
},
|
|
50
|
+
{
|
|
51
|
+
"model_name": "paraformer-zh",
|
|
52
|
+
"model_family": "funasr",
|
|
53
|
+
"model_hub": "modelscope",
|
|
54
|
+
"model_id": "iic/speech_paraformer-large-vad-punc-spk_asr_nat-zh-cn",
|
|
55
|
+
"model_revision": "master",
|
|
56
|
+
"model_ability": "audio-to-text",
|
|
57
|
+
"multilingual": false,
|
|
58
|
+
"default_model_config": {
|
|
59
|
+
"vad_model": "fsmn-vad",
|
|
60
|
+
"punc_model": "ct-punc"
|
|
61
|
+
},
|
|
62
|
+
"default_transcription_config": {
|
|
63
|
+
"batch_size_s": 300
|
|
64
|
+
}
|
|
65
|
+
},
|
|
50
66
|
{
|
|
51
67
|
"model_name": "ChatTTS",
|
|
52
68
|
"model_family": "ChatTTS",
|
|
@@ -62,7 +78,7 @@
|
|
|
62
78
|
"model_hub": "modelscope",
|
|
63
79
|
"model_id": "iic/CosyVoice-300M",
|
|
64
80
|
"model_revision": "master",
|
|
65
|
-
"model_ability": "
|
|
81
|
+
"model_ability": "text-to-audio",
|
|
66
82
|
"multilingual": true
|
|
67
83
|
},
|
|
68
84
|
{
|
|
@@ -109,5 +125,14 @@
|
|
|
109
125
|
"model_revision": "master",
|
|
110
126
|
"model_ability": "text-to-audio",
|
|
111
127
|
"multilingual": true
|
|
128
|
+
},
|
|
129
|
+
{
|
|
130
|
+
"model_name": "MegaTTS3",
|
|
131
|
+
"model_family": "MegaTTS",
|
|
132
|
+
"model_hub": "modelscope",
|
|
133
|
+
"model_id": "ByteDance/MegaTTS3",
|
|
134
|
+
"model_revision": "master",
|
|
135
|
+
"model_ability": "text-to-audio",
|
|
136
|
+
"multilingual": true
|
|
112
137
|
}
|
|
113
138
|
]
|
xinference/model/core.py
CHANGED
|
@@ -30,6 +30,11 @@ class ModelDescription(ABC):
|
|
|
30
30
|
self.devices = devices
|
|
31
31
|
self._model_path = model_path
|
|
32
32
|
|
|
33
|
+
@property
|
|
34
|
+
@abstractmethod
|
|
35
|
+
def spec(self):
|
|
36
|
+
pass
|
|
37
|
+
|
|
33
38
|
def to_dict(self):
|
|
34
39
|
"""
|
|
35
40
|
Return a dict to describe some information about model.
|
|
@@ -155,3 +160,12 @@ class CacheableModelSpec(BaseModel):
|
|
|
155
160
|
model_id: str
|
|
156
161
|
model_revision: Optional[str]
|
|
157
162
|
model_hub: str = "huggingface"
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
class VirtualEnvSettings(BaseModel):
|
|
166
|
+
packages: List[str]
|
|
167
|
+
inherit_pip_config: bool = True
|
|
168
|
+
index_url: Optional[str] = None
|
|
169
|
+
extra_index_url: Optional[str] = None
|
|
170
|
+
find_links: Optional[str] = None
|
|
171
|
+
trusted_host: Optional[str] = None
|
|
xinference/model/embedding/core.py
CHANGED
|
@@ -24,7 +24,7 @@ import torch
|
|
|
24
24
|
from ..._compat import ROOT_KEY, ErrorWrapper, ValidationError
|
|
25
25
|
from ...device_utils import empty_cache
|
|
26
26
|
from ...types import Embedding, EmbeddingData, EmbeddingUsage
|
|
27
|
-
from ..core import CacheableModelSpec, ModelDescription
|
|
27
|
+
from ..core import CacheableModelSpec, ModelDescription, VirtualEnvSettings
|
|
28
28
|
from ..utils import get_cache_dir, is_model_cached
|
|
29
29
|
|
|
30
30
|
logger = logging.getLogger(__name__)
|
|
@@ -57,6 +57,7 @@ class EmbeddingModelSpec(CacheableModelSpec):
|
|
|
57
57
|
model_id: str
|
|
58
58
|
model_revision: Optional[str]
|
|
59
59
|
model_hub: str = "huggingface"
|
|
60
|
+
virtualenv: Optional[VirtualEnvSettings]
|
|
60
61
|
|
|
61
62
|
|
|
62
63
|
class EmbeddingModelDescription(ModelDescription):
|
|
@@ -70,6 +71,10 @@ class EmbeddingModelDescription(ModelDescription):
|
|
|
70
71
|
super().__init__(address, devices, model_path=model_path)
|
|
71
72
|
self._model_spec = model_spec
|
|
72
73
|
|
|
74
|
+
@property
|
|
75
|
+
def spec(self):
|
|
76
|
+
return self._model_spec
|
|
77
|
+
|
|
73
78
|
def to_dict(self):
|
|
74
79
|
return {
|
|
75
80
|
"model_type": "embedding",
|
|
xinference/model/flexible/core.py
CHANGED
|
@@ -20,7 +20,7 @@ from threading import Lock
|
|
|
20
20
|
from typing import Dict, List, Optional, Tuple
|
|
21
21
|
|
|
22
22
|
from ...constants import XINFERENCE_CACHE_DIR, XINFERENCE_MODEL_DIR
|
|
23
|
-
from ..core import CacheableModelSpec, ModelDescription
|
|
23
|
+
from ..core import CacheableModelSpec, ModelDescription, VirtualEnvSettings
|
|
24
24
|
from .utils import get_launcher
|
|
25
25
|
|
|
26
26
|
logger = logging.getLogger(__name__)
|
|
@@ -34,6 +34,7 @@ class FlexibleModelSpec(CacheableModelSpec):
|
|
|
34
34
|
model_uri: Optional[str]
|
|
35
35
|
launcher: str
|
|
36
36
|
launcher_args: Optional[str]
|
|
37
|
+
virtualenv: Optional[VirtualEnvSettings]
|
|
37
38
|
|
|
38
39
|
def parser_args(self):
|
|
39
40
|
return json.loads(self.launcher_args)
|
|
@@ -50,6 +51,10 @@ class FlexibleModelDescription(ModelDescription):
|
|
|
50
51
|
super().__init__(address, devices, model_path=model_path)
|
|
51
52
|
self._model_spec = model_spec
|
|
52
53
|
|
|
54
|
+
@property
|
|
55
|
+
def spec(self):
|
|
56
|
+
return self._model_spec
|
|
57
|
+
|
|
53
58
|
def to_dict(self):
|
|
54
59
|
return {
|
|
55
60
|
"model_type": "flexible",
|
xinference/model/image/core.py
CHANGED
|
@@ -21,7 +21,7 @@ from typing import Dict, List, Literal, Optional, Tuple, Union
|
|
|
21
21
|
|
|
22
22
|
from ...constants import XINFERENCE_CACHE_DIR
|
|
23
23
|
from ...types import PeftModelConfig
|
|
24
|
-
from ..core import CacheableModelSpec, ModelDescription
|
|
24
|
+
from ..core import CacheableModelSpec, ModelDescription, VirtualEnvSettings
|
|
25
25
|
from ..utils import (
|
|
26
26
|
IS_NEW_HUGGINGFACE_HUB,
|
|
27
27
|
retry_download,
|
|
@@ -59,6 +59,7 @@ class ImageModelFamilyV1(CacheableModelSpec):
|
|
|
59
59
|
gguf_model_id: Optional[str]
|
|
60
60
|
gguf_quantizations: Optional[List[str]]
|
|
61
61
|
gguf_model_file_name_template: Optional[str]
|
|
62
|
+
virtualenv: Optional[VirtualEnvSettings]
|
|
62
63
|
|
|
63
64
|
|
|
64
65
|
class ImageModelDescription(ModelDescription):
|
|
@@ -72,6 +73,10 @@ class ImageModelDescription(ModelDescription):
|
|
|
72
73
|
super().__init__(address, devices, model_path=model_path)
|
|
73
74
|
self._model_spec = model_spec
|
|
74
75
|
|
|
76
|
+
@property
|
|
77
|
+
def spec(self):
|
|
78
|
+
return self._model_spec
|
|
79
|
+
|
|
75
80
|
def to_dict(self):
|
|
76
81
|
if self._model_spec.controlnet is not None:
|
|
77
82
|
controlnet = [cn.dict() for cn in self._model_spec.controlnet]
|
|
xinference/model/image/model_spec.json
CHANGED
|
@@ -339,6 +339,22 @@
|
|
|
339
339
|
"model_revision": "cf6b7386bc89a54f09785612ba74cb12de6fa17c",
|
|
340
340
|
"model_ability": [
|
|
341
341
|
"ocr"
|
|
342
|
-
]
|
|
342
|
+
],
|
|
343
|
+
"virtualenv": {
|
|
344
|
+
"packages": [
|
|
345
|
+
"transformers==4.37.2",
|
|
346
|
+
"httpx==0.24.0",
|
|
347
|
+
"deepspeed==0.12.3",
|
|
348
|
+
"peft==0.4.0",
|
|
349
|
+
"tiktoken==0.6.0",
|
|
350
|
+
"bitsandbytes==0.41.0",
|
|
351
|
+
"scikit-learn==1.2.2",
|
|
352
|
+
"sentencepiece==0.1.99",
|
|
353
|
+
"einops==0.6.1",
|
|
354
|
+
"einops-exts==0.0.4",
|
|
355
|
+
"timm==0.6.13",
|
|
356
|
+
"numpy==1.26.4"
|
|
357
|
+
]
|
|
358
|
+
}
|
|
343
359
|
}
|
|
344
360
|
]
|
|
xinference/model/image/model_spec_modelscope.json
CHANGED
|
@@ -315,6 +315,22 @@
|
|
|
315
315
|
"model_hub": "modelscope",
|
|
316
316
|
"model_ability": [
|
|
317
317
|
"ocr"
|
|
318
|
-
]
|
|
318
|
+
],
|
|
319
|
+
"virtualenv": {
|
|
320
|
+
"packages": [
|
|
321
|
+
"transformers==4.37.2",
|
|
322
|
+
"httpx==0.24.0",
|
|
323
|
+
"deepspeed==0.12.3",
|
|
324
|
+
"peft==0.4.0",
|
|
325
|
+
"tiktoken==0.6.0",
|
|
326
|
+
"bitsandbytes==0.41.0",
|
|
327
|
+
"scikit-learn==1.2.2",
|
|
328
|
+
"sentencepiece==0.1.99",
|
|
329
|
+
"einops==0.6.1",
|
|
330
|
+
"einops-exts==0.0.4",
|
|
331
|
+
"timm==0.6.13",
|
|
332
|
+
"numpy==1.26.4"
|
|
333
|
+
]
|
|
334
|
+
}
|
|
319
335
|
}
|
|
320
336
|
]
|
xinference/model/llm/__init__.py
CHANGED
|
@@ -132,7 +132,7 @@ def _install():
|
|
|
132
132
|
from .llama_cpp.core import LlamaCppChatModel, LlamaCppModel, XllamaCppModel
|
|
133
133
|
from .lmdeploy.core import LMDeployChatModel, LMDeployModel
|
|
134
134
|
from .mlx.core import MLXChatModel, MLXModel, MLXVisionModel
|
|
135
|
-
from .sglang.core import SGLANGChatModel, SGLANGModel
|
|
135
|
+
from .sglang.core import SGLANGChatModel, SGLANGModel, SGLANGVisionModel
|
|
136
136
|
from .transformers.chatglm import ChatglmPytorchChatModel
|
|
137
137
|
from .transformers.cogagent import CogAgentChatModel
|
|
138
138
|
from .transformers.cogvlm2 import CogVLM2Model
|
|
@@ -143,16 +143,15 @@ def _install():
|
|
|
143
143
|
DeepSeekV2PytorchModel,
|
|
144
144
|
)
|
|
145
145
|
from .transformers.deepseek_vl import DeepSeekVLChatModel
|
|
146
|
+
from .transformers.deepseek_vl2 import DeepSeekVL2ChatModel
|
|
146
147
|
from .transformers.gemma3 import Gemma3ChatModel, Gemma3TextChatModel
|
|
147
148
|
from .transformers.glm4v import Glm4VModel
|
|
148
149
|
from .transformers.glm_edge_v import GlmEdgeVModel
|
|
149
|
-
from .transformers.intern_vl import InternVLChatModel
|
|
150
150
|
from .transformers.internlm2 import Internlm2PytorchChatModel
|
|
151
151
|
from .transformers.minicpmv25 import MiniCPMV25Model
|
|
152
152
|
from .transformers.minicpmv26 import MiniCPMV26Model
|
|
153
153
|
from .transformers.opt import OptPytorchModel
|
|
154
154
|
from .transformers.qwen2_audio import Qwen2AudioChatModel
|
|
155
|
-
from .transformers.qwen2_vl import Qwen2VLChatModel
|
|
156
155
|
from .transformers.qwen_vl import QwenVLChatModel
|
|
157
156
|
from .transformers.yi_vl import YiVLChatModel
|
|
158
157
|
from .vllm.core import VLLMChatModel, VLLMModel, VLLMVisionModel
|
|
@@ -173,7 +172,7 @@ def _install():
|
|
|
173
172
|
XllamaCppModel,
|
|
174
173
|
]
|
|
175
174
|
)
|
|
176
|
-
SGLANG_CLASSES.extend([SGLANGModel, SGLANGChatModel])
|
|
175
|
+
SGLANG_CLASSES.extend([SGLANGModel, SGLANGChatModel, SGLANGVisionModel])
|
|
177
176
|
VLLM_CLASSES.extend([VLLMModel, VLLMChatModel, VLLMVisionModel])
|
|
178
177
|
MLX_CLASSES.extend([MLXModel, MLXChatModel, MLXVisionModel])
|
|
179
178
|
LMDEPLOY_CLASSES.extend([LMDeployModel, LMDeployChatModel])
|
|
@@ -183,11 +182,10 @@ def _install():
|
|
|
183
182
|
PytorchChatModel,
|
|
184
183
|
Internlm2PytorchChatModel,
|
|
185
184
|
QwenVLChatModel,
|
|
186
|
-
Qwen2VLChatModel,
|
|
187
185
|
Qwen2AudioChatModel,
|
|
188
186
|
YiVLChatModel,
|
|
189
187
|
DeepSeekVLChatModel,
|
|
190
|
-
|
|
188
|
+
DeepSeekVL2ChatModel,
|
|
191
189
|
PytorchModel,
|
|
192
190
|
CogVLM2Model,
|
|
193
191
|
CogVLM2VideoModel,
|
xinference/model/llm/core.py
CHANGED
|
@@ -54,6 +54,7 @@ class LLM(abc.ABC):
|
|
|
54
54
|
**kwargs,
|
|
55
55
|
):
|
|
56
56
|
self.model_uid, self.rep_id = parse_replica_model_uid(replica_model_uid)
|
|
57
|
+
self.raw_model_uid = replica_model_uid
|
|
57
58
|
self.model_family = model_family
|
|
58
59
|
self.model_spec = model_spec
|
|
59
60
|
self.quantization = quantization
|
|
@@ -143,6 +144,10 @@ class LLMDescription(ModelDescription):
|
|
|
143
144
|
self._llm_spec = llm_spec
|
|
144
145
|
self._quantization = quantization
|
|
145
146
|
|
|
147
|
+
@property
|
|
148
|
+
def spec(self):
|
|
149
|
+
return self._llm_family
|
|
150
|
+
|
|
146
151
|
def to_dict(self):
|
|
147
152
|
return {
|
|
148
153
|
"model_type": "LLM",
|
|
xinference/model/llm/llama_cpp/core.py
CHANGED
|
@@ -36,7 +36,7 @@ from ..utils import DEEPSEEK_TOOL_CALL_FAMILY, QWEN_TOOL_CALL_FAMILY, ChatModelM
|
|
|
36
36
|
|
|
37
37
|
logger = logging.getLogger(__name__)
|
|
38
38
|
|
|
39
|
-
USE_XLLAMACPP = bool(int(os.environ.get("USE_XLLAMACPP",
|
|
39
|
+
USE_XLLAMACPP = bool(int(os.environ.get("USE_XLLAMACPP", 1)))
|
|
40
40
|
|
|
41
41
|
|
|
42
42
|
class _Done:
|
|
@@ -142,24 +142,38 @@ class XllamaCppModel(LLM, ChatModelMixin):
|
|
|
142
142
|
|
|
143
143
|
if os.path.isfile(self.model_path):
|
|
144
144
|
# mostly passed from --model_path
|
|
145
|
-
model_path =
|
|
145
|
+
model_path = self.model_path
|
|
146
146
|
else:
|
|
147
147
|
# handle legacy cache.
|
|
148
|
-
|
|
149
|
-
|
|
148
|
+
if (
|
|
149
|
+
self.model_spec.model_file_name_split_template
|
|
150
|
+
and self.model_spec.quantization_parts
|
|
151
|
+
):
|
|
152
|
+
part = self.model_spec.quantization_parts[self.quantization]
|
|
153
|
+
model_path = os.path.join(
|
|
154
|
+
self.model_path,
|
|
155
|
+
self.model_spec.model_file_name_split_template.format(
|
|
156
|
+
quantization=self.quantization, part=part[0]
|
|
157
|
+
),
|
|
158
|
+
)
|
|
159
|
+
else:
|
|
160
|
+
model_path = os.path.join(
|
|
150
161
|
self.model_path,
|
|
151
162
|
self.model_spec.model_file_name_template.format(
|
|
152
163
|
quantization=self.quantization
|
|
153
164
|
),
|
|
154
165
|
)
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
model_path = legacy_model_file_path
|
|
166
|
+
legacy_model_file_path = os.path.join(self.model_path, "model.bin")
|
|
167
|
+
if os.path.exists(legacy_model_file_path):
|
|
168
|
+
model_path = legacy_model_file_path
|
|
159
169
|
|
|
160
170
|
try:
|
|
161
171
|
params = CommonParams()
|
|
162
|
-
|
|
172
|
+
# Compatible with xllamacpp changes
|
|
173
|
+
try:
|
|
174
|
+
params.model = model_path
|
|
175
|
+
except Exception:
|
|
176
|
+
params.model.path = model_path
|
|
163
177
|
if self.model_family.chat_template:
|
|
164
178
|
params.chat_template = self.model_family.chat_template
|
|
165
179
|
# This is the default value, could be overwritten by _llamacpp_model_config
|
|
@@ -302,7 +316,12 @@ class XllamaCppModel(LLM, ChatModelMixin):
|
|
|
302
316
|
while (r := q.get()) is not _Done:
|
|
303
317
|
if type(r) is _Error:
|
|
304
318
|
raise Exception("Got error in chat stream: %s", r.msg)
|
|
305
|
-
|
|
319
|
+
# Get valid keys (O(1) lookup)
|
|
320
|
+
chunk_keys = ChatCompletionChunk.__annotations__
|
|
321
|
+
# The chunk may contain additional keys (e.g., system_fingerprint),
|
|
322
|
+
# which might not conform to OpenAI/DeepSeek formats.
|
|
323
|
+
# Filter out keys that are not part of ChatCompletionChunk.
|
|
324
|
+
yield {key: r[key] for key in chunk_keys if key in r}
|
|
306
325
|
|
|
307
326
|
return self._to_chat_completion_chunks(
|
|
308
327
|
_to_iterator(), self.reasoning_parser
|
|
@@ -410,20 +429,30 @@ class LlamaCppModel(LLM):
|
|
|
410
429
|
|
|
411
430
|
if os.path.isfile(self.model_path):
|
|
412
431
|
# mostly passed from --model_path
|
|
413
|
-
model_path =
|
|
432
|
+
model_path = self.model_path
|
|
414
433
|
else:
|
|
415
434
|
# handle legacy cache.
|
|
416
|
-
|
|
417
|
-
|
|
435
|
+
if (
|
|
436
|
+
self.model_spec.model_file_name_split_template
|
|
437
|
+
and self.model_spec.quantization_parts
|
|
438
|
+
):
|
|
439
|
+
part = self.model_spec.quantization_parts[self.quantization]
|
|
440
|
+
model_path = os.path.join(
|
|
441
|
+
self.model_path,
|
|
442
|
+
self.model_spec.model_file_name_split_template.format(
|
|
443
|
+
quantization=self.quantization, part=part[0]
|
|
444
|
+
),
|
|
445
|
+
)
|
|
446
|
+
else:
|
|
447
|
+
model_path = os.path.join(
|
|
418
448
|
self.model_path,
|
|
419
449
|
self.model_spec.model_file_name_template.format(
|
|
420
450
|
quantization=self.quantization
|
|
421
451
|
),
|
|
422
452
|
)
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
model_path = legacy_model_file_path
|
|
453
|
+
legacy_model_file_path = os.path.join(self.model_path, "model.bin")
|
|
454
|
+
if os.path.exists(legacy_model_file_path):
|
|
455
|
+
model_path = legacy_model_file_path
|
|
427
456
|
|
|
428
457
|
try:
|
|
429
458
|
self._llm = Llama(
|