xinference-1.4.1-py3-none-any.whl → xinference-1.5.0-py3-none-any.whl
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +50 -1
- xinference/client/restful/restful_client.py +82 -2
- xinference/constants.py +3 -0
- xinference/core/chat_interface.py +297 -83
- xinference/core/model.py +1 -0
- xinference/core/progress_tracker.py +16 -8
- xinference/core/supervisor.py +45 -1
- xinference/core/worker.py +262 -37
- xinference/deploy/cmdline.py +33 -1
- xinference/model/audio/core.py +11 -1
- xinference/model/audio/megatts.py +105 -0
- xinference/model/audio/model_spec.json +24 -1
- xinference/model/audio/model_spec_modelscope.json +26 -1
- xinference/model/core.py +14 -0
- xinference/model/embedding/core.py +6 -1
- xinference/model/flexible/core.py +6 -1
- xinference/model/image/core.py +6 -1
- xinference/model/image/model_spec.json +17 -1
- xinference/model/image/model_spec_modelscope.json +17 -1
- xinference/model/llm/__init__.py +0 -4
- xinference/model/llm/core.py +4 -0
- xinference/model/llm/llama_cpp/core.py +40 -16
- xinference/model/llm/llm_family.json +413 -84
- xinference/model/llm/llm_family.py +24 -1
- xinference/model/llm/llm_family_modelscope.json +447 -0
- xinference/model/llm/mlx/core.py +16 -2
- xinference/model/llm/transformers/__init__.py +14 -0
- xinference/model/llm/transformers/core.py +30 -6
- xinference/model/llm/transformers/gemma3.py +17 -2
- xinference/model/llm/transformers/intern_vl.py +28 -18
- xinference/model/llm/transformers/minicpmv26.py +21 -2
- xinference/model/llm/transformers/qwen-omni.py +308 -0
- xinference/model/llm/transformers/qwen2_audio.py +1 -1
- xinference/model/llm/transformers/qwen2_vl.py +20 -4
- xinference/model/llm/utils.py +11 -1
- xinference/model/llm/vllm/core.py +35 -0
- xinference/model/llm/vllm/distributed_executor.py +8 -2
- xinference/model/rerank/core.py +6 -1
- xinference/model/utils.py +118 -1
- xinference/model/video/core.py +6 -1
- xinference/thirdparty/megatts3/__init__.py +0 -0
- xinference/thirdparty/megatts3/tts/frontend_function.py +175 -0
- xinference/thirdparty/megatts3/tts/gradio_api.py +93 -0
- xinference/thirdparty/megatts3/tts/infer_cli.py +277 -0
- xinference/thirdparty/megatts3/tts/modules/aligner/whisper_small.py +318 -0
- xinference/thirdparty/megatts3/tts/modules/ar_dur/ar_dur_predictor.py +362 -0
- xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/layers.py +64 -0
- xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/nar_tts_modules.py +73 -0
- xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/rel_transformer.py +403 -0
- xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/rot_transformer.py +649 -0
- xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/seq_utils.py +342 -0
- xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/transformer.py +767 -0
- xinference/thirdparty/megatts3/tts/modules/llm_dit/cfm.py +309 -0
- xinference/thirdparty/megatts3/tts/modules/llm_dit/dit.py +180 -0
- xinference/thirdparty/megatts3/tts/modules/llm_dit/time_embedding.py +44 -0
- xinference/thirdparty/megatts3/tts/modules/llm_dit/transformer.py +230 -0
- xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/diag_gaussian.py +67 -0
- xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/hifigan_modules.py +283 -0
- xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/seanet_encoder.py +38 -0
- xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/wavvae_v3.py +60 -0
- xinference/thirdparty/megatts3/tts/modules/wavvae/encoder/common_modules/conv.py +154 -0
- xinference/thirdparty/megatts3/tts/modules/wavvae/encoder/common_modules/lstm.py +51 -0
- xinference/thirdparty/megatts3/tts/modules/wavvae/encoder/common_modules/seanet.py +126 -0
- xinference/thirdparty/megatts3/tts/utils/audio_utils/align.py +36 -0
- xinference/thirdparty/megatts3/tts/utils/audio_utils/io.py +95 -0
- xinference/thirdparty/megatts3/tts/utils/audio_utils/plot.py +90 -0
- xinference/thirdparty/megatts3/tts/utils/commons/ckpt_utils.py +171 -0
- xinference/thirdparty/megatts3/tts/utils/commons/hparams.py +215 -0
- xinference/thirdparty/megatts3/tts/utils/text_utils/dict.json +1 -0
- xinference/thirdparty/megatts3/tts/utils/text_utils/ph_tone_convert.py +94 -0
- xinference/thirdparty/megatts3/tts/utils/text_utils/split_text.py +90 -0
- xinference/thirdparty/megatts3/tts/utils/text_utils/text_encoder.py +280 -0
- xinference/types.py +10 -0
- xinference/utils.py +54 -0
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/main.0f6523be.css +2 -0
- xinference/web/ui/build/static/css/main.0f6523be.css.map +1 -0
- xinference/web/ui/build/static/js/main.58bd483c.js +3 -0
- xinference/web/ui/build/static/js/main.58bd483c.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3bff8cbe9141f937f4d98879a9771b0f48e0e4e0dbee8e647adbfe23859e7048.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4500b1a622a031011f0a291701e306b87e08cbc749c50e285103536b85b6a914.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/51709f5d3e53bcf19e613662ef9b91fb9174942c5518987a248348dd4e1e0e02.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/69081049f0c7447544b7cfd73dd13d8846c02fe5febe4d81587e95c89a412d5b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b8551e9775a01b28ae674125c688febe763732ea969ae344512e64ea01bf632e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/bf2b211b0d1b6465eff512d64c869d748f803c5651a7c24e48de6ea3484a7bfe.json +1 -0
- xinference/web/ui/src/locales/en.json +2 -1
- xinference/web/ui/src/locales/zh.json +2 -1
- {xinference-1.4.1.dist-info → xinference-1.5.0.dist-info}/METADATA +127 -114
- {xinference-1.4.1.dist-info → xinference-1.5.0.dist-info}/RECORD +96 -60
- {xinference-1.4.1.dist-info → xinference-1.5.0.dist-info}/WHEEL +1 -1
- xinference/web/ui/build/static/css/main.b494ae7e.css +0 -2
- xinference/web/ui/build/static/css/main.b494ae7e.css.map +0 -1
- xinference/web/ui/build/static/js/main.5ca4eea1.js +0 -3
- xinference/web/ui/build/static/js/main.5ca4eea1.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0f0967acaec5df1d45b80010949c258d64297ebbb0f44b8bb3afcbd45c6f0ec4.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/27bcada3ee8f89d21184b359f022fc965f350ffaca52c9814c29f1fc37121173.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/68249645124f37d01eef83b1d897e751f895bea919b6fb466f907c1f87cebc84.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e547bbb18abb4a474b675a8d5782d25617566bea0af8caa9b836ce5649e2250a.json +0 -1
- /xinference/web/ui/build/static/js/{main.5ca4eea1.js.LICENSE.txt → main.58bd483c.js.LICENSE.txt} +0 -0
- {xinference-1.4.1.dist-info → xinference-1.5.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.4.1.dist-info → xinference-1.5.0.dist-info/licenses}/LICENSE +0 -0
- {xinference-1.4.1.dist-info → xinference-1.5.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/transformers/intern_vl.py
CHANGED

@@ -19,14 +19,14 @@ from typing import Dict, Iterator, List, Optional, Union
 import torch

 from ....types import ChatCompletion, ChatCompletionChunk
-from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
 from ..utils import (
     _decode_image,
     generate_chat_completion,
     generate_completion_chunk,
     parse_messages,
 )
-from .core import PytorchChatModel, PytorchGenerateConfig
+from .core import PytorchChatModel, PytorchGenerateConfig, register_non_default_model
 from .utils import cache_clean

 logger = logging.getLogger(__name__)

@@ -232,6 +232,10 @@ def _load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=
     return pixel_values, num_patches_list


+@register_transformer
+@register_non_default_model(
+    "internvl-chat", "internvl2", "Internvl2.5", "Internvl2.5-MPO", "InternVL3"
+)
 class InternVLChatModel(PytorchChatModel):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

@@ -257,6 +261,8 @@ class InternVLChatModel(PytorchChatModel):
     def _split_model(self):
         import math

+        from transformers import AutoConfig
+
         device_map = {}
         world_size = torch.cuda.device_count()
         # single gpu

@@ -265,22 +271,26 @@ class InternVLChatModel(PytorchChatModel):
         model_size = f"{self.model_spec.model_size_in_billions}B"
         model_name = self.model_family.model_name.lower().replace("-mpo", "")
         model_name = f"{model_name}-{model_size}"
[16 removed lines (old lines 268-283) were not captured in this view]
+        if "internvl3" in model_name.lower():
+            config = AutoConfig.from_pretrained(self.model_path, trust_remote_code=True)
+            num_layers = config.llm_config.num_hidden_layers
+        else:
+            num_layers = {
+                "internvl2-1B": 24,
+                "internvl2-2B": 24,
+                "internvl2-4B": 32,
+                "internvl2-8B": 32,
+                "internvl2-26B": 48,
+                "internvl2-40B": 60,
+                "internvl2-76B": 80,
+                "internvl2.5-1B": 24,
+                "internvl2.5-2B": 24,
+                "internvl2.5-4B": 36,
+                "internvl2.5-8B": 32,
+                "internvl2.5-26B": 48,
+                "internvl2.5-38B": 64,
+                "internvl2.5-78B": 80,
+            }[model_name]

         # Since the first GPU will be used for ViT, treat it as half a GPU.
         num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
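Note on the `_split_model` change above: the language-model layers are spread across all visible GPUs, with GPU 0 counted as only half a GPU because it also hosts the vision tower, and for InternVL3 the layer count now comes from `config.llm_config.num_hidden_layers` instead of the hard-coded table. A minimal sketch of that split, assuming the usual InternVL `language_model.model.layers.*` device-map keys (the allocation loop itself is not shown in this hunk):

import math

def sketch_device_map(num_layers: int, world_size: int) -> dict:
    # GPU 0 also runs the ViT, so it is treated as half a GPU (same formula as above).
    device_map = {}
    per_gpu = math.ceil(num_layers / (world_size - 0.5))
    counts = [per_gpu] * world_size
    counts[0] = math.ceil(per_gpu * 0.5)  # assumed: GPU 0 takes roughly half a share
    layer = 0
    for gpu_id, n in enumerate(counts):
        for _ in range(n):
            if layer >= num_layers:
                break
            device_map[f"language_model.model.layers.{layer}"] = gpu_id  # assumed key naming
            layer += 1
    return device_map

# Example: 32 layers on 2 GPUs -> 11 layers on GPU 0, 21 on GPU 1.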
xinference/model/llm/transformers/minicpmv26.py
CHANGED

@@ -20,7 +20,12 @@ import torch
 from PIL import Image

 from ....core.scheduler import InferenceRequest
-from ....types import
+from ....types import (
+    ChatCompletion,
+    ChatCompletionChunk,
+    CompletionChunk,
+    PytorchModelConfig,
+)
 from ...utils import select_device
 from ..llm_family import LLMFamilyV1, LLMSpecV1
 from ..utils import (

@@ -52,6 +57,15 @@ class MiniCPMV26Model(PytorchChatModel):
             return True
         return False

+    def _sanitize_model_config(
+        self, pytorch_model_config: Optional[PytorchModelConfig]
+    ) -> PytorchModelConfig:
+        pytorch_model_config = super()._sanitize_model_config(pytorch_model_config)
+        assert pytorch_model_config is not None
+        pytorch_model_config.setdefault("min_pixels", 256 * 28 * 28)
+        pytorch_model_config.setdefault("max_pixels", 1280 * 28 * 28)
+        return pytorch_model_config
+
     def _get_model_class(self):
         from transformers import AutoModel


@@ -99,8 +113,13 @@ class MiniCPMV26Model(PytorchChatModel):
             self.model_path,
             trust_remote_code=True,
         )
+        min_pixels = self._pytorch_model_config.get("min_pixels")
+        max_pixels = self._pytorch_model_config.get("max_pixels")
         self._processor = AutoProcessor.from_pretrained(
-            self.model_path,
+            self.model_path,
+            trust_remote_code=True,
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
         )
         self._device = self._model.device
         self._save_tensorizer()
xinference/model/llm/transformers/qwen-omni.py
ADDED

@@ -0,0 +1,308 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import base64
+import importlib.util
+import io
+import logging
+import sys
+import time
+import uuid
+from typing import Dict, Iterator, List, Optional, Union
+
+from ....model.utils import select_device
+from ....types import (
+    ChatCompletion,
+    ChatCompletionAudio,
+    ChatCompletionChoice,
+    ChatCompletionChunk,
+    ChatCompletionMessage,
+    CompletionChunk,
+    CompletionUsage,
+)
+from ..llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
+from ..utils import generate_completion_chunk
+from .core import PytorchChatModel, PytorchGenerateConfig, register_non_default_model
+from .utils import cache_clean
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_SYSTEM_PROMPT = (
+    "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, "
+    "capable of perceiving auditory and visual inputs, as well as generating text and speech."
+)
+
+
+@register_transformer
+@register_non_default_model("qwen2.5-omni")
+class Qwen2_5OmniChatModel(PytorchChatModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self._tokenizer = None
+        self._model = None
+        self._device = None
+        self._processor = None
+
+    @classmethod
+    def match(
+        cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
+    ) -> bool:
+        if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
+            return False
+        llm_family = model_family.model_family or model_family.model_name
+        if "qwen2.5-omni".lower() in llm_family.lower():
+            return True
+        return False
+
+    def load(self):
+        from transformers import (
+            Qwen2_5OmniForConditionalGeneration,
+            Qwen2_5OmniProcessor,
+        )
+
+        device = self._pytorch_model_config.get("device", "auto")
+        device = select_device(device)
+        self._device = device
+        # for multiple GPU, set back to auto to make multiple devices work
+        device = "auto" if device == "cuda" else device
+        flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
+        kwargs = (
+            {}
+            if not flash_attn_installed
+            else {"attn_implementation": "flash_attention_2"}
+        )
+        logger.debug("Loading model with extra kwargs: %s", kwargs)
+
+        self._processor = Qwen2_5OmniProcessor.from_pretrained(
+            self.model_path, trust_remote_code=True
+        )
+        self._tokenizer = self._processor.tokenizer
+        self._model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
+            self.model_path,
+            torch_dtype="auto",
+            device_map=device,
+            trust_remote_code=True,
+            **kwargs,
+        )
+
+    @cache_clean
+    def chat(
+        self,
+        messages: List[Dict],
+        generate_config: Optional[PytorchGenerateConfig] = None,
+    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+        messages = self._transform_messages(messages)
+
+        generate_config = generate_config if generate_config else {}
+
+        stream = generate_config.get("stream", False) if generate_config else False
+
+        if stream:
+            it = self._generate_stream(messages, generate_config)
+            return self._to_chat_completion_chunks(it)
+        else:
+            c = self._generate(messages, generate_config)
+            return c
+
+    def _transform_messages(
+        self,
+        messages: Union[List[ChatCompletionMessage], List[dict]],
+    ):
+        messages = super()._transform_messages(messages)
+        if messages[0]["role"] != "system":
+            messages.insert(
+                0,
+                {
+                    "role": "system",
+                    "content": [{"type": "text", "text": DEFAULT_SYSTEM_PROMPT}],  # type: ignore
+                },
+            )
+        else:
+            logger.debug("Force to set system prompt")
+            messages[0]["content"] = [{"type": "text", "text": DEFAULT_SYSTEM_PROMPT}]  # type: ignore
+        return messages
+
+    def _generate(
+        self, messages: List, config: PytorchGenerateConfig = {}
+    ) -> ChatCompletion:
+        import soundfile as sf
+        from qwen_omni_utils import process_mm_info
+
+        use_audio_in_video = config.get("use_audio_in_video", True)
+        voice = config.get("voice", "Chelsie")
+
+        text = self._processor.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        audios, images, videos = process_mm_info(
+            messages, use_audio_in_video=use_audio_in_video
+        )
+        logger.debug(
+            "Text, audio, image, video: %s, %s, %s, %s", text, audios, images, videos
+        )
+        inputs = self._processor(
+            text=text,
+            images=images,
+            audio=audios,
+            videos=videos,
+            padding=True,
+            return_tensors="pt",
+            use_audio_in_video=use_audio_in_video,
+        )
+        inputs = inputs.to(self._device)
+
+        # Inference: Generation of the output
+        generated_ids, audio = self._model.generate(
+            **inputs,
+            speaker=voice,
+            max_new_tokens=config.get("max_tokens", 512),
+            temperature=config.get("temperature", 1),
+            use_audio_in_video=use_audio_in_video,
+        )
+        generated_ids_trimmed = [
+            out_ids[len(in_ids) :]
+            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        output_text = self._processor.batch_decode(
+            generated_ids_trimmed,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False,
+        )[0]
+
+        wav_io = io.BytesIO()
+        sf.write(
+            wav_io,
+            audio.reshape(-1).detach().cpu().numpy(),
+            samplerate=24000,
+            format="WAV",
+        )
+        wav_bytes = wav_io.getvalue()
+        audio_content = base64.b64encode(wav_bytes).decode()
+
+        return ChatCompletion(
+            id="chat" + str(uuid.uuid1()),
+            object="chat.completion",
+            created=int(time.time()),
+            model=self.model_uid,
+            choices=[
+                ChatCompletionChoice(
+                    index=0,
+                    message={
+                        "role": "assistant",
+                        "content": output_text,
+                        "audio": ChatCompletionAudio(
+                            id="audio" + str(uuid.uuid1()),
+                            data=audio_content,
+                            expires_at=int(time.time()),
+                            transcript="",
+                        ),
+                    },
+                    finish_reason="stop",
+                )
+            ],
+            usage=CompletionUsage(
+                prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
+            ),
+        )
+
+    def _generate_stream(
+        self, messages: List, config: PytorchGenerateConfig = {}
+    ) -> Iterator[CompletionChunk]:
+        from threading import Thread
+
+        from qwen_omni_utils import process_mm_info
+        from transformers import TextIteratorStreamer
+
+        use_audio_in_video = config.get("use_audio_in_video", True)
+        voice = config.get("voice", "Chelsie")
+
+        text = self._processor.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        audios, images, videos = process_mm_info(
+            messages, use_audio_in_video=use_audio_in_video
+        )
+        logger.debug(
+            "Text, audio, image, video: %s, %s, %s, %s", text, audios, images, videos
+        )
+        inputs = self._processor(
+            text=text,
+            images=images,
+            audio=audios,
+            videos=videos,
+            padding=True,
+            return_tensors="pt",
+            use_audio_in_video=use_audio_in_video,
+        )
+        inputs = inputs.to(self._device)
+
+        tokenizer = self._tokenizer
+        streamer = TextIteratorStreamer(
+            tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True
+        )
+
+        # TODO(xuye): Cannot find a way to streaming output,
+        # will implement it when it's supported
+
+        gen_kwargs = {
+            "max_new_tokens": config.get("max_tokens", 512),
+            "temperature": config.get("temperature", 1),
+            "streamer": streamer,
+            "speaker": voice,
+            **inputs,
+        }
+        error = None
+
+        def model_generate():
+            try:
+                return self._model.generate(**gen_kwargs)
+            except Exception:
+                nonlocal error
+                error = sys.exc_info()
+                streamer.end()
+                raise
+
+        thread = Thread(target=model_generate)
+        thread.start()
+
+        completion_id = str(uuid.uuid1())
+        for new_text in streamer:
+            yield generate_completion_chunk(
+                chunk_text=new_text,
+                finish_reason=None,
+                chunk_id=completion_id,
+                model_uid=self.model_uid,
+                prompt_tokens=-1,
+                completion_tokens=-1,
+                total_tokens=-1,
+                has_choice=True,
+                has_content=True,
+            )
+
+        if error:
+            _, err, tb = error  # type: ignore
+            raise err.with_traceback(tb)
+
+        yield generate_completion_chunk(
+            chunk_text=None,
+            finish_reason="stop",
+            chunk_id=completion_id,
+            model_uid=self.model_uid,
+            prompt_tokens=-1,
+            completion_tokens=-1,
+            total_tokens=-1,
+            has_choice=True,
+            has_content=False,
+        )
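The new `qwen-omni.py` model returns generated speech as a base64-encoded WAV inside `ChatCompletionAudio.data`, alongside the usual text content. A minimal client-side sketch for saving that audio, assuming a locally running endpoint and that extra `generate_config` keys such as `voice` are forwarded to the model as the code above expects (endpoint URL and model UID are placeholders):

import base64

from xinference.client import Client

client = Client("http://localhost:9997")   # placeholder endpoint
model = client.get_model("qwen2.5-omni")   # placeholder model UID

response = model.chat(
    messages=[{"role": "user", "content": "Introduce yourself in one sentence."}],
    generate_config={"voice": "Chelsie", "max_tokens": 256},
)

print(response["choices"][0]["message"]["content"])
audio = response["choices"][0]["message"].get("audio")
if audio:
    # decode the base64 WAV produced by _generate() above
    with open("reply.wav", "wb") as f:
        f.write(base64.b64decode(audio["data"]))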
xinference/model/llm/transformers/qwen2_vl.py
CHANGED

@@ -24,15 +24,18 @@ from ....types import (
     ChatCompletionChunk,
     ChatCompletionMessage,
     CompletionChunk,
+    PytorchModelConfig,
 )
-from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
 from ..utils import generate_chat_completion, generate_completion_chunk
-from .core import PytorchChatModel, PytorchGenerateConfig
+from .core import PytorchChatModel, PytorchGenerateConfig, register_non_default_model
 from .utils import cache_clean

 logger = logging.getLogger(__name__)


+@register_transformer
+@register_non_default_model("qwen2-vl-instruct", "qwen2.5-vl-instruct")
 class Qwen2VLChatModel(PytorchChatModel):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

@@ -41,6 +44,15 @@ class Qwen2VLChatModel(PytorchChatModel):
         self._device = None
         self._processor = None

+    def _sanitize_model_config(
+        self, pytorch_model_config: Optional[PytorchModelConfig]
+    ) -> PytorchModelConfig:
+        pytorch_model_config = super()._sanitize_model_config(pytorch_model_config)
+        assert pytorch_model_config is not None
+        pytorch_model_config.setdefault("min_pixels", 256 * 28 * 28)
+        pytorch_model_config.setdefault("max_pixels", 1280 * 28 * 28)
+        return pytorch_model_config
+
     @classmethod
     def match(
         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str

@@ -69,9 +81,13 @@ class Qwen2VLChatModel(PytorchChatModel):
         self._device = device
         # for multiple GPU, set back to auto to make multiple devices work
         device = "auto" if device == "cuda" else device
-
+        min_pixels = self._pytorch_model_config.get("min_pixels")
+        max_pixels = self._pytorch_model_config.get("max_pixels")
         self._processor = AutoProcessor.from_pretrained(
-            self.model_path,
+            self.model_path,
+            trust_remote_code=True,
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
         )
         self._tokenizer = self._processor.tokenizer
         flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
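Both `minicpmv26.py` and `qwen2_vl.py` now default the processor's pixel budget via `_sanitize_model_config`: `min_pixels = 256 * 28 * 28` and `max_pixels = 1280 * 28 * 28`. In Qwen2-VL-style processors one visual token corresponds to a 28×28 pixel block after resizing, so these defaults bound an image to roughly 256-1280 visual tokens; a quick check of the arithmetic (the token estimate is an approximation, not code from this diff):

# Default pixel budget added in _sanitize_model_config above.
MIN_PIXELS = 256 * 28 * 28    # 200_704 pixels   -> ~256 visual tokens
MAX_PIXELS = 1280 * 28 * 28   # 1_003_520 pixels -> ~1280 visual tokens

def approx_visual_tokens(width: int, height: int) -> int:
    # Assumes the Qwen2-VL convention of one token per 28x28 patch after the
    # image is resized into the [MIN_PIXELS, MAX_PIXELS] range.
    pixels = min(max(width * height, MIN_PIXELS), MAX_PIXELS)
    return pixels // (28 * 28)

print(approx_visual_tokens(1920, 1080))  # clamped to the max budget -> 1280

Because the values live in `pytorch_model_config`, they can be overridden at launch time instead of being hard-coded in the `AutoProcessor.from_pretrained` call.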
xinference/model/llm/utils.py
CHANGED
@@ -31,6 +31,7 @@ from typing import (
     List,
     Optional,
     Tuple,
+    Union,
     cast,
 )


@@ -762,7 +763,7 @@ class ChatModelMixin:

     def _transform_messages(
         self,
-        messages: List[ChatCompletionMessage],
+        messages: Union[List[ChatCompletionMessage], List[dict]],
     ):
         transformed_messages = []
         for msg in messages:

@@ -783,6 +784,15 @@ class ChatModelMixin:
                     new_content.append(
                         {"type": "video", "video": item["video_url"]["url"]}
                     )
+                elif "audio_url" in item:
+                    new_content.append(
+                        {"type": "audio", "audio": item["audio_url"]["url"]}
+                    )
+                else:
+                    logger.warning(
+                        "Unknown message type, message: %s, this message may be ignored",
+                        messages,
+                    )
             new_message = {"role": role, "content": new_content}
             transformed_messages.append(new_message)

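With the `audio_url` branch added to `_transform_messages`, an OpenAI-style content item carrying audio is rewritten into the `{"type": "audio", ...}` form the Qwen processors expect, and unrecognized items now log a warning noting they may be ignored. A small illustration of the mapping (the URL is a placeholder):

# Incoming OpenAI-style message accepted by the chat endpoint:
incoming = {
    "role": "user",
    "content": [
        {"type": "text", "text": "What is said in this clip?"},
        {"type": "audio_url", "audio_url": {"url": "https://example.com/clip.wav"}},
    ],
}

# After _transform_messages, the audio item becomes:
# {"type": "audio", "audio": "https://example.com/clip.wav"}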
xinference/model/llm/vllm/core.py
CHANGED

@@ -37,6 +37,7 @@ from typing import (
 )

 import xoscar as xo
+from typing_extensions import NotRequired

 from ....types import (
     ChatCompletion,

@@ -81,6 +82,9 @@ class VLLMModelConfig(TypedDict, total=False):
     scheduling_policy: Optional[str]
     reasoning_content: bool
     model_quantization: Optional[str]
+    mm_processor_kwargs: NotRequired[dict[str, Any]]
+    min_pixels: NotRequired[int]
+    max_pixels: NotRequired[int]


 class VLLMGenerateConfig(TypedDict, total=False):

@@ -170,6 +174,8 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.3.0":
     VLLM_SUPPORTED_CHAT_MODELS.append("marco-o1")
     VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-r1-distill-qwen")
     VLLM_SUPPORTED_CHAT_MODELS.append("fin-r1")
+    VLLM_SUPPORTED_CHAT_MODELS.append("seallms-v3")
+    VLLM_SUPPORTED_CHAT_MODELS.append("skywork-or1-preview")

 if VLLM_INSTALLED and vllm.__version__ >= "0.3.2":
     VLLM_SUPPORTED_CHAT_MODELS.append("gemma-it")

@@ -205,6 +211,7 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.6.1":
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("internvl2")
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("InternVL2.5")
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("InternVL2.5-MPO")
+    VLLM_SUPPORTED_VISION_MODEL_LIST.append("InternVL3")

 if VLLM_INSTALLED and vllm.__version__ >= "0.6.2":
     VLLM_SUPPORTED_CHAT_MODELS.append("minicpm3-4b")

@@ -229,6 +236,9 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.8.0":
     VLLM_SUPPORTED_CHAT_MODELS.append("gemma-3-1b-it")
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("gemma-3-it")

+if VLLM_INSTALLED and vllm.__version__ >= "0.8.4":
+    VLLM_SUPPORTED_CHAT_MODELS.append("glm4-0414")
+

 class VLLMModel(LLM):
     def __init__(

@@ -531,6 +541,31 @@ class VLLMModel(LLM):
         # Add scheduling policy if vLLM version is 0.6.3 or higher
         if vllm.__version__ >= "0.6.3":
             model_config.setdefault("scheduling_policy", "fcfs")
+        # init mm_processor_kwargs params
+        mm_processor_kwargs = model_config.get("mm_processor_kwargs", {})
+        if isinstance(mm_processor_kwargs, str):
+            try:
+                mm_processor_kwargs = json.loads(mm_processor_kwargs)
+            except json.JSONDecodeError:
+                logger.warning(
+                    "Failed to parse mm_processor_kwargs as JSON, using default empty dict"
+                )
+                mm_processor_kwargs = {}
+            except Exception as e:
+                logger.warning(
+                    f"Unexpected error parsing mm_processor_kwargs: {e}, using default empty dict"
+                )
+                mm_processor_kwargs = {}
+        pixel_params: Dict[str, int] = {}
+        if "min_pixels" in model_config:
+            pixel_params["min_pixels"] = model_config.pop("min_pixels")
+        if "max_pixels" in model_config:
+            pixel_params["max_pixels"] = model_config.pop("max_pixels")
+        if pixel_params or mm_processor_kwargs:
+            model_config["mm_processor_kwargs"] = {
+                **mm_processor_kwargs,
+                **pixel_params,
+            }
         return model_config

     @staticmethod
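The new `mm_processor_kwargs` handling in vLLM's `_sanitize_model_config` accepts either a dict or a JSON string and folds any top-level `min_pixels` / `max_pixels` into it before the config reaches vLLM. A minimal illustration of the resulting config under that logic (the input values are placeholders):

# Placeholder user config: a JSON-string mm_processor_kwargs plus standalone pixel bounds.
model_config = {
    "mm_processor_kwargs": "{}",   # string form is parsed with json.loads
    "min_pixels": 256 * 28 * 28,
    "max_pixels": 1280 * 28 * 28,
}

# After the sanitization block above, the standalone keys are popped and merged:
# model_config["mm_processor_kwargs"] == {"min_pixels": 200704, "max_pixels": 1003520}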
xinference/model/llm/vllm/distributed_executor.py
CHANGED

@@ -84,7 +84,7 @@ class WorkerWrapper:
         return await self._worker_actor_ref.execute_method(method, *args, **kwargs)

     def kill(self):
-        coro = xo.
+        coro = xo.destroy_actor(self._worker_actor_ref)
         return asyncio.run_coroutine_threadsafe(coro, self._loop)


@@ -108,6 +108,7 @@ class XinferenceDistributedExecutor(DistributedExecutorBase):
         self._pool_addresses = pool_addresses
         self._loop = loop
         self._n_worker = n_worker
+        self._is_shutdown = False
         super().__init__(vllm_config, *args, **kwargs)

     def _init_executor(self) -> None:

@@ -247,11 +248,16 @@ class XinferenceDistributedExecutor(DistributedExecutorBase):
             return

     def shutdown(self) -> None:
+        if self._is_shutdown:
+            return
+
         try:
+            self._is_shutdown = True
             futs = [worker.kill() for worker in self.workers]
             _ = [fut.result() for fut in futs]
-        except (RuntimeError, ConnectionError):
+        except (RuntimeError, ConnectionError, xo.ActorNotExist):
             # event loop closed already, ignore
+            # or actor already removed
             pass

     def __del__(self):
xinference/model/rerank/core.py
CHANGED
@@ -29,7 +29,7 @@ import torch.nn as nn
 from ...constants import XINFERENCE_CACHE_DIR
 from ...device_utils import empty_cache
 from ...types import Document, DocumentObj, Rerank, RerankTokens
-from ..core import CacheableModelSpec, ModelDescription
+from ..core import CacheableModelSpec, ModelDescription, VirtualEnvSettings
 from ..utils import is_model_cached

 logger = logging.getLogger(__name__)

@@ -56,6 +56,7 @@ class RerankModelSpec(CacheableModelSpec):
     model_id: str
     model_revision: Optional[str]
     model_hub: str = "huggingface"
+    virtualenv: Optional[VirtualEnvSettings]


 class RerankModelDescription(ModelDescription):

@@ -69,6 +70,10 @@ class RerankModelDescription(ModelDescription):
         super().__init__(address, devices, model_path=model_path)
         self._model_spec = model_spec

+    @property
+    def spec(self):
+        return self._model_spec
+
     def to_dict(self):
         return {
             "model_type": "rerank",