xinference 1.4.1__py3-none-any.whl → 1.5.0.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (104)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +50 -1
  3. xinference/client/restful/restful_client.py +82 -2
  4. xinference/constants.py +3 -0
  5. xinference/core/chat_interface.py +297 -83
  6. xinference/core/model.py +1 -0
  7. xinference/core/progress_tracker.py +16 -8
  8. xinference/core/supervisor.py +45 -1
  9. xinference/core/worker.py +262 -37
  10. xinference/deploy/cmdline.py +33 -1
  11. xinference/model/audio/core.py +11 -1
  12. xinference/model/audio/megatts.py +105 -0
  13. xinference/model/audio/model_spec.json +24 -1
  14. xinference/model/audio/model_spec_modelscope.json +26 -1
  15. xinference/model/core.py +14 -0
  16. xinference/model/embedding/core.py +6 -1
  17. xinference/model/flexible/core.py +6 -1
  18. xinference/model/image/core.py +6 -1
  19. xinference/model/image/model_spec.json +17 -1
  20. xinference/model/image/model_spec_modelscope.json +17 -1
  21. xinference/model/llm/__init__.py +0 -4
  22. xinference/model/llm/core.py +4 -0
  23. xinference/model/llm/llama_cpp/core.py +40 -16
  24. xinference/model/llm/llm_family.json +415 -84
  25. xinference/model/llm/llm_family.py +24 -1
  26. xinference/model/llm/llm_family_modelscope.json +449 -0
  27. xinference/model/llm/mlx/core.py +16 -2
  28. xinference/model/llm/transformers/__init__.py +14 -0
  29. xinference/model/llm/transformers/core.py +30 -6
  30. xinference/model/llm/transformers/gemma3.py +17 -2
  31. xinference/model/llm/transformers/intern_vl.py +28 -18
  32. xinference/model/llm/transformers/minicpmv26.py +21 -2
  33. xinference/model/llm/transformers/qwen-omni.py +308 -0
  34. xinference/model/llm/transformers/qwen2_audio.py +1 -1
  35. xinference/model/llm/transformers/qwen2_vl.py +20 -4
  36. xinference/model/llm/utils.py +11 -1
  37. xinference/model/llm/vllm/core.py +35 -0
  38. xinference/model/llm/vllm/distributed_executor.py +8 -2
  39. xinference/model/rerank/core.py +6 -1
  40. xinference/model/utils.py +118 -1
  41. xinference/model/video/core.py +6 -1
  42. xinference/thirdparty/megatts3/__init__.py +0 -0
  43. xinference/thirdparty/megatts3/tts/frontend_function.py +175 -0
  44. xinference/thirdparty/megatts3/tts/gradio_api.py +93 -0
  45. xinference/thirdparty/megatts3/tts/infer_cli.py +277 -0
  46. xinference/thirdparty/megatts3/tts/modules/aligner/whisper_small.py +318 -0
  47. xinference/thirdparty/megatts3/tts/modules/ar_dur/ar_dur_predictor.py +362 -0
  48. xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/layers.py +64 -0
  49. xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/nar_tts_modules.py +73 -0
  50. xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/rel_transformer.py +403 -0
  51. xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/rot_transformer.py +649 -0
  52. xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/seq_utils.py +342 -0
  53. xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/transformer.py +767 -0
  54. xinference/thirdparty/megatts3/tts/modules/llm_dit/cfm.py +309 -0
  55. xinference/thirdparty/megatts3/tts/modules/llm_dit/dit.py +180 -0
  56. xinference/thirdparty/megatts3/tts/modules/llm_dit/time_embedding.py +44 -0
  57. xinference/thirdparty/megatts3/tts/modules/llm_dit/transformer.py +230 -0
  58. xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/diag_gaussian.py +67 -0
  59. xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/hifigan_modules.py +283 -0
  60. xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/seanet_encoder.py +38 -0
  61. xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/wavvae_v3.py +60 -0
  62. xinference/thirdparty/megatts3/tts/modules/wavvae/encoder/common_modules/conv.py +154 -0
  63. xinference/thirdparty/megatts3/tts/modules/wavvae/encoder/common_modules/lstm.py +51 -0
  64. xinference/thirdparty/megatts3/tts/modules/wavvae/encoder/common_modules/seanet.py +126 -0
  65. xinference/thirdparty/megatts3/tts/utils/audio_utils/align.py +36 -0
  66. xinference/thirdparty/megatts3/tts/utils/audio_utils/io.py +95 -0
  67. xinference/thirdparty/megatts3/tts/utils/audio_utils/plot.py +90 -0
  68. xinference/thirdparty/megatts3/tts/utils/commons/ckpt_utils.py +171 -0
  69. xinference/thirdparty/megatts3/tts/utils/commons/hparams.py +215 -0
  70. xinference/thirdparty/megatts3/tts/utils/text_utils/dict.json +1 -0
  71. xinference/thirdparty/megatts3/tts/utils/text_utils/ph_tone_convert.py +94 -0
  72. xinference/thirdparty/megatts3/tts/utils/text_utils/split_text.py +90 -0
  73. xinference/thirdparty/megatts3/tts/utils/text_utils/text_encoder.py +280 -0
  74. xinference/types.py +10 -0
  75. xinference/utils.py +54 -0
  76. xinference/web/ui/build/asset-manifest.json +6 -6
  77. xinference/web/ui/build/index.html +1 -1
  78. xinference/web/ui/build/static/css/main.0f6523be.css +2 -0
  79. xinference/web/ui/build/static/css/main.0f6523be.css.map +1 -0
  80. xinference/web/ui/build/static/js/main.58bd483c.js +3 -0
  81. xinference/web/ui/build/static/js/main.58bd483c.js.map +1 -0
  82. xinference/web/ui/node_modules/.cache/babel-loader/3bff8cbe9141f937f4d98879a9771b0f48e0e4e0dbee8e647adbfe23859e7048.json +1 -0
  83. xinference/web/ui/node_modules/.cache/babel-loader/4500b1a622a031011f0a291701e306b87e08cbc749c50e285103536b85b6a914.json +1 -0
  84. xinference/web/ui/node_modules/.cache/babel-loader/51709f5d3e53bcf19e613662ef9b91fb9174942c5518987a248348dd4e1e0e02.json +1 -0
  85. xinference/web/ui/node_modules/.cache/babel-loader/69081049f0c7447544b7cfd73dd13d8846c02fe5febe4d81587e95c89a412d5b.json +1 -0
  86. xinference/web/ui/node_modules/.cache/babel-loader/b8551e9775a01b28ae674125c688febe763732ea969ae344512e64ea01bf632e.json +1 -0
  87. xinference/web/ui/node_modules/.cache/babel-loader/bf2b211b0d1b6465eff512d64c869d748f803c5651a7c24e48de6ea3484a7bfe.json +1 -0
  88. xinference/web/ui/src/locales/en.json +2 -1
  89. xinference/web/ui/src/locales/zh.json +2 -1
  90. {xinference-1.4.1.dist-info → xinference-1.5.0.post1.dist-info}/METADATA +129 -114
  91. {xinference-1.4.1.dist-info → xinference-1.5.0.post1.dist-info}/RECORD +96 -60
  92. {xinference-1.4.1.dist-info → xinference-1.5.0.post1.dist-info}/WHEEL +1 -1
  93. xinference/web/ui/build/static/css/main.b494ae7e.css +0 -2
  94. xinference/web/ui/build/static/css/main.b494ae7e.css.map +0 -1
  95. xinference/web/ui/build/static/js/main.5ca4eea1.js +0 -3
  96. xinference/web/ui/build/static/js/main.5ca4eea1.js.map +0 -1
  97. xinference/web/ui/node_modules/.cache/babel-loader/0f0967acaec5df1d45b80010949c258d64297ebbb0f44b8bb3afcbd45c6f0ec4.json +0 -1
  98. xinference/web/ui/node_modules/.cache/babel-loader/27bcada3ee8f89d21184b359f022fc965f350ffaca52c9814c29f1fc37121173.json +0 -1
  99. xinference/web/ui/node_modules/.cache/babel-loader/68249645124f37d01eef83b1d897e751f895bea919b6fb466f907c1f87cebc84.json +0 -1
  100. xinference/web/ui/node_modules/.cache/babel-loader/e547bbb18abb4a474b675a8d5782d25617566bea0af8caa9b836ce5649e2250a.json +0 -1
  101. /xinference/web/ui/build/static/js/{main.5ca4eea1.js.LICENSE.txt → main.58bd483c.js.LICENSE.txt} +0 -0
  102. {xinference-1.4.1.dist-info → xinference-1.5.0.post1.dist-info}/entry_points.txt +0 -0
  103. {xinference-1.4.1.dist-info → xinference-1.5.0.post1.dist-info/licenses}/LICENSE +0 -0
  104. {xinference-1.4.1.dist-info → xinference-1.5.0.post1.dist-info}/top_level.txt +0 -0
@@ -19,14 +19,14 @@ from typing import Dict, Iterator, List, Optional, Union
 import torch
 
 from ....types import ChatCompletion, ChatCompletionChunk
-from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
 from ..utils import (
     _decode_image,
     generate_chat_completion,
     generate_completion_chunk,
     parse_messages,
 )
-from .core import PytorchChatModel, PytorchGenerateConfig
+from .core import PytorchChatModel, PytorchGenerateConfig, register_non_default_model
 from .utils import cache_clean
 
 logger = logging.getLogger(__name__)
@@ -232,6 +232,10 @@ def _load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=
     return pixel_values, num_patches_list
 
 
+@register_transformer
+@register_non_default_model(
+    "internvl-chat", "internvl2", "Internvl2.5", "Internvl2.5-MPO", "InternVL3"
+)
 class InternVLChatModel(PytorchChatModel):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
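Note: `register_transformer` and `register_non_default_model` follow the usual registry-decorator pattern. A minimal sketch of how such decorators are typically built (hypothetical names and registries, not the actual xinference implementation):

```python
# Hypothetical registry-decorator sketch; xinference's real registries may differ.
TRANSFORMER_CLASSES = []           # classes the transformers backend can try to match
NON_DEFAULT_MODEL_NAMES = {}       # lowercased family name -> model class

def register_transformer(cls):
    """Record a model class so the transformers backend considers it during match()."""
    TRANSFORMER_CLASSES.append(cls)
    return cls

def register_non_default_model(*names):
    """Bind the decorated class to specific model family names."""
    def wrapper(cls):
        for name in names:
            NON_DEFAULT_MODEL_NAMES[name.lower()] = cls
        return cls
    return wrapper
```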
@@ -257,6 +261,8 @@ class InternVLChatModel(PytorchChatModel):
     def _split_model(self):
         import math
 
+        from transformers import AutoConfig
+
         device_map = {}
         world_size = torch.cuda.device_count()
         # single gpu
@@ -265,22 +271,26 @@ class InternVLChatModel(PytorchChatModel):
         model_size = f"{self.model_spec.model_size_in_billions}B"
         model_name = self.model_family.model_name.lower().replace("-mpo", "")
         model_name = f"{model_name}-{model_size}"
-        num_layers = {
-            "internvl2-1B": 24,
-            "internvl2-2B": 24,
-            "internvl2-4B": 32,
-            "internvl2-8B": 32,
-            "internvl2-26B": 48,
-            "internvl2-40B": 60,
-            "internvl2-76B": 80,
-            "internvl2.5-1B": 24,
-            "internvl2.5-2B": 24,
-            "internvl2.5-4B": 36,
-            "internvl2.5-8B": 32,
-            "internvl2.5-26B": 48,
-            "internvl2.5-38B": 64,
-            "internvl2.5-78B": 80,
-        }[model_name]
+        if "internvl3" in model_name.lower():
+            config = AutoConfig.from_pretrained(self.model_path, trust_remote_code=True)
+            num_layers = config.llm_config.num_hidden_layers
+        else:
+            num_layers = {
+                "internvl2-1B": 24,
+                "internvl2-2B": 24,
+                "internvl2-4B": 32,
+                "internvl2-8B": 32,
+                "internvl2-26B": 48,
+                "internvl2-40B": 60,
+                "internvl2-76B": 80,
+                "internvl2.5-1B": 24,
+                "internvl2.5-2B": 24,
+                "internvl2.5-4B": 36,
+                "internvl2.5-8B": 32,
+                "internvl2.5-26B": 48,
+                "internvl2.5-38B": 64,
+                "internvl2.5-78B": 80,
+            }[model_name]
 
         # Since the first GPU will be used for ViT, treat it as half a GPU.
         num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
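The surrounding `_split_model` spreads `num_layers` LLM layers over the visible GPUs, counting GPU 0 as half a device because it also hosts the vision tower. A rough sketch of that arithmetic (simplified, with illustrative module names, not a verbatim copy of the method):

```python
import math

def split_layers(num_layers: int, world_size: int) -> dict:
    # GPU 0 carries the ViT, so it only gets about half a share of LLM layers.
    per_gpu_share = num_layers / (world_size - 0.5)
    counts = [math.ceil(per_gpu_share)] * world_size
    counts[0] = math.ceil(per_gpu_share * 0.5)

    device_map = {}
    layer = 0
    for gpu, count in enumerate(counts):
        for _ in range(count):
            if layer < num_layers:
                device_map[f"language_model.model.layers.{layer}"] = gpu  # illustrative key
                layer += 1
    return device_map

# e.g. split_layers(48, 4) places ~7 layers on GPU 0 and ~14 on each remaining GPU.
```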
@@ -20,7 +20,12 @@ import torch
 from PIL import Image
 
 from ....core.scheduler import InferenceRequest
-from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk
+from ....types import (
+    ChatCompletion,
+    ChatCompletionChunk,
+    CompletionChunk,
+    PytorchModelConfig,
+)
 from ...utils import select_device
 from ..llm_family import LLMFamilyV1, LLMSpecV1
 from ..utils import (
@@ -52,6 +57,15 @@ class MiniCPMV26Model(PytorchChatModel):
             return True
         return False
 
+    def _sanitize_model_config(
+        self, pytorch_model_config: Optional[PytorchModelConfig]
+    ) -> PytorchModelConfig:
+        pytorch_model_config = super()._sanitize_model_config(pytorch_model_config)
+        assert pytorch_model_config is not None
+        pytorch_model_config.setdefault("min_pixels", 256 * 28 * 28)
+        pytorch_model_config.setdefault("max_pixels", 1280 * 28 * 28)
+        return pytorch_model_config
+
     def _get_model_class(self):
         from transformers import AutoModel
 
@@ -99,8 +113,13 @@ class MiniCPMV26Model(PytorchChatModel):
             self.model_path,
             trust_remote_code=True,
         )
+        min_pixels = self._pytorch_model_config.get("min_pixels")
+        max_pixels = self._pytorch_model_config.get("max_pixels")
         self._processor = AutoProcessor.from_pretrained(
-            self.model_path, trust_remote_code=True
+            self.model_path,
+            trust_remote_code=True,
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
         )
         self._device = self._model.device
         self._save_tensorizer()
@@ -0,0 +1,308 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import base64
+import importlib.util
+import io
+import logging
+import sys
+import time
+import uuid
+from typing import Dict, Iterator, List, Optional, Union
+
+from ....model.utils import select_device
+from ....types import (
+    ChatCompletion,
+    ChatCompletionAudio,
+    ChatCompletionChoice,
+    ChatCompletionChunk,
+    ChatCompletionMessage,
+    CompletionChunk,
+    CompletionUsage,
+)
+from ..llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
+from ..utils import generate_completion_chunk
+from .core import PytorchChatModel, PytorchGenerateConfig, register_non_default_model
+from .utils import cache_clean
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_SYSTEM_PROMPT = (
+    "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, "
+    "capable of perceiving auditory and visual inputs, as well as generating text and speech."
+)
+
+
+@register_transformer
+@register_non_default_model("qwen2.5-omni")
+class Qwen2_5OmniChatModel(PytorchChatModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self._tokenizer = None
+        self._model = None
+        self._device = None
+        self._processor = None
+
+    @classmethod
+    def match(
+        cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
+    ) -> bool:
+        if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
+            return False
+        llm_family = model_family.model_family or model_family.model_name
+        if "qwen2.5-omni".lower() in llm_family.lower():
+            return True
+        return False
+
+    def load(self):
+        from transformers import (
+            Qwen2_5OmniForConditionalGeneration,
+            Qwen2_5OmniProcessor,
+        )
+
+        device = self._pytorch_model_config.get("device", "auto")
+        device = select_device(device)
+        self._device = device
+        # for multiple GPU, set back to auto to make multiple devices work
+        device = "auto" if device == "cuda" else device
+        flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
+        kwargs = (
+            {}
+            if not flash_attn_installed
+            else {"attn_implementation": "flash_attention_2"}
+        )
+        logger.debug("Loading model with extra kwargs: %s", kwargs)
+
+        self._processor = Qwen2_5OmniProcessor.from_pretrained(
+            self.model_path, trust_remote_code=True
+        )
+        self._tokenizer = self._processor.tokenizer
+        self._model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
+            self.model_path,
+            torch_dtype="auto",
+            device_map=device,
+            trust_remote_code=True,
+            **kwargs,
+        )
+
+    @cache_clean
+    def chat(
+        self,
+        messages: List[Dict],
+        generate_config: Optional[PytorchGenerateConfig] = None,
+    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+        messages = self._transform_messages(messages)
+
+        generate_config = generate_config if generate_config else {}
+
+        stream = generate_config.get("stream", False) if generate_config else False
+
+        if stream:
+            it = self._generate_stream(messages, generate_config)
+            return self._to_chat_completion_chunks(it)
+        else:
+            c = self._generate(messages, generate_config)
+            return c
+
+    def _transform_messages(
+        self,
+        messages: Union[List[ChatCompletionMessage], List[dict]],
+    ):
+        messages = super()._transform_messages(messages)
+        if messages[0]["role"] != "system":
+            messages.insert(
+                0,
+                {
+                    "role": "system",
+                    "content": [{"type": "text", "text": DEFAULT_SYSTEM_PROMPT}],  # type: ignore
+                },
+            )
+        else:
+            logger.debug("Force to set system prompt")
+            messages[0]["content"] = [{"type": "text", "text": DEFAULT_SYSTEM_PROMPT}]  # type: ignore
+        return messages
+
+    def _generate(
+        self, messages: List, config: PytorchGenerateConfig = {}
+    ) -> ChatCompletion:
+        import soundfile as sf
+        from qwen_omni_utils import process_mm_info
+
+        use_audio_in_video = config.get("use_audio_in_video", True)
+        voice = config.get("voice", "Chelsie")
+
+        text = self._processor.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        audios, images, videos = process_mm_info(
+            messages, use_audio_in_video=use_audio_in_video
+        )
+        logger.debug(
+            "Text, audio, image, video: %s, %s, %s, %s", text, audios, images, videos
+        )
+        inputs = self._processor(
+            text=text,
+            images=images,
+            audio=audios,
+            videos=videos,
+            padding=True,
+            return_tensors="pt",
+            use_audio_in_video=use_audio_in_video,
+        )
+        inputs = inputs.to(self._device)
+
+        # Inference: Generation of the output
+        generated_ids, audio = self._model.generate(
+            **inputs,
+            speaker=voice,
+            max_new_tokens=config.get("max_tokens", 512),
+            temperature=config.get("temperature", 1),
+            use_audio_in_video=use_audio_in_video,
+        )
+        generated_ids_trimmed = [
+            out_ids[len(in_ids) :]
+            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        output_text = self._processor.batch_decode(
+            generated_ids_trimmed,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False,
+        )[0]
+
+        wav_io = io.BytesIO()
+        sf.write(
+            wav_io,
+            audio.reshape(-1).detach().cpu().numpy(),
+            samplerate=24000,
+            format="WAV",
+        )
+        wav_bytes = wav_io.getvalue()
+        audio_content = base64.b64encode(wav_bytes).decode()
+
+        return ChatCompletion(
+            id="chat" + str(uuid.uuid1()),
+            object="chat.completion",
+            created=int(time.time()),
+            model=self.model_uid,
+            choices=[
+                ChatCompletionChoice(
+                    index=0,
+                    message={
+                        "role": "assistant",
+                        "content": output_text,
+                        "audio": ChatCompletionAudio(
+                            id="audio" + str(uuid.uuid1()),
+                            data=audio_content,
+                            expires_at=int(time.time()),
+                            transcript="",
+                        ),
+                    },
+                    finish_reason="stop",
+                )
+            ],
+            usage=CompletionUsage(
+                prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
+            ),
+        )
+
+    def _generate_stream(
+        self, messages: List, config: PytorchGenerateConfig = {}
+    ) -> Iterator[CompletionChunk]:
+        from threading import Thread
+
+        from qwen_omni_utils import process_mm_info
+        from transformers import TextIteratorStreamer
+
+        use_audio_in_video = config.get("use_audio_in_video", True)
+        voice = config.get("voice", "Chelsie")
+
+        text = self._processor.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        audios, images, videos = process_mm_info(
+            messages, use_audio_in_video=use_audio_in_video
+        )
+        logger.debug(
+            "Text, audio, image, video: %s, %s, %s, %s", text, audios, images, videos
+        )
+        inputs = self._processor(
+            text=text,
+            images=images,
+            audio=audios,
+            videos=videos,
+            padding=True,
+            return_tensors="pt",
+            use_audio_in_video=use_audio_in_video,
+        )
+        inputs = inputs.to(self._device)
+
+        tokenizer = self._tokenizer
+        streamer = TextIteratorStreamer(
+            tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True
+        )
+
+        # TODO(xuye): Cannot find a way to streaming output,
+        # will implement it when it's supported
+
+        gen_kwargs = {
+            "max_new_tokens": config.get("max_tokens", 512),
+            "temperature": config.get("temperature", 1),
+            "streamer": streamer,
+            "speaker": voice,
+            **inputs,
+        }
+        error = None
+
+        def model_generate():
+            try:
+                return self._model.generate(**gen_kwargs)
+            except Exception:
+                nonlocal error
+                error = sys.exc_info()
+                streamer.end()
+                raise
+
+        thread = Thread(target=model_generate)
+        thread.start()
+
+        completion_id = str(uuid.uuid1())
+        for new_text in streamer:
+            yield generate_completion_chunk(
+                chunk_text=new_text,
+                finish_reason=None,
+                chunk_id=completion_id,
+                model_uid=self.model_uid,
+                prompt_tokens=-1,
+                completion_tokens=-1,
+                total_tokens=-1,
+                has_choice=True,
+                has_content=True,
+            )
+
+        if error:
+            _, err, tb = error  # type: ignore
+            raise err.with_traceback(tb)
+
+        yield generate_completion_chunk(
+            chunk_text=None,
+            finish_reason="stop",
+            chunk_id=completion_id,
+            model_uid=self.model_uid,
+            prompt_tokens=-1,
+            completion_tokens=-1,
+            total_tokens=-1,
+            has_choice=True,
+            has_content=False,
+        )
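Since `_generate` returns the synthesized speech as base64-encoded 24 kHz WAV bytes in `choices[0].message.audio.data`, a caller can recover a playable file roughly like this (sketch; the exact response shape depends on the client used):

```python
import base64

# `response` is assumed to be the chat-completion payload returned for qwen2.5-omni.
audio_b64 = response["choices"][0]["message"]["audio"]["data"]
with open("reply.wav", "wb") as f:
    f.write(base64.b64decode(audio_b64))  # WAV at 24 kHz, per the sf.write() call above
```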
@@ -74,7 +74,7 @@ class Qwen2AudioChatModel(PytorchChatModel):
 
     def _transform_messages(
         self,
-        messages: List[ChatCompletionMessage],
+        messages: Union[List[ChatCompletionMessage], List[dict]],
     ):
         import librosa
 
@@ -24,15 +24,18 @@ from ....types import (
     ChatCompletionChunk,
     ChatCompletionMessage,
     CompletionChunk,
+    PytorchModelConfig,
 )
-from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
 from ..utils import generate_chat_completion, generate_completion_chunk
-from .core import PytorchChatModel, PytorchGenerateConfig
+from .core import PytorchChatModel, PytorchGenerateConfig, register_non_default_model
 from .utils import cache_clean
 
 logger = logging.getLogger(__name__)
 
 
+@register_transformer
+@register_non_default_model("qwen2-vl-instruct", "qwen2.5-vl-instruct")
 class Qwen2VLChatModel(PytorchChatModel):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -41,6 +44,15 @@ class Qwen2VLChatModel(PytorchChatModel):
         self._device = None
         self._processor = None
 
+    def _sanitize_model_config(
+        self, pytorch_model_config: Optional[PytorchModelConfig]
+    ) -> PytorchModelConfig:
+        pytorch_model_config = super()._sanitize_model_config(pytorch_model_config)
+        assert pytorch_model_config is not None
+        pytorch_model_config.setdefault("min_pixels", 256 * 28 * 28)
+        pytorch_model_config.setdefault("max_pixels", 1280 * 28 * 28)
+        return pytorch_model_config
+
     @classmethod
     def match(
         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
@@ -69,9 +81,13 @@ class Qwen2VLChatModel(PytorchChatModel):
         self._device = device
         # for multiple GPU, set back to auto to make multiple devices work
         device = "auto" if device == "cuda" else device
-
+        min_pixels = self._pytorch_model_config.get("min_pixels")
+        max_pixels = self._pytorch_model_config.get("max_pixels")
         self._processor = AutoProcessor.from_pretrained(
-            self.model_path, trust_remote_code=True
+            self.model_path,
+            trust_remote_code=True,
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
         )
         self._tokenizer = self._processor.tokenizer
         flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
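Both MiniCPM-V 2.6 and Qwen2-VL now default `min_pixels`/`max_pixels` (256*28*28 and 1280*28*28) and pass them to `AutoProcessor.from_pretrained`. Since `_sanitize_model_config` only fills in missing keys, the values can be overridden at launch time; a hedged sketch, assuming extra keyword arguments are forwarded into the pytorch model config as with other model options:

```python
from xinference.client import Client

client = Client("http://localhost:9997")
# Assumption: these kwargs reach _sanitize_model_config(), which uses setdefault()
# and therefore keeps caller-supplied values.
model_uid = client.launch_model(
    model_name="qwen2-vl-instruct",
    model_engine="transformers",
    min_pixels=512 * 28 * 28,    # raise the lower bound on image resolution
    max_pixels=2048 * 28 * 28,   # allow larger images before downscaling
)
```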
@@ -31,6 +31,7 @@ from typing import (
     List,
     Optional,
     Tuple,
+    Union,
     cast,
 )
 
@@ -762,7 +763,7 @@ class ChatModelMixin:
 
     def _transform_messages(
         self,
-        messages: List[ChatCompletionMessage],
+        messages: Union[List[ChatCompletionMessage], List[dict]],
     ):
         transformed_messages = []
         for msg in messages:
@@ -783,6 +784,15 @@ class ChatModelMixin:
                         new_content.append(
                             {"type": "video", "video": item["video_url"]["url"]}
                         )
+                    elif "audio_url" in item:
+                        new_content.append(
+                            {"type": "audio", "audio": item["audio_url"]["url"]}
+                        )
+                    else:
+                        logger.warning(
+                            "Unknown message type, message: %s, this message may be ignored",
+                            messages,
+                        )
             new_message = {"role": role, "content": new_content}
             transformed_messages.append(new_message)
 
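`_transform_messages` now converts OpenAI-style `audio_url` content parts into the `{"type": "audio", ...}` form the multimodal processors expect. An illustrative before/after for a single content item:

```python
# Incoming OpenAI-style content item (illustrative URL):
item = {"type": "audio_url", "audio_url": {"url": "https://example.com/question.wav"}}

# What the new elif branch appends to new_content:
{"type": "audio", "audio": item["audio_url"]["url"]}
# -> {"type": "audio", "audio": "https://example.com/question.wav"}
```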
@@ -37,6 +37,7 @@ from typing import (
 )
 
 import xoscar as xo
+from typing_extensions import NotRequired
 
 from ....types import (
     ChatCompletion,
@@ -81,6 +82,9 @@ class VLLMModelConfig(TypedDict, total=False):
     scheduling_policy: Optional[str]
     reasoning_content: bool
     model_quantization: Optional[str]
+    mm_processor_kwargs: NotRequired[dict[str, Any]]
+    min_pixels: NotRequired[int]
+    max_pixels: NotRequired[int]
 
 
 class VLLMGenerateConfig(TypedDict, total=False):
@@ -170,6 +174,8 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.3.0":
     VLLM_SUPPORTED_CHAT_MODELS.append("marco-o1")
     VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-r1-distill-qwen")
     VLLM_SUPPORTED_CHAT_MODELS.append("fin-r1")
+    VLLM_SUPPORTED_CHAT_MODELS.append("seallms-v3")
+    VLLM_SUPPORTED_CHAT_MODELS.append("skywork-or1-preview")
 
 if VLLM_INSTALLED and vllm.__version__ >= "0.3.2":
     VLLM_SUPPORTED_CHAT_MODELS.append("gemma-it")
@@ -205,6 +211,7 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.6.1":
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("internvl2")
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("InternVL2.5")
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("InternVL2.5-MPO")
+    VLLM_SUPPORTED_VISION_MODEL_LIST.append("InternVL3")
 
 if VLLM_INSTALLED and vllm.__version__ >= "0.6.2":
     VLLM_SUPPORTED_CHAT_MODELS.append("minicpm3-4b")
@@ -229,6 +236,9 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.8.0":
     VLLM_SUPPORTED_CHAT_MODELS.append("gemma-3-1b-it")
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("gemma-3-it")
 
+if VLLM_INSTALLED and vllm.__version__ >= "0.8.4":
+    VLLM_SUPPORTED_CHAT_MODELS.append("glm4-0414")
+
 
 class VLLMModel(LLM):
     def __init__(
@@ -531,6 +541,31 @@ class VLLMModel(LLM):
         # Add scheduling policy if vLLM version is 0.6.3 or higher
         if vllm.__version__ >= "0.6.3":
             model_config.setdefault("scheduling_policy", "fcfs")
+        # init mm_processor_kwargs params
+        mm_processor_kwargs = model_config.get("mm_processor_kwargs", {})
+        if isinstance(mm_processor_kwargs, str):
+            try:
+                mm_processor_kwargs = json.loads(mm_processor_kwargs)
+            except json.JSONDecodeError:
+                logger.warning(
+                    "Failed to parse mm_processor_kwargs as JSON, using default empty dict"
+                )
+                mm_processor_kwargs = {}
+            except Exception as e:
+                logger.warning(
+                    f"Unexpected error parsing mm_processor_kwargs: {e}, using default empty dict"
+                )
+                mm_processor_kwargs = {}
+        pixel_params: Dict[str, int] = {}
+        if "min_pixels" in model_config:
+            pixel_params["min_pixels"] = model_config.pop("min_pixels")
+        if "max_pixels" in model_config:
+            pixel_params["max_pixels"] = model_config.pop("max_pixels")
+        if pixel_params or mm_processor_kwargs:
+            model_config["mm_processor_kwargs"] = {
+                **mm_processor_kwargs,
+                **pixel_params,
+            }
         return model_config
 
     @staticmethod
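The net effect: `min_pixels`/`max_pixels` supplied as top-level model config keys are folded into vLLM's `mm_processor_kwargs`, and they win over values already present there because `pixel_params` is unpacked last. A small worked example (illustrative values):

```python
mm_processor_kwargs = {"min_pixels": 256 * 28 * 28, "num_crops": 4}  # e.g. parsed from JSON
pixel_params = {"min_pixels": 512 * 28 * 28, "max_pixels": 1280 * 28 * 28}

merged = {**mm_processor_kwargs, **pixel_params}
# {'min_pixels': 401408, 'num_crops': 4, 'max_pixels': 1003520}
# The explicit min_pixels/max_pixels override the JSON-supplied min_pixels.
```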
@@ -84,7 +84,7 @@ class WorkerWrapper:
         return await self._worker_actor_ref.execute_method(method, *args, **kwargs)
 
     def kill(self):
-        coro = xo.kill_actor(self._worker_actor_ref)
+        coro = xo.destroy_actor(self._worker_actor_ref)
         return asyncio.run_coroutine_threadsafe(coro, self._loop)
 
 
@@ -108,6 +108,7 @@ class XinferenceDistributedExecutor(DistributedExecutorBase):
         self._pool_addresses = pool_addresses
         self._loop = loop
         self._n_worker = n_worker
+        self._is_shutdown = False
         super().__init__(vllm_config, *args, **kwargs)
 
     def _init_executor(self) -> None:
@@ -247,11 +248,16 @@ class XinferenceDistributedExecutor(DistributedExecutorBase):
             return
 
     def shutdown(self) -> None:
+        if self._is_shutdown:
+            return
+
         try:
+            self._is_shutdown = True
             futs = [worker.kill() for worker in self.workers]
             _ = [fut.result() for fut in futs]
-        except (RuntimeError, ConnectionError):
+        except (RuntimeError, ConnectionError, xo.ActorNotExist):
             # event loop closed already, ignore
+            # or actor already removed
             pass
 
     def __del__(self):
@@ -29,7 +29,7 @@ import torch.nn as nn
 from ...constants import XINFERENCE_CACHE_DIR
 from ...device_utils import empty_cache
 from ...types import Document, DocumentObj, Rerank, RerankTokens
-from ..core import CacheableModelSpec, ModelDescription
+from ..core import CacheableModelSpec, ModelDescription, VirtualEnvSettings
 from ..utils import is_model_cached
 
 logger = logging.getLogger(__name__)
@@ -56,6 +56,7 @@ class RerankModelSpec(CacheableModelSpec):
     model_id: str
     model_revision: Optional[str]
     model_hub: str = "huggingface"
+    virtualenv: Optional[VirtualEnvSettings]
 
 
 class RerankModelDescription(ModelDescription):
@@ -69,6 +70,10 @@ class RerankModelDescription(ModelDescription):
         super().__init__(address, devices, model_path=model_path)
         self._model_spec = model_spec
 
+    @property
+    def spec(self):
+        return self._model_spec
+
     def to_dict(self):
         return {
             "model_type": "rerank",