xinference 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (132)
  1. xinference/_compat.py +1 -0
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +54 -1
  4. xinference/client/restful/restful_client.py +82 -2
  5. xinference/constants.py +3 -0
  6. xinference/core/chat_interface.py +297 -83
  7. xinference/core/model.py +24 -3
  8. xinference/core/progress_tracker.py +16 -8
  9. xinference/core/supervisor.py +51 -1
  10. xinference/core/worker.py +315 -47
  11. xinference/deploy/cmdline.py +33 -1
  12. xinference/model/audio/core.py +11 -1
  13. xinference/model/audio/megatts.py +105 -0
  14. xinference/model/audio/model_spec.json +24 -1
  15. xinference/model/audio/model_spec_modelscope.json +26 -1
  16. xinference/model/core.py +14 -0
  17. xinference/model/embedding/core.py +6 -1
  18. xinference/model/flexible/core.py +6 -1
  19. xinference/model/image/core.py +6 -1
  20. xinference/model/image/model_spec.json +17 -1
  21. xinference/model/image/model_spec_modelscope.json +17 -1
  22. xinference/model/llm/__init__.py +4 -6
  23. xinference/model/llm/core.py +5 -0
  24. xinference/model/llm/llama_cpp/core.py +46 -17
  25. xinference/model/llm/llm_family.json +530 -85
  26. xinference/model/llm/llm_family.py +24 -1
  27. xinference/model/llm/llm_family_modelscope.json +572 -1
  28. xinference/model/llm/mlx/core.py +16 -2
  29. xinference/model/llm/reasoning_parser.py +3 -3
  30. xinference/model/llm/sglang/core.py +111 -13
  31. xinference/model/llm/transformers/__init__.py +14 -0
  32. xinference/model/llm/transformers/core.py +31 -6
  33. xinference/model/llm/transformers/deepseek_vl.py +1 -1
  34. xinference/model/llm/transformers/deepseek_vl2.py +287 -0
  35. xinference/model/llm/transformers/gemma3.py +17 -2
  36. xinference/model/llm/transformers/intern_vl.py +28 -18
  37. xinference/model/llm/transformers/minicpmv26.py +21 -2
  38. xinference/model/llm/transformers/qwen-omni.py +308 -0
  39. xinference/model/llm/transformers/qwen2_audio.py +1 -1
  40. xinference/model/llm/transformers/qwen2_vl.py +20 -4
  41. xinference/model/llm/utils.py +37 -15
  42. xinference/model/llm/vllm/core.py +184 -8
  43. xinference/model/llm/vllm/distributed_executor.py +320 -0
  44. xinference/model/rerank/core.py +22 -12
  45. xinference/model/utils.py +118 -1
  46. xinference/model/video/core.py +6 -1
  47. xinference/thirdparty/deepseek_vl2/__init__.py +31 -0
  48. xinference/thirdparty/deepseek_vl2/models/__init__.py +26 -0
  49. xinference/thirdparty/deepseek_vl2/models/configuration_deepseek.py +210 -0
  50. xinference/thirdparty/deepseek_vl2/models/conversation.py +310 -0
  51. xinference/thirdparty/deepseek_vl2/models/modeling_deepseek.py +1975 -0
  52. xinference/thirdparty/deepseek_vl2/models/modeling_deepseek_vl_v2.py +697 -0
  53. xinference/thirdparty/deepseek_vl2/models/processing_deepseek_vl_v2.py +675 -0
  54. xinference/thirdparty/deepseek_vl2/models/siglip_vit.py +661 -0
  55. xinference/thirdparty/deepseek_vl2/serve/__init__.py +0 -0
  56. xinference/thirdparty/deepseek_vl2/serve/app_modules/__init__.py +0 -0
  57. xinference/thirdparty/deepseek_vl2/serve/app_modules/gradio_utils.py +83 -0
  58. xinference/thirdparty/deepseek_vl2/serve/app_modules/overwrites.py +81 -0
  59. xinference/thirdparty/deepseek_vl2/serve/app_modules/presets.py +115 -0
  60. xinference/thirdparty/deepseek_vl2/serve/app_modules/utils.py +333 -0
  61. xinference/thirdparty/deepseek_vl2/serve/assets/Kelpy-Codos.js +100 -0
  62. xinference/thirdparty/deepseek_vl2/serve/assets/avatar.png +0 -0
  63. xinference/thirdparty/deepseek_vl2/serve/assets/custom.css +355 -0
  64. xinference/thirdparty/deepseek_vl2/serve/assets/custom.js +22 -0
  65. xinference/thirdparty/deepseek_vl2/serve/assets/favicon.ico +0 -0
  66. xinference/thirdparty/deepseek_vl2/serve/assets/simsun.ttc +0 -0
  67. xinference/thirdparty/deepseek_vl2/serve/inference.py +197 -0
  68. xinference/thirdparty/deepseek_vl2/utils/__init__.py +18 -0
  69. xinference/thirdparty/deepseek_vl2/utils/io.py +80 -0
  70. xinference/thirdparty/megatts3/__init__.py +0 -0
  71. xinference/thirdparty/megatts3/tts/frontend_function.py +175 -0
  72. xinference/thirdparty/megatts3/tts/gradio_api.py +93 -0
  73. xinference/thirdparty/megatts3/tts/infer_cli.py +277 -0
  74. xinference/thirdparty/megatts3/tts/modules/aligner/whisper_small.py +318 -0
  75. xinference/thirdparty/megatts3/tts/modules/ar_dur/ar_dur_predictor.py +362 -0
  76. xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/layers.py +64 -0
  77. xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/nar_tts_modules.py +73 -0
  78. xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/rel_transformer.py +403 -0
  79. xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/rot_transformer.py +649 -0
  80. xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/seq_utils.py +342 -0
  81. xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/transformer.py +767 -0
  82. xinference/thirdparty/megatts3/tts/modules/llm_dit/cfm.py +309 -0
  83. xinference/thirdparty/megatts3/tts/modules/llm_dit/dit.py +180 -0
  84. xinference/thirdparty/megatts3/tts/modules/llm_dit/time_embedding.py +44 -0
  85. xinference/thirdparty/megatts3/tts/modules/llm_dit/transformer.py +230 -0
  86. xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/diag_gaussian.py +67 -0
  87. xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/hifigan_modules.py +283 -0
  88. xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/seanet_encoder.py +38 -0
  89. xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/wavvae_v3.py +60 -0
  90. xinference/thirdparty/megatts3/tts/modules/wavvae/encoder/common_modules/conv.py +154 -0
  91. xinference/thirdparty/megatts3/tts/modules/wavvae/encoder/common_modules/lstm.py +51 -0
  92. xinference/thirdparty/megatts3/tts/modules/wavvae/encoder/common_modules/seanet.py +126 -0
  93. xinference/thirdparty/megatts3/tts/utils/audio_utils/align.py +36 -0
  94. xinference/thirdparty/megatts3/tts/utils/audio_utils/io.py +95 -0
  95. xinference/thirdparty/megatts3/tts/utils/audio_utils/plot.py +90 -0
  96. xinference/thirdparty/megatts3/tts/utils/commons/ckpt_utils.py +171 -0
  97. xinference/thirdparty/megatts3/tts/utils/commons/hparams.py +215 -0
  98. xinference/thirdparty/megatts3/tts/utils/text_utils/dict.json +1 -0
  99. xinference/thirdparty/megatts3/tts/utils/text_utils/ph_tone_convert.py +94 -0
  100. xinference/thirdparty/megatts3/tts/utils/text_utils/split_text.py +90 -0
  101. xinference/thirdparty/megatts3/tts/utils/text_utils/text_encoder.py +280 -0
  102. xinference/types.py +10 -0
  103. xinference/utils.py +54 -0
  104. xinference/web/ui/build/asset-manifest.json +6 -6
  105. xinference/web/ui/build/index.html +1 -1
  106. xinference/web/ui/build/static/css/main.0f6523be.css +2 -0
  107. xinference/web/ui/build/static/css/main.0f6523be.css.map +1 -0
  108. xinference/web/ui/build/static/js/main.58bd483c.js +3 -0
  109. xinference/web/ui/build/static/js/main.58bd483c.js.map +1 -0
  110. xinference/web/ui/node_modules/.cache/babel-loader/3bff8cbe9141f937f4d98879a9771b0f48e0e4e0dbee8e647adbfe23859e7048.json +1 -0
  111. xinference/web/ui/node_modules/.cache/babel-loader/4500b1a622a031011f0a291701e306b87e08cbc749c50e285103536b85b6a914.json +1 -0
  112. xinference/web/ui/node_modules/.cache/babel-loader/51709f5d3e53bcf19e613662ef9b91fb9174942c5518987a248348dd4e1e0e02.json +1 -0
  113. xinference/web/ui/node_modules/.cache/babel-loader/69081049f0c7447544b7cfd73dd13d8846c02fe5febe4d81587e95c89a412d5b.json +1 -0
  114. xinference/web/ui/node_modules/.cache/babel-loader/b8551e9775a01b28ae674125c688febe763732ea969ae344512e64ea01bf632e.json +1 -0
  115. xinference/web/ui/node_modules/.cache/babel-loader/bf2b211b0d1b6465eff512d64c869d748f803c5651a7c24e48de6ea3484a7bfe.json +1 -0
  116. xinference/web/ui/src/locales/en.json +2 -1
  117. xinference/web/ui/src/locales/zh.json +2 -1
  118. {xinference-1.4.0.dist-info → xinference-1.5.0.dist-info}/METADATA +128 -115
  119. {xinference-1.4.0.dist-info → xinference-1.5.0.dist-info}/RECORD +124 -63
  120. {xinference-1.4.0.dist-info → xinference-1.5.0.dist-info}/WHEEL +1 -1
  121. xinference/web/ui/build/static/css/main.b494ae7e.css +0 -2
  122. xinference/web/ui/build/static/css/main.b494ae7e.css.map +0 -1
  123. xinference/web/ui/build/static/js/main.3cea968e.js +0 -3
  124. xinference/web/ui/build/static/js/main.3cea968e.js.map +0 -1
  125. xinference/web/ui/node_modules/.cache/babel-loader/27bcada3ee8f89d21184b359f022fc965f350ffaca52c9814c29f1fc37121173.json +0 -1
  126. xinference/web/ui/node_modules/.cache/babel-loader/7f59e45e3f268ab8a4788b6fb024cf8dab088736dff22f5a3a39c122a83ab930.json +0 -1
  127. xinference/web/ui/node_modules/.cache/babel-loader/dcd60488509450bfff37bfff56de2c096d51de17dd00ec60d4db49c8b483ada1.json +0 -1
  128. xinference/web/ui/node_modules/.cache/babel-loader/e547bbb18abb4a474b675a8d5782d25617566bea0af8caa9b836ce5649e2250a.json +0 -1
  129. /xinference/web/ui/build/static/js/{main.3cea968e.js.LICENSE.txt → main.58bd483c.js.LICENSE.txt} +0 -0
  130. {xinference-1.4.0.dist-info → xinference-1.5.0.dist-info}/entry_points.txt +0 -0
  131. {xinference-1.4.0.dist-info → xinference-1.5.0.dist-info/licenses}/LICENSE +0 -0
  132. {xinference-1.4.0.dist-info → xinference-1.5.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/transformers/intern_vl.py

@@ -19,14 +19,14 @@ from typing import Dict, Iterator, List, Optional, Union
 import torch
 
 from ....types import ChatCompletion, ChatCompletionChunk
-from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
 from ..utils import (
     _decode_image,
     generate_chat_completion,
     generate_completion_chunk,
     parse_messages,
 )
-from .core import PytorchChatModel, PytorchGenerateConfig
+from .core import PytorchChatModel, PytorchGenerateConfig, register_non_default_model
 from .utils import cache_clean
 
 logger = logging.getLogger(__name__)
@@ -232,6 +232,10 @@ def _load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=
     return pixel_values, num_patches_list
 
 
+@register_transformer
+@register_non_default_model(
+    "internvl-chat", "internvl2", "Internvl2.5", "Internvl2.5-MPO", "InternVL3"
+)
 class InternVLChatModel(PytorchChatModel):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
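The two decorators are new in this release; their actual definitions live in llm_family.py and transformers/core.py, which are changed in this diff but not excerpted here. A rough sketch of the registration pattern they imply, with hypothetical bodies:

# Hypothetical sketch only -- the real implementations are in
# xinference/model/llm/llm_family.py and xinference/model/llm/transformers/core.py.
TRANSFORMER_CLASSES: list = []           # classes the transformers backend can serve
NON_DEFAULT_MODEL_FAMILIES: list = []    # families that must be matched explicitly

def register_transformer(cls):
    # Make the decorated model class discoverable by the transformers backend.
    TRANSFORMER_CLASSES.append(cls)
    return cls

def register_non_default_model(*model_names):
    # Mark the given family names as non-default so generic matching skips them.
    def wrapper(cls):
        NON_DEFAULT_MODEL_FAMILIES.extend(model_names)
        return cls
    return wrapper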
@@ -257,6 +261,8 @@ class InternVLChatModel(PytorchChatModel):
     def _split_model(self):
         import math
 
+        from transformers import AutoConfig
+
         device_map = {}
         world_size = torch.cuda.device_count()
         # single gpu
@@ -265,22 +271,26 @@ class InternVLChatModel(PytorchChatModel):
         model_size = f"{self.model_spec.model_size_in_billions}B"
         model_name = self.model_family.model_name.lower().replace("-mpo", "")
         model_name = f"{model_name}-{model_size}"
-        num_layers = {
-            "internvl2-1B": 24,
-            "internvl2-2B": 24,
-            "internvl2-4B": 32,
-            "internvl2-8B": 32,
-            "internvl2-26B": 48,
-            "internvl2-40B": 60,
-            "internvl2-76B": 80,
-            "internvl2.5-1B": 24,
-            "internvl2.5-2B": 24,
-            "internvl2.5-4B": 36,
-            "internvl2.5-8B": 32,
-            "internvl2.5-26B": 48,
-            "internvl2.5-38B": 64,
-            "internvl2.5-78B": 80,
-        }[model_name]
+        if "internvl3" in model_name.lower():
+            config = AutoConfig.from_pretrained(self.model_path, trust_remote_code=True)
+            num_layers = config.llm_config.num_hidden_layers
+        else:
+            num_layers = {
+                "internvl2-1B": 24,
+                "internvl2-2B": 24,
+                "internvl2-4B": 32,
+                "internvl2-8B": 32,
+                "internvl2-26B": 48,
+                "internvl2-40B": 60,
+                "internvl2-76B": 80,
+                "internvl2.5-1B": 24,
+                "internvl2.5-2B": 24,
+                "internvl2.5-4B": 36,
+                "internvl2.5-8B": 32,
+                "internvl2.5-26B": 48,
+                "internvl2.5-38B": 64,
+                "internvl2.5-78B": 80,
+            }[model_name]
 
         # Since the first GPU will be used for ViT, treat it as half a GPU.
         num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
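For InternVL3, the transformer depth is no longer taken from the static table above but read from the checkpoint's own config. A standalone sketch of that lookup (the helper name is illustrative; only the AutoConfig call and the llm_config.num_hidden_layers attribute come from the diff):

from transformers import AutoConfig

def internvl3_num_layers(model_path: str) -> int:
    # Mirrors the new branch above: InternVL3 checkpoints expose the LLM depth
    # under llm_config, so no hard-coded table entry is required.
    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
    return config.llm_config.num_hidden_layers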
xinference/model/llm/transformers/minicpmv26.py

@@ -20,7 +20,12 @@ import torch
 from PIL import Image
 
 from ....core.scheduler import InferenceRequest
-from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk
+from ....types import (
+    ChatCompletion,
+    ChatCompletionChunk,
+    CompletionChunk,
+    PytorchModelConfig,
+)
 from ...utils import select_device
 from ..llm_family import LLMFamilyV1, LLMSpecV1
 from ..utils import (
@@ -52,6 +57,15 @@ class MiniCPMV26Model(PytorchChatModel):
             return True
         return False
 
+    def _sanitize_model_config(
+        self, pytorch_model_config: Optional[PytorchModelConfig]
+    ) -> PytorchModelConfig:
+        pytorch_model_config = super()._sanitize_model_config(pytorch_model_config)
+        assert pytorch_model_config is not None
+        pytorch_model_config.setdefault("min_pixels", 256 * 28 * 28)
+        pytorch_model_config.setdefault("max_pixels", 1280 * 28 * 28)
+        return pytorch_model_config
+
     def _get_model_class(self):
         from transformers import AutoModel
 
@@ -99,8 +113,13 @@ class MiniCPMV26Model(PytorchChatModel):
             self.model_path,
             trust_remote_code=True,
         )
+        min_pixels = self._pytorch_model_config.get("min_pixels")
+        max_pixels = self._pytorch_model_config.get("max_pixels")
         self._processor = AutoProcessor.from_pretrained(
-            self.model_path, trust_remote_code=True
+            self.model_path,
+            trust_remote_code=True,
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
         )
         self._device = self._model.device
         self._save_tensorizer()
xinference/model/llm/transformers/qwen-omni.py (new file)

@@ -0,0 +1,308 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import base64
+import importlib.util
+import io
+import logging
+import sys
+import time
+import uuid
+from typing import Dict, Iterator, List, Optional, Union
+
+from ....model.utils import select_device
+from ....types import (
+    ChatCompletion,
+    ChatCompletionAudio,
+    ChatCompletionChoice,
+    ChatCompletionChunk,
+    ChatCompletionMessage,
+    CompletionChunk,
+    CompletionUsage,
+)
+from ..llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
+from ..utils import generate_completion_chunk
+from .core import PytorchChatModel, PytorchGenerateConfig, register_non_default_model
+from .utils import cache_clean
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_SYSTEM_PROMPT = (
+    "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, "
+    "capable of perceiving auditory and visual inputs, as well as generating text and speech."
+)
+
+
+@register_transformer
+@register_non_default_model("qwen2.5-omni")
+class Qwen2_5OmniChatModel(PytorchChatModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self._tokenizer = None
+        self._model = None
+        self._device = None
+        self._processor = None
+
+    @classmethod
+    def match(
+        cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
+    ) -> bool:
+        if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
+            return False
+        llm_family = model_family.model_family or model_family.model_name
+        if "qwen2.5-omni".lower() in llm_family.lower():
+            return True
+        return False
+
+    def load(self):
+        from transformers import (
+            Qwen2_5OmniForConditionalGeneration,
+            Qwen2_5OmniProcessor,
+        )
+
+        device = self._pytorch_model_config.get("device", "auto")
+        device = select_device(device)
+        self._device = device
+        # for multiple GPU, set back to auto to make multiple devices work
+        device = "auto" if device == "cuda" else device
+        flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
+        kwargs = (
+            {}
+            if not flash_attn_installed
+            else {"attn_implementation": "flash_attention_2"}
+        )
+        logger.debug("Loading model with extra kwargs: %s", kwargs)
+
+        self._processor = Qwen2_5OmniProcessor.from_pretrained(
+            self.model_path, trust_remote_code=True
+        )
+        self._tokenizer = self._processor.tokenizer
+        self._model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
+            self.model_path,
+            torch_dtype="auto",
+            device_map=device,
+            trust_remote_code=True,
+            **kwargs,
+        )
+
+    @cache_clean
+    def chat(
+        self,
+        messages: List[Dict],
+        generate_config: Optional[PytorchGenerateConfig] = None,
+    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+        messages = self._transform_messages(messages)
+
+        generate_config = generate_config if generate_config else {}
+
+        stream = generate_config.get("stream", False) if generate_config else False
+
+        if stream:
+            it = self._generate_stream(messages, generate_config)
+            return self._to_chat_completion_chunks(it)
+        else:
+            c = self._generate(messages, generate_config)
+            return c
+
+    def _transform_messages(
+        self,
+        messages: Union[List[ChatCompletionMessage], List[dict]],
+    ):
+        messages = super()._transform_messages(messages)
+        if messages[0]["role"] != "system":
+            messages.insert(
+                0,
+                {
+                    "role": "system",
+                    "content": [{"type": "text", "text": DEFAULT_SYSTEM_PROMPT}],  # type: ignore
+                },
+            )
+        else:
+            logger.debug("Force to set system prompt")
+            messages[0]["content"] = [{"type": "text", "text": DEFAULT_SYSTEM_PROMPT}]  # type: ignore
+        return messages
+
+    def _generate(
+        self, messages: List, config: PytorchGenerateConfig = {}
+    ) -> ChatCompletion:
+        import soundfile as sf
+        from qwen_omni_utils import process_mm_info
+
+        use_audio_in_video = config.get("use_audio_in_video", True)
+        voice = config.get("voice", "Chelsie")
+
+        text = self._processor.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        audios, images, videos = process_mm_info(
+            messages, use_audio_in_video=use_audio_in_video
+        )
+        logger.debug(
+            "Text, audio, image, video: %s, %s, %s, %s", text, audios, images, videos
+        )
+        inputs = self._processor(
+            text=text,
+            images=images,
+            audio=audios,
+            videos=videos,
+            padding=True,
+            return_tensors="pt",
+            use_audio_in_video=use_audio_in_video,
+        )
+        inputs = inputs.to(self._device)
+
+        # Inference: Generation of the output
+        generated_ids, audio = self._model.generate(
+            **inputs,
+            speaker=voice,
+            max_new_tokens=config.get("max_tokens", 512),
+            temperature=config.get("temperature", 1),
+            use_audio_in_video=use_audio_in_video,
+        )
+        generated_ids_trimmed = [
+            out_ids[len(in_ids) :]
+            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        output_text = self._processor.batch_decode(
+            generated_ids_trimmed,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False,
+        )[0]
+
+        wav_io = io.BytesIO()
+        sf.write(
+            wav_io,
+            audio.reshape(-1).detach().cpu().numpy(),
+            samplerate=24000,
+            format="WAV",
+        )
+        wav_bytes = wav_io.getvalue()
+        audio_content = base64.b64encode(wav_bytes).decode()
+
+        return ChatCompletion(
+            id="chat" + str(uuid.uuid1()),
+            object="chat.completion",
+            created=int(time.time()),
+            model=self.model_uid,
+            choices=[
+                ChatCompletionChoice(
+                    index=0,
+                    message={
+                        "role": "assistant",
+                        "content": output_text,
+                        "audio": ChatCompletionAudio(
+                            id="audio" + str(uuid.uuid1()),
+                            data=audio_content,
+                            expires_at=int(time.time()),
+                            transcript="",
+                        ),
+                    },
+                    finish_reason="stop",
+                )
+            ],
+            usage=CompletionUsage(
+                prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
+            ),
+        )
+
+    def _generate_stream(
+        self, messages: List, config: PytorchGenerateConfig = {}
+    ) -> Iterator[CompletionChunk]:
+        from threading import Thread
+
+        from qwen_omni_utils import process_mm_info
+        from transformers import TextIteratorStreamer
+
+        use_audio_in_video = config.get("use_audio_in_video", True)
+        voice = config.get("voice", "Chelsie")
+
+        text = self._processor.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        audios, images, videos = process_mm_info(
+            messages, use_audio_in_video=use_audio_in_video
+        )
+        logger.debug(
+            "Text, audio, image, video: %s, %s, %s, %s", text, audios, images, videos
+        )
+        inputs = self._processor(
+            text=text,
+            images=images,
+            audio=audios,
+            videos=videos,
+            padding=True,
+            return_tensors="pt",
+            use_audio_in_video=use_audio_in_video,
+        )
+        inputs = inputs.to(self._device)
+
+        tokenizer = self._tokenizer
+        streamer = TextIteratorStreamer(
+            tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True
+        )
+
+        # TODO(xuye): Cannot find a way to streaming output,
+        # will implement it when it's supported
+
+        gen_kwargs = {
+            "max_new_tokens": config.get("max_tokens", 512),
+            "temperature": config.get("temperature", 1),
+            "streamer": streamer,
+            "speaker": voice,
+            **inputs,
+        }
+        error = None
+
+        def model_generate():
+            try:
+                return self._model.generate(**gen_kwargs)
+            except Exception:
+                nonlocal error
+                error = sys.exc_info()
+                streamer.end()
+                raise
+
+        thread = Thread(target=model_generate)
+        thread.start()
+
+        completion_id = str(uuid.uuid1())
+        for new_text in streamer:
+            yield generate_completion_chunk(
+                chunk_text=new_text,
+                finish_reason=None,
+                chunk_id=completion_id,
+                model_uid=self.model_uid,
+                prompt_tokens=-1,
+                completion_tokens=-1,
+                total_tokens=-1,
+                has_choice=True,
+                has_content=True,
+            )
+
+        if error:
+            _, err, tb = error  # type: ignore
+            raise err.with_traceback(tb)
+
+        yield generate_completion_chunk(
+            chunk_text=None,
+            finish_reason="stop",
+            chunk_id=completion_id,
+            model_uid=self.model_uid,
+            prompt_tokens=-1,
+            completion_tokens=-1,
+            total_tokens=-1,
+            has_choice=True,
+            has_content=False,
+        )
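For context, a minimal usage sketch of the new qwen2.5-omni model through the RESTful client. The endpoint, launch arguments, and message layout are assumptions based on the code above and the existing client API; the spoken reply arrives base64-encoded in the message's audio field, exactly as _generate() builds it.

import base64

from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")  # endpoint is an example
model_uid = client.launch_model(model_name="qwen2.5-omni", model_engine="transformers")
model = client.get_model(model_uid)

completion = model.chat(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Please answer out loud."},
                # audio_url parts are normalized by _transform_messages in utils.py
                {"type": "audio_url", "audio_url": {"url": "file:///tmp/question.wav"}},
            ],
        }
    ],
    generate_config={"voice": "Chelsie", "use_audio_in_video": True},
)

message = completion["choices"][0]["message"]
print(message["content"])  # text transcript of the reply
with open("reply.wav", "wb") as f:
    # 24 kHz WAV produced by sf.write() above, base64-encoded by _generate()
    f.write(base64.b64decode(message["audio"]["data"]))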
xinference/model/llm/transformers/qwen2_audio.py

@@ -74,7 +74,7 @@ class Qwen2AudioChatModel(PytorchChatModel):
 
     def _transform_messages(
         self,
-        messages: List[ChatCompletionMessage],
+        messages: Union[List[ChatCompletionMessage], List[dict]],
    ):
        import librosa
 
xinference/model/llm/transformers/qwen2_vl.py

@@ -24,15 +24,18 @@ from ....types import (
     ChatCompletionChunk,
     ChatCompletionMessage,
     CompletionChunk,
+    PytorchModelConfig,
 )
-from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
 from ..utils import generate_chat_completion, generate_completion_chunk
-from .core import PytorchChatModel, PytorchGenerateConfig
+from .core import PytorchChatModel, PytorchGenerateConfig, register_non_default_model
 from .utils import cache_clean
 
 logger = logging.getLogger(__name__)
 
 
+@register_transformer
+@register_non_default_model("qwen2-vl-instruct", "qwen2.5-vl-instruct")
 class Qwen2VLChatModel(PytorchChatModel):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -41,6 +44,15 @@ class Qwen2VLChatModel(PytorchChatModel):
         self._device = None
         self._processor = None
 
+    def _sanitize_model_config(
+        self, pytorch_model_config: Optional[PytorchModelConfig]
+    ) -> PytorchModelConfig:
+        pytorch_model_config = super()._sanitize_model_config(pytorch_model_config)
+        assert pytorch_model_config is not None
+        pytorch_model_config.setdefault("min_pixels", 256 * 28 * 28)
+        pytorch_model_config.setdefault("max_pixels", 1280 * 28 * 28)
+        return pytorch_model_config
+
     @classmethod
     def match(
         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
@@ -69,9 +81,13 @@ class Qwen2VLChatModel(PytorchChatModel):
         self._device = device
         # for multiple GPU, set back to auto to make multiple devices work
         device = "auto" if device == "cuda" else device
-
+        min_pixels = self._pytorch_model_config.get("min_pixels")
+        max_pixels = self._pytorch_model_config.get("max_pixels")
         self._processor = AutoProcessor.from_pretrained(
-            self.model_path, trust_remote_code=True
+            self.model_path,
+            trust_remote_code=True,
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
         )
         self._tokenizer = self._processor.tokenizer
         flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
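Both Qwen2-VL and MiniCPM-V-2.6 now default min_pixels/max_pixels in _sanitize_model_config and forward them to AutoProcessor, so the visual token budget can be tuned per launch. A sketch of overriding the defaults from the client, assuming extra launch kwargs are merged into the PyTorch model config (values are illustrative):

from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")  # endpoint is an example
model_uid = client.launch_model(
    model_name="qwen2.5-vl-instruct",
    model_engine="transformers",
    # Defaults set above are 256 * 28 * 28 and 1280 * 28 * 28; lowering
    # max_pixels trades visual detail for fewer image tokens.
    min_pixels=256 * 28 * 28,
    max_pixels=640 * 28 * 28,
)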
xinference/model/llm/utils.py

@@ -31,6 +31,7 @@ from typing import (
     List,
     Optional,
     Tuple,
+    Union,
     cast,
 )
 
@@ -255,19 +256,26 @@ class ChatModelMixin:
             and choices
             and "delta" in choices[0]
         ):
-            if reasoning_parser is not None:
-                # process parsing reasoning content
-                assert previous_texts is not None
+            if choices[0]["finish_reason"] is None:
+                if reasoning_parser is not None:
+                    # process parsing reasoning content
+                    assert previous_texts is not None
+                    delta = choices[0]["delta"]  # type: ignore
+                    if text := delta.get("content"):
+                        current_text = previous_texts[-1] + text
+                        delta = reasoning_parser.extract_reasoning_content_streaming(
+                            previous_text=previous_texts[-1],
+                            current_text=current_text,
+                            delta_text=text,
+                        )
+                        previous_texts[-1] = current_text
+                        choices[0]["delta"] = delta  # type: ignore
+            elif choices[0]["finish_reason"] is not None:
                 delta = choices[0]["delta"]  # type: ignore
-                if text := delta.get("content"):
-                    current_text = previous_texts[-1] + text
-                    delta = reasoning_parser.extract_reasoning_content_streaming(
-                        previous_text=previous_texts[-1],
-                        current_text=current_text,
-                        delta_text=text,
-                    )
-                    previous_texts[-1] = current_text
-                    choices[0]["delta"] = delta  # type: ignore
+                if "content" not in delta:
+                    delta["content"] = ""  # type: ignore
+                if reasoning_parser is not None:
+                    delta["reasoning_content"] = None  # type: ignore
             # Already a ChatCompletionChunk, we don't need to convert chunk.
             return cast(ChatCompletionChunk, chunk)
 
@@ -286,7 +294,11 @@ class ChatModelMixin:
                         delta_text=choice["text"],
                     )
                     previous_texts[-1] = current_text
-            if "tool_calls" in choice:
+            elif "text" in choice and choice["finish_reason"] is not None:
+                delta["content"] = choice["text"]
+                if reasoning_parser is not None:
+                    delta["reasoning_content"] = None
+            elif "tool_calls" in choice:
                 delta["tool_calls"] = choice["tool_calls"]
             choices_list.append(
                 {
@@ -319,8 +331,9 @@ class ChatModelMixin:
     ) -> ChatCompletionChunk:
         choices_list = []
         for i, choice in enumerate(chunk["choices"]):
-            delta = {"role": "assistant", "content": ""}
+            delta = ChatCompletionChunkDelta(role="assistant", content="")
             if reasoning_parser is not None:
+                delta["content"] = None
                 delta["reasoning_content"] = ""
             choices_list.append(
                 {
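The previous_texts[-1] bookkeeping in the hunks above exists so the reasoning parser can tell, for every streamed delta, whether the thinking block has already closed. A self-contained toy illustration of that idea (this is not the ReasoningParser API from reasoning_parser.py, just the splitting principle):

def split_think_stream(deltas):
    # Toy version of the previous/current text bookkeeping: text up to and
    # including "</think>" is routed to reasoning_content, the rest to content.
    previous = ""
    closed = False
    for delta_text in deltas:
        current = previous + delta_text
        if not closed and "</think>" in current:
            end = current.index("</think>") + len("</think>")
            yield {"reasoning_content": current[len(previous):end], "content": None}
            if end < len(current):
                yield {"reasoning_content": None, "content": current[end:]}
            closed = True
        elif not closed:
            yield {"reasoning_content": delta_text, "content": None}
        else:
            yield {"reasoning_content": None, "content": delta_text}
        previous = current

# list(split_think_stream(["<think>plan", "</think>", "Hi"])) yields the two
# reasoning pieces first, then {"reasoning_content": None, "content": "Hi"}.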
@@ -750,7 +763,7 @@ class ChatModelMixin:
 
     def _transform_messages(
         self,
-        messages: List[ChatCompletionMessage],
+        messages: Union[List[ChatCompletionMessage], List[dict]],
     ):
         transformed_messages = []
         for msg in messages:
@@ -771,6 +784,15 @@ class ChatModelMixin:
                     new_content.append(
                         {"type": "video", "video": item["video_url"]["url"]}
                     )
+                elif "audio_url" in item:
+                    new_content.append(
+                        {"type": "audio", "audio": item["audio_url"]["url"]}
+                    )
+                else:
+                    logger.warning(
+                        "Unknown message type, message: %s, this message may be ignored",
+                        messages,
+                    )
             new_message = {"role": role, "content": new_content}
             transformed_messages.append(new_message)
 