xinference 1.10.1__py3-none-any.whl → 1.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of xinference has been flagged as a potentially problematic release.

Files changed (38)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +462 -3
  3. xinference/client/restful/async_restful_client.py +158 -5
  4. xinference/client/restful/restful_client.py +131 -0
  5. xinference/core/supervisor.py +12 -0
  6. xinference/model/audio/model_spec.json +20 -20
  7. xinference/model/image/model_spec.json +159 -159
  8. xinference/model/llm/__init__.py +2 -2
  9. xinference/model/llm/llm_family.json +843 -180
  10. xinference/model/llm/mlx/distributed_models/core.py +41 -0
  11. xinference/model/llm/mlx/distributed_models/qwen2.py +1 -2
  12. xinference/model/llm/sglang/core.py +20 -6
  13. xinference/model/llm/tool_parsers/qwen_tool_parser.py +29 -4
  14. xinference/model/llm/transformers/chatglm.py +3 -0
  15. xinference/model/llm/transformers/core.py +129 -36
  16. xinference/model/llm/transformers/multimodal/minicpmv45.py +340 -0
  17. xinference/model/llm/transformers/utils.py +23 -0
  18. xinference/model/llm/utils.py +37 -24
  19. xinference/model/llm/vllm/core.py +128 -69
  20. xinference/model/utils.py +74 -31
  21. xinference/thirdparty/audiotools/core/audio_signal.py +6 -6
  22. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/text.py +1 -1
  23. xinference/thirdparty/melo/text/chinese_mix.py +2 -2
  24. xinference/types.py +9 -0
  25. xinference/ui/web/ui/build/asset-manifest.json +3 -3
  26. xinference/ui/web/ui/build/index.html +1 -1
  27. xinference/ui/web/ui/build/static/js/{main.d192c4f3.js → main.45e78536.js} +3 -3
  28. xinference/ui/web/ui/build/static/js/main.45e78536.js.map +1 -0
  29. xinference/ui/web/ui/node_modules/.cache/babel-loader/ea2a26361204e70cf1018d6990fb6354bed82b3ac69690391e0f100385e7abb7.json +1 -0
  30. {xinference-1.10.1.dist-info → xinference-1.11.0.dist-info}/METADATA +7 -5
  31. {xinference-1.10.1.dist-info → xinference-1.11.0.dist-info}/RECORD +36 -35
  32. xinference/ui/web/ui/build/static/js/main.d192c4f3.js.map +0 -1
  33. xinference/ui/web/ui/node_modules/.cache/babel-loader/f995a2425dfb0822fd07127f66ffe9b026883bc156b402eb8bd0b83d52460a93.json +0 -1
  34. /xinference/ui/web/ui/build/static/js/{main.d192c4f3.js.LICENSE.txt → main.45e78536.js.LICENSE.txt} +0 -0
  35. {xinference-1.10.1.dist-info → xinference-1.11.0.dist-info}/WHEEL +0 -0
  36. {xinference-1.10.1.dist-info → xinference-1.11.0.dist-info}/entry_points.txt +0 -0
  37. {xinference-1.10.1.dist-info → xinference-1.11.0.dist-info}/licenses/LICENSE +0 -0
  38. {xinference-1.10.1.dist-info → xinference-1.11.0.dist-info}/top_level.txt +0 -0
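
The headline addition in this release is a new Transformers multimodal backend for MiniCPM-V-4.5 (file 16 above). As a rough orientation for readers, here is a minimal sketch of launching and querying such a model through the existing Xinference Python client; the endpoint URL and launch parameters are illustrative assumptions, and only the model name comes from the registration decorators in the new minicpmv45.py shown below.

    # Hedged sketch: not taken from this diff; assumes a running Xinference server.
    from xinference.client import Client

    client = Client("http://localhost:9997")  # assumed local endpoint
    model_uid = client.launch_model(
        model_name="MiniCPM-V-4.5",      # name registered in minicpmv45.py
        model_type="LLM",
        model_engine="transformers",     # assumption: the backend added in this release
    )
    model = client.get_model(model_uid)
    completion = model.chat(
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
                    {"type": "text", "text": "Describe this image."},
                ],
            }
        ],
    )
    print(completion["choices"][0]["message"]["content"])

The content-part types ("text", "image_url", "video_url") accepted here are the ones handled by _message_content_to_chat in the new file below.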
xinference/model/llm/transformers/multimodal/minicpmv45.py (new file)
@@ -0,0 +1,340 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+from concurrent.futures import ThreadPoolExecutor
+from typing import Any, Dict, Iterator, List, Optional, Tuple
+
+import torch
+from PIL import Image
+
+from .....core.model import register_batching_multimodal_models
+from .....model.utils import select_device
+from .....types import PytorchModelConfig
+from ....scheduler.request import InferenceRequest
+from ...llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
+from ...utils import _decode_image, parse_messages
+from ..core import register_non_default_model
+from .core import PytorchMultiModalModel
+
+logger = logging.getLogger(__name__)
+
+
+@register_batching_multimodal_models("MiniCPM-V-4.5")
+@register_transformer
+@register_non_default_model("MiniCPM-V-4.5")
+class MiniCPMV45Model(PytorchMultiModalModel):
+    @classmethod
+    def match_json(
+        cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
+    ) -> bool:
+        family = model_family.model_family or model_family.model_name
+        if "MiniCPM-V-4.5".lower() in family.lower():
+            return True
+        return False
+
+    def _sanitize_model_config(
+        self, pytorch_model_config: Optional[PytorchModelConfig]
+    ) -> PytorchModelConfig:
+        pytorch_model_config = super()._sanitize_model_config(pytorch_model_config)
+        assert pytorch_model_config is not None
+        # Configure pixel parameters for MiniCPM-V-4.5
+        pytorch_model_config.setdefault("min_pixels", 256 * 28 * 28)
+        pytorch_model_config.setdefault("max_pixels", 1280 * 28 * 28)
+        return pytorch_model_config
+
+    def decide_device(self):
+        device = self._pytorch_model_config.get("device", "auto")
+        self._device = select_device(device)
+        self._device = (
+            "auto"
+            if self._device == "cuda" and self.quantization is None
+            else self._device
+        )
+
+    def load_processor(self):
+        from transformers import AutoProcessor, AutoTokenizer
+
+        min_pixels = self._pytorch_model_config.get("min_pixels")
+        max_pixels = self._pytorch_model_config.get("max_pixels")
+        self._processor = AutoProcessor.from_pretrained(
+            self.model_path,
+            trust_remote_code=True,
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
+        )
+
+        self._tokenizer = AutoTokenizer.from_pretrained(
+            self.model_path, trust_remote_code=True
+        )
+
+    def load_multimodal_model(self):
+        from transformers import AutoModel
+        from transformers.generation import GenerationConfig
+
+        if "int4" in self.model_path:
+            model = AutoModel.from_pretrained(self.model_path, trust_remote_code=True)
+        else:
+            kwargs = self.apply_bnb_quantization()
+            model = AutoModel.from_pretrained(
+                self.model_path,
+                trust_remote_code=True,
+                torch_dtype=torch.float16,
+                device_map=self._device,
+                **kwargs,
+            )
+        self._model = model.eval()
+        # Specify hyperparameters for generation
+        self._model.generation_config = GenerationConfig.from_pretrained(
+            self.model_path,
+            trust_remote_code=True,
+        )
+        self._device = self._model.device
+
+    def _message_content_to_chat(self, content):
+        MAX_NUM_FRAMES = 64
+
+        def encode_video(video_path):
+            from decord import VideoReader, cpu
+
+            def uniform_sample(l, n):
+                gap = len(l) / n
+                idxs = [int(i * gap + gap / 2) for i in range(n)]
+                return [l[i] for i in idxs]
+
+            vr = VideoReader(video_path, ctx=cpu(0))
+            sample_fps = round(vr.get_avg_fps() / 1)  # FPS
+            frame_idx = [i for i in range(0, len(vr), sample_fps)]
+            if len(frame_idx) > MAX_NUM_FRAMES:
+                frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)
+            frames = vr.get_batch(frame_idx).asnumpy()
+            frames = [Image.fromarray(v.astype("uint8")) for v in frames]
+            logger.info(
+                f"Num frames: {len(frames)} when decoding video for {self.model_uid}"
+            )
+            return frames
+
+        def _load_video(_url):
+            frames = None
+            if _url.startswith("data:"):
+                raise RuntimeError("Only video url format is supported")
+            else:
+                frames = encode_video(_url)
+            return frames
+
+        if not isinstance(content, str):
+            texts = []
+            image_urls = []
+            video_urls = []
+            for c in content:
+                c_type = c.get("type")
+                if c_type == "text":
+                    texts.append(c["text"])
+                elif c_type == "image_url":
+                    image_urls.append(c["image_url"]["url"])
+                elif c_type == "video_url":
+                    video_urls.append(c["video_url"]["url"])
+            image_futures = []
+            with ThreadPoolExecutor() as executor:
+                for image_url in image_urls:
+                    fut = executor.submit(_decode_image, image_url)
+                    image_futures.append(fut)
+            images = [fut.result() for fut in image_futures]
+            frames = []
+            if len(video_urls) > 1:
+                raise RuntimeError("Only one video per message is supported")
+            for v in video_urls:
+                frames = _load_video(v)
+            text = " ".join(texts)
+            return text, images, frames
+        return content, [], []
+
+    def _convert_to_specific_style(self, messages: List[Dict]) -> Tuple:
+        video_existed = False
+        prompt, _, chat_history = parse_messages(messages)
+
+        content, images_chat, video_frames = self._message_content_to_chat(prompt)
+        if len(video_frames) > 0:
+            video_existed = True
+            images_chat = video_frames
+
+        msgs = []
+        query_to_response: List[Dict] = []
+        for h in chat_history or []:
+            images_history = []
+            role = h["role"]
+            content_h, images_tmp, video_frames_h = self._message_content_to_chat(
+                h["content"]
+            )
+            if images_tmp != []:
+                images_history = images_tmp
+            if len(video_frames_h) > 0:
+                video_existed = True
+                images_history = video_frames_h
+            if len(query_to_response) == 0 and role == "user":
+                query_to_response.append(
+                    {"role": "user", "content": images_history + [content_h]}
+                )
+            if len(query_to_response) == 1 and role == "assistant":
+                query_to_response.append(
+                    {"role": "assistant", "content": images_history + [content_h]}
+                )
+            if len(query_to_response) == 2:
+                msgs.extend(query_to_response)
+                query_to_response = []
+        msgs.append({"role": "user", "content": images_chat + [content]})
+        return msgs, video_existed
+
+    def build_inputs_from_messages(
+        self,
+        messages: List[Dict],
+        generate_config: Dict,
+    ):
+        msgs, video_existed = self._convert_to_specific_style(messages)
+        # Set decode params for video
+        params = {}
+        if video_existed:
+            params = {"use_image_id": False, "max_slice_nums": 1}
+        return dict(msgs=msgs, image=None, **params)
+
+    def build_generate_kwargs(
+        self,
+        generate_config: Dict,
+    ) -> Dict[str, Any]:
+        return dict(**generate_config)
+
+    def build_streaming_iter(
+        self,
+        messages: List[Dict],
+        generate_config: Dict,
+    ) -> Tuple[Iterator, int]:
+        inputs = self.build_inputs_from_messages(messages, generate_config)
+        config = self.build_generate_kwargs(generate_config)
+        chat_iter = self._model.chat(
+            **inputs, **config, tokenizer=self._tokenizer, sampling=True
+        )
+
+        return chat_iter, -1
+
+    def prepare_sanitize_generate_config(self, req: InferenceRequest):
+        """
+        Refer to MiniCPM-V-4.5 documentation for generation parameters
+        """
+        raw_config = req.inference_kwargs.get("raw_params", {})
+        temperature = raw_config.get("temperature", None)
+        if temperature is None:
+            raw_config["temperature"] = 0.7
+        top_p = raw_config.get("top_p", None)
+        if top_p is None:
+            raw_config["top_p"] = 0.8
+        top_k = raw_config.get("top_k", None)
+        if top_k is None:
+            raw_config["top_k"] = 100
+        repetition_penalty = raw_config.get("repetition_penalty", None)
+        if repetition_penalty is None:
+            raw_config["repetition_penalty"] = 1.05
+        return raw_config
+
+    def _handle_input_ids_and_images(self, msgs: List[Dict]) -> Dict:
+        """
+        Handle input IDs and images for MiniCPM-V-4.5
+        Based on MiniCPM-V-2.6 implementation with adaptations for 4.5
+        """
+        from copy import deepcopy
+
+        copy_msgs = deepcopy(msgs)
+
+        images = []
+        for i, msg in enumerate(copy_msgs):
+            role = msg["role"]
+            content = msg["content"]
+            assert role in ["user", "assistant"]
+            if i == 0:
+                assert role == "user", "The role of first msg should be user"
+            if isinstance(content, str):
+                content = [content]
+            cur_msgs = []
+            for c in content:
+                if isinstance(c, Image.Image):
+                    images.append(c)
+                    cur_msgs.append("(<image>./</image>)")
+                elif isinstance(c, str):
+                    cur_msgs.append(c)
+            msg["content"] = "\n".join(cur_msgs)
+
+        return {
+            "prompt": self._processor.tokenizer.apply_chat_template(
+                copy_msgs, tokenize=False, add_generation_prompt=True
+            ),
+            "input_image": images,
+        }
+
+    def _get_full_prompt(self, messages: List[Dict], tools, generate_config: dict):  # type: ignore
+        msgs, video_existed = self._convert_to_specific_style(messages)
+        if video_existed:
+            raise RuntimeError(
+                f"Continuous batching does not support video inputs for this model: {self.model_uid}"
+            )
+        return self._handle_input_ids_and_images(msgs)
+
+    def build_prefill_kwargs(self, prompts: List, req_list: List[InferenceRequest]):
+        prompts_lists = [x["prompt"] for x in prompts]
+        input_images_lists = [x["input_image"] for x in prompts]
+        inputs = self._processor(
+            prompts_lists,
+            input_images_lists,
+            max_slice_nums=None,
+            use_image_id=None,
+            return_tensors="pt",
+            max_length=8192,
+        ).to(self._model.device)
+        inputs.pop("image_sizes")
+
+        masked_input_ids = inputs["input_ids"] * inputs["attention_mask"]
+        for i in range(masked_input_ids.shape[0]):
+            non_zero_values = masked_input_ids[i][masked_input_ids[i] != 0].tolist()
+            req_list[i].prompt_tokens = non_zero_values
+            req_list[i].extra_kwargs["attention_mask_seq_len"] = len(non_zero_values)
+            req_list[i].padding_len = masked_input_ids.shape[1] - len(non_zero_values)
+
+        model_inputs = {
+            "input_ids": inputs["input_ids"],
+            "image_bound": inputs["image_bound"],
+            "pixel_values": inputs["pixel_values"],
+            "tgt_sizes": inputs["tgt_sizes"],
+        }
+        model_inputs["inputs_embeds"], _ = self._model.get_vllm_embedding(model_inputs)
+
+        return {
+            "inputs_embeds": model_inputs["inputs_embeds"],
+            "attention_mask": inputs["attention_mask"],
+        }
+
+    def build_decode_position_ids(
+        self, batch_size: int, seq_length: int, reqs: List[InferenceRequest]
+    ):
+        return None
+
+    def batch_inference(self, req_list: List[InferenceRequest]):
+        """
+        This method is rewritten
+        because the specific inference process is performed by `self._model.llm`,
+        not `self._model` itself
+        """
+        from ..utils import batch_inference_one_step
+
+        self.prepare_batch_inference(req_list)
+        batch_inference_one_step(
+            self, req_list, self.model_uid, self._model.llm, self._tokenizer
+        )
+        self.handle_batch_inference_results(req_list)
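
The encode_video helper above caps a video at MAX_NUM_FRAMES = 64 by sampling evenly spaced frame indices. A standalone illustration of that sampling logic, using a hypothetical 200-frame index list (the helper itself is copied verbatim from the hunk above):

    # uniform_sample copied from encode_video above; the 200-frame input is hypothetical.
    def uniform_sample(l, n):
        gap = len(l) / n
        idxs = [int(i * gap + gap / 2) for i in range(n)]
        return [l[i] for i in idxs]

    frame_idx = list(range(200))           # e.g. one index per sampled second of video
    sampled = uniform_sample(frame_idx, 64)
    print(len(sampled), sampled[:3])       # 64 [1, 4, 7]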
xinference/model/llm/transformers/utils.py
@@ -281,11 +281,34 @@ def _batch_inference_one_step_internal(
             r.append_new_token(token)
 
         if decode_reqs:
+            # Ensure all decode requests have the same kv_cache reference
+            # This prevents batch size mismatches during merging
             decode_kv = decode_reqs[0].kv_cache
+
+            # Verify that all decode requests share the same kv_cache
+            for req in decode_reqs[1:]:
+                if req.kv_cache is not decode_kv:
+                    logger.warning(
+                        "Inconsistent kv_cache references detected in decode requests. "
+                        "This may indicate a batching synchronization issue."
+                    )
+                    # Use the first decode_kv as the reference to maintain consistency
+                    req.kv_cache = decode_kv
+
             # prefill and decode kv cache need to be merged at `batch_size` and `seq_len` dimensions.
             merged_kv_cache = xinf_model_obj.merge_kv_cache(decode_kv, past_key_values)
+            # Update sequence length information after KV cache merge
+            _, merged_seq_len = get_batch_size_and_seq_len_from_kv_cache(
+                merged_kv_cache, xinf_model_obj
+            )
             for r in valid_req_list:
                 r.kv_cache = merged_kv_cache
+                # Update attention mask sequence length to match merged KV cache
+                if "attention_mask_seq_len" in r.extra_kwargs:
+                    # Ensure the attention mask length doesn't exceed the merged sequence length
+                    r.extra_kwargs["attention_mask_seq_len"] = min(
+                        r.extra_kwargs["attention_mask_seq_len"], merged_seq_len - 1
+                    )
             empty_cache()
         else:
             for r in valid_req_list:
xinference/model/llm/utils.py
@@ -75,6 +75,8 @@ QWEN_TOOL_CALL_FAMILY = [
     "Qwen3-VL-Thinking",
     "Qwen3-Next-Instruct",
     "Qwen3-Next-Thinking",
+    "Qwen3-Omni-Instruct",
+    "Qwen3-Omni-Thinking",
 ]
 
 GLM4_TOOL_CALL_FAMILY = [
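
The two Qwen3-Omni entries added above opt those model families into Qwen-style tool-call handling, where the model wraps each call in the QWEN_TOOL_CALL_SYMBOLS markers shown in the hunk context below. For orientation only, a hypothetical sketch of what that extraction amounts to (the project's actual parser lives in qwen_tool_parser.py, file 13 above):

    # Hypothetical sketch, not xinference's parser: pull JSON bodies out of
    # <tool_call>...</tool_call> blocks emitted by Qwen-family models.
    import json
    import re

    QWEN_TOOL_CALL_SYMBOLS = ["<tool_call>", "</tool_call>"]

    def extract_tool_calls_sketch(text):
        pattern = (
            re.escape(QWEN_TOOL_CALL_SYMBOLS[0])
            + r"(.*?)"
            + re.escape(QWEN_TOOL_CALL_SYMBOLS[1])
        )
        calls = []
        for body in re.findall(pattern, text, flags=re.DOTALL):
            try:
                payload = json.loads(body.strip())
                calls.append((payload.get("name"), payload.get("arguments")))
            except json.JSONDecodeError:
                continue  # malformed block; the real parser surfaces it as failed content
        return calls

    sample = '<tool_call>{"name": "get_weather", "arguments": {"city": "Paris"}}</tool_call>'
    print(extract_tool_calls_sketch(sample))  # [('get_weather', {'city': 'Paris'})]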
xinference/model/llm/utils.py (continued)
@@ -100,7 +102,6 @@ QWEN_TOOL_CALL_SYMBOLS = ["<tool_call>", "</tool_call>"]
 
 
 class ChatModelMixin:
-
     def __init__(self):
         self.model_family = None
         self.model_uid = None
@@ -143,7 +144,7 @@ class ChatModelMixin:
         tokenize=False,
         **kwargs,
     ):
-        if "vision" not in self.model_family.model_ability:  # type: ignore
+        if "vision" not in self.model_family.model_ability and "audio" not in self.model_family.model_ability:  # type: ignore
             messages = self.convert_messages_with_content_list_to_str_conversion(
                 messages
             )
@@ -186,8 +187,7 @@ class ChatModelMixin:
                 return kwargs
             else:
                 raise TypeError(
-                    f"`chat_template_kwargs` but be a JSON parsable str "
-                    f"or dict, got: {kwargs}"
+                    f"`chat_template_kwargs` but be a JSON parsable str or dict, got: {kwargs}"
                 )
         elif reasoning_parser and not reasoning_parser.enable_thinking:
             # hybrid model like qwen3,
@@ -853,11 +853,11 @@ class ChatModelMixin:
             "tool_calls": tool_calls,
         }
 
-        try:
-            usage = c.get("usage")
-            assert "prompt_tokens" in usage
-        except Exception:
+        # For tool completion chunks, use None for usage, actual values for stop
+        if finish_reason == "tool_calls":
             usage = None
+        else:
+            usage = c.get("usage")
         return {
             "id": "chat" + f"cmpl-{_id}",
             "model": model_uid,
@@ -882,25 +882,32 @@ class ChatModelMixin:
     ):
         if not self.tool_parser:
             return self._get_final_chat_completion_chunk(c)
-        if self.reasoning_parser:
-            c = self.reasoning_parser.prepare_reasoning_content(c)
+
         _id = str(uuid.uuid4())
         reasoning_content = None
+        content = ""
+
+        # First, process reasoning content if reasoning parser exists
+        text = c["choices"][0]["text"]
         if self.reasoning_parser and self.reasoning_parser.check_content_parser():
-            text = c["choices"][0]["text"]
-            reasoning_content, content = (
+            # Extract reasoning content directly from the original text
+            reasoning_content, processed_content = (
                 self.reasoning_parser.extract_reasoning_content(text)
             )
-            c["choices"][0]["text"] = content
+            # Use the processed content (without thinking tags) for tool parsing
+            if processed_content:
+                text = processed_content
 
+        # Then, extract tool calls from the processed text (without thinking tags)
         tool_calls = []
        failed_contents = []
         if isinstance(self.tool_parser, Glm4ToolParser):
             tool_result = self.tool_parser.extract_tool_calls(c)
         else:
-            text = c["choices"][0]["text"]
             tool_result = self.tool_parser.extract_tool_calls(text)
-        for content, func, args in tool_result:
+
+        # Process tool results
+        for tool_content, func, args in tool_result:
             if func:
                 tool_calls.append(
                     {
@@ -913,25 +920,31 @@ class ChatModelMixin:
                     }
                 )
             else:
-                if content:
-                    failed_contents.append(content)
-        finish_reason = "tool_calls" if tool_calls else "stop"
+                if tool_content:
+                    failed_contents.append(tool_content)
 
-        content = "".join(failed_contents) if failed_contents else None
+        # Determine the final content
+        if tool_calls:
+            # For tool calls, the main content should be empty or contain only non-tool parts
+            content = "".join(failed_contents) if failed_contents else ""
+        else:
+            # For non-tool calls, use the processed content from reasoning parser
+            content = text
+
+        finish_reason = "tool_calls" if tool_calls else "stop"
 
         m = {
             "role": "assistant",
-            "content": content if content else "",
+            "content": content,
             "tool_calls": tool_calls,
         }
         # add only reasoning_content is None
         if reasoning_content is not None:
             m["reasoning_content"] = reasoning_content
 
-        try:
-            usage = c.get("usage")
-            assert "prompt_tokens" in usage
-        except Exception:
+        # For tool completion chunks, use actual usage values when available
+        usage = c.get("usage")
+        if not usage or not isinstance(usage, dict) or "prompt_tokens" not in usage:
             usage = {
                 "prompt_tokens": -1,
                 "completion_tokens": -1,