xinference 1.5.0.post2__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (137)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +107 -11
  3. xinference/client/restful/restful_client.py +51 -11
  4. xinference/constants.py +5 -1
  5. xinference/core/media_interface.py +758 -0
  6. xinference/core/model.py +49 -9
  7. xinference/core/supervisor.py +1 -1
  8. xinference/core/utils.py +1 -1
  9. xinference/core/worker.py +33 -39
  10. xinference/deploy/cmdline.py +17 -0
  11. xinference/deploy/utils.py +0 -3
  12. xinference/model/audio/__init__.py +16 -27
  13. xinference/model/audio/core.py +2 -1
  14. xinference/model/audio/cosyvoice.py +4 -2
  15. xinference/model/audio/model_spec.json +63 -46
  16. xinference/model/audio/model_spec_modelscope.json +31 -14
  17. xinference/model/embedding/__init__.py +16 -24
  18. xinference/model/image/__init__.py +15 -25
  19. xinference/model/llm/__init__.py +40 -115
  20. xinference/model/llm/core.py +29 -6
  21. xinference/model/llm/llama_cpp/core.py +30 -347
  22. xinference/model/llm/llm_family.json +1674 -2203
  23. xinference/model/llm/llm_family.py +71 -7
  24. xinference/model/llm/llm_family_csghub.json +0 -32
  25. xinference/model/llm/llm_family_modelscope.json +1838 -2016
  26. xinference/model/llm/llm_family_openmind_hub.json +19 -325
  27. xinference/model/llm/lmdeploy/core.py +7 -2
  28. xinference/model/llm/mlx/core.py +23 -7
  29. xinference/model/llm/reasoning_parser.py +281 -5
  30. xinference/model/llm/sglang/core.py +39 -11
  31. xinference/model/llm/transformers/chatglm.py +9 -2
  32. xinference/model/llm/transformers/cogagent.py +10 -12
  33. xinference/model/llm/transformers/cogvlm2.py +6 -3
  34. xinference/model/llm/transformers/cogvlm2_video.py +3 -6
  35. xinference/model/llm/transformers/core.py +58 -60
  36. xinference/model/llm/transformers/deepseek_v2.py +4 -2
  37. xinference/model/llm/transformers/deepseek_vl.py +10 -4
  38. xinference/model/llm/transformers/deepseek_vl2.py +9 -4
  39. xinference/model/llm/transformers/gemma3.py +4 -5
  40. xinference/model/llm/transformers/glm4v.py +3 -21
  41. xinference/model/llm/transformers/glm_edge_v.py +3 -20
  42. xinference/model/llm/transformers/intern_vl.py +3 -6
  43. xinference/model/llm/transformers/internlm2.py +1 -1
  44. xinference/model/llm/transformers/minicpmv25.py +4 -2
  45. xinference/model/llm/transformers/minicpmv26.py +5 -3
  46. xinference/model/llm/transformers/omnilmm.py +1 -1
  47. xinference/model/llm/transformers/opt.py +1 -1
  48. xinference/model/llm/transformers/ovis2.py +302 -0
  49. xinference/model/llm/transformers/qwen-omni.py +8 -1
  50. xinference/model/llm/transformers/qwen2_audio.py +3 -1
  51. xinference/model/llm/transformers/qwen2_vl.py +5 -1
  52. xinference/model/llm/transformers/qwen_vl.py +5 -2
  53. xinference/model/llm/utils.py +96 -45
  54. xinference/model/llm/vllm/core.py +108 -24
  55. xinference/model/llm/vllm/distributed_executor.py +8 -7
  56. xinference/model/llm/vllm/xavier/allocator.py +1 -1
  57. xinference/model/llm/vllm/xavier/block_manager.py +1 -1
  58. xinference/model/llm/vllm/xavier/block_tracker.py +3 -3
  59. xinference/model/llm/vllm/xavier/executor.py +1 -1
  60. xinference/model/llm/vllm/xavier/test/test_xavier.py +2 -11
  61. xinference/model/rerank/__init__.py +13 -24
  62. xinference/model/video/__init__.py +15 -25
  63. xinference/model/video/core.py +3 -3
  64. xinference/model/video/diffusers.py +157 -13
  65. xinference/model/video/model_spec.json +100 -0
  66. xinference/model/video/model_spec_modelscope.json +104 -0
  67. xinference/thirdparty/cosyvoice/bin/average_model.py +5 -4
  68. xinference/thirdparty/cosyvoice/bin/export_jit.py +50 -20
  69. xinference/thirdparty/cosyvoice/bin/export_onnx.py +136 -51
  70. xinference/thirdparty/cosyvoice/bin/inference.py +15 -5
  71. xinference/thirdparty/cosyvoice/bin/train.py +7 -2
  72. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +72 -52
  73. xinference/thirdparty/cosyvoice/cli/frontend.py +58 -58
  74. xinference/thirdparty/cosyvoice/cli/model.py +140 -155
  75. xinference/thirdparty/cosyvoice/dataset/processor.py +9 -5
  76. xinference/thirdparty/cosyvoice/flow/decoder.py +656 -54
  77. xinference/thirdparty/cosyvoice/flow/flow.py +69 -11
  78. xinference/thirdparty/cosyvoice/flow/flow_matching.py +167 -63
  79. xinference/thirdparty/cosyvoice/flow/length_regulator.py +1 -0
  80. xinference/thirdparty/cosyvoice/hifigan/discriminator.py +91 -1
  81. xinference/thirdparty/cosyvoice/hifigan/f0_predictor.py +4 -1
  82. xinference/thirdparty/cosyvoice/hifigan/generator.py +4 -1
  83. xinference/thirdparty/cosyvoice/hifigan/hifigan.py +2 -2
  84. xinference/thirdparty/cosyvoice/llm/llm.py +198 -18
  85. xinference/thirdparty/cosyvoice/transformer/embedding.py +12 -4
  86. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +124 -21
  87. xinference/thirdparty/cosyvoice/utils/class_utils.py +13 -0
  88. xinference/thirdparty/cosyvoice/utils/common.py +1 -1
  89. xinference/thirdparty/cosyvoice/utils/file_utils.py +40 -2
  90. xinference/thirdparty/cosyvoice/utils/frontend_utils.py +7 -0
  91. xinference/thirdparty/cosyvoice/utils/mask.py +4 -0
  92. xinference/thirdparty/cosyvoice/utils/train_utils.py +5 -1
  93. xinference/thirdparty/matcha/hifigan/xutils.py +3 -3
  94. xinference/types.py +2 -71
  95. xinference/web/ui/build/asset-manifest.json +6 -6
  96. xinference/web/ui/build/index.html +1 -1
  97. xinference/web/ui/build/static/css/{main.0f6523be.css → main.337afe76.css} +2 -2
  98. xinference/web/ui/build/static/css/main.337afe76.css.map +1 -0
  99. xinference/web/ui/build/static/js/main.ae579a97.js +3 -0
  100. xinference/web/ui/build/static/js/main.ae579a97.js.map +1 -0
  101. xinference/web/ui/node_modules/.cache/babel-loader/0196a4b09e3264614e54360d5f832c46b31d964ec58296765ebff191ace6adbf.json +1 -0
  102. xinference/web/ui/node_modules/.cache/babel-loader/12e02ee790dbf57ead09a241a93bb5f893393aa36628ca741d44390e836a103f.json +1 -0
  103. xinference/web/ui/node_modules/.cache/babel-loader/18fa271456b31cded36c05c4c71c6b2b1cf4e4128c1e32f0e45d8b9f21764397.json +1 -0
  104. xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +1 -0
  105. xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +1 -0
  106. xinference/web/ui/node_modules/.cache/babel-loader/5c08e2cd07809ed3e41486b16652253404cbb63a3ff8d0366ee50f57e2413cea.json +1 -0
  107. xinference/web/ui/node_modules/.cache/babel-loader/6798e126f3bc5f95a4c16a9c2ad52ffe77970c62406d83e20604dfda7ffd2247.json +1 -0
  108. xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +1 -0
  109. xinference/web/ui/node_modules/.cache/babel-loader/b617f7d21a95045fc57b26a9373551740f1978a826134cbf705c3a1bf8714a93.json +1 -0
  110. xinference/web/ui/node_modules/.cache/babel-loader/c1506cb142151366074975f30fa1ff9cd6e5e978b62a4b074dfc16fe08d70d75.json +1 -0
  111. xinference/web/ui/node_modules/.cache/babel-loader/c5c7c2cd1b863ce41adff2c4737bba06eef3a1acf28288cb83d992060f6b8923.json +1 -0
  112. xinference/web/ui/src/locales/en.json +7 -4
  113. xinference/web/ui/src/locales/zh.json +7 -4
  114. {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/METADATA +56 -36
  115. {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/RECORD +120 -121
  116. {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/WHEEL +1 -1
  117. xinference/core/image_interface.py +0 -377
  118. xinference/model/llm/transformers/compression.py +0 -258
  119. xinference/model/llm/transformers/yi_vl.py +0 -239
  120. xinference/thirdparty/cosyvoice/bin/export_trt.sh +0 -9
  121. xinference/web/ui/build/static/css/main.0f6523be.css.map +0 -1
  122. xinference/web/ui/build/static/js/main.4b67a723.js +0 -3
  123. xinference/web/ui/build/static/js/main.4b67a723.js.map +0 -1
  124. xinference/web/ui/node_modules/.cache/babel-loader/0f0adb2283a8f469d097a7a0ebb754624fa52414c83b83696c41f2e6a737ceda.json +0 -1
  125. xinference/web/ui/node_modules/.cache/babel-loader/51709f5d3e53bcf19e613662ef9b91fb9174942c5518987a248348dd4e1e0e02.json +0 -1
  126. xinference/web/ui/node_modules/.cache/babel-loader/8157db83995c671eb57abc316c337f867d1dc63fb83520bb4ff351fee57dcce2.json +0 -1
  127. xinference/web/ui/node_modules/.cache/babel-loader/8f9af2979e45d4648f0cfae108363e58ee421c29a9d4e7329b6f06d9adfd4133.json +0 -1
  128. xinference/web/ui/node_modules/.cache/babel-loader/9c8b1a86e7c65b2b2599a205e30920652d6c2105f926508ef5bcf29a3ef4ce76.json +0 -1
  129. xinference/web/ui/node_modules/.cache/babel-loader/b8551e9775a01b28ae674125c688febe763732ea969ae344512e64ea01bf632e.json +0 -1
  130. xinference/web/ui/node_modules/.cache/babel-loader/e4ba658c6b3b0490910acdae0c535a892257efb61539a24adf8038fc653bd22f.json +0 -1
  131. xinference/web/ui/node_modules/.cache/babel-loader/efe7cd132c27a8f9fd5352a394c491fd5fb0da0348cf9fcbd923164a32365eab.json +0 -1
  132. xinference/web/ui/node_modules/.cache/babel-loader/f04f666b77b44d7be3e16034d6b0074de2ba9c254f1fae15222b3148608fa8b3.json +0 -1
  133. xinference/web/ui/node_modules/.cache/babel-loader/f199e8173f6409a5802ed44acb95f218388131136504b2e9132129e150c92f9a.json +0 -1
  134. /xinference/web/ui/build/static/js/{main.4b67a723.js.LICENSE.txt → main.ae579a97.js.LICENSE.txt} +0 -0
  135. {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/entry_points.txt +0 -0
  136. {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/licenses/LICENSE +0 -0
  137. {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/top_level.txt +0 -0
@@ -42,6 +42,7 @@ from ...types import (
     ChatCompletion,
     ChatCompletionChoice,
     ChatCompletionChunk,
+    ChatCompletionChunkChoice,
     ChatCompletionChunkDelta,
     ChatCompletionMessage,
     Completion,
@@ -68,6 +69,11 @@ QWEN_TOOL_CALL_FAMILY = [
     "qwen2-moe-instruct",
     "qwen2.5-instruct",
     "qwen2.5-coder-instruct",
+    "XiYanSQL-QwenCoder-2504",
+    "QwQ-32B",
+    "qwen3",
+    "HuatuoGPT-o1-Qwen2.5",
+    "DianJin-R1",
 ]
 
 GLM4_TOOL_CALL_FAMILY = [
@@ -77,6 +83,7 @@ GLM4_TOOL_CALL_FAMILY = [
 
 LLAMA3_TOOL_CALL_FAMILY = [
     "llama-3.1-instruct",
+    "HuatuoGPT-o1-LLaMA-3.1",
 ]
 
 DEEPSEEK_TOOL_CALL_FAMILY = [
@@ -143,6 +150,7 @@ class ChatModelMixin:
                     add_generation_prompt=True,
                     **kwargs,
                 )
+                logger.debug("Prompt: %s", full_context)
                 return full_context
             except Exception as e:
                 logger.warning(
@@ -154,6 +162,36 @@ class ChatModelMixin:
         # Compilation function uses a cache to avoid recompiling the same template
         return self._build_from_raw_template(messages, chat_template, **kwargs)
 
+    @staticmethod
+    def _get_chat_template_kwargs_from_generate_config(
+        generate_config: Optional[Union[dict, Any]],
+        reasoning_parser: Optional[ReasoningParser] = None,
+    ) -> Optional[dict]:
+        if reasoning_parser and not reasoning_parser.enable_thinking:
+            # hybrid model like qwen3,
+            # disabled thinking
+            return {"enable_thinking": False}
+        if not generate_config:
+            return None
+        if "chat_template_kwargs" in generate_config:
+            kwargs = generate_config["chat_template_kwargs"]
+            if isinstance(kwargs, str):
+                try:
+                    return json.loads(kwargs)
+                except json.JSONDecodeError:
+                    raise TypeError(
+                        f"`chat_template_kwargs` should be json parsable, "
+                        f"got: {kwargs}"
+                    )
+            elif isinstance(kwargs, dict):
+                return kwargs
+            else:
+                raise TypeError(
+                    f"`chat_template_kwargs` but be a JSON parsable str "
+                    f"or dict, got: {kwargs}"
+                )
+        return None
+
     @staticmethod
     def convert_messages_with_content_list_to_str_conversion(
         messages: List[Dict],
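
The new `_get_chat_template_kwargs_from_generate_config` helper forwards extra chat-template variables to prompt rendering: `chat_template_kwargs` may be given in `generate_config` as a dict or a JSON-parsable string, and thinking is forced off when a hybrid reasoning model was launched with thinking disabled. A minimal client-side sketch of how this surfaces, assuming a locally running endpoint and a Qwen3-style model UID (both placeholders):

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")  # hypothetical endpoint
    model = client.get_model("qwen3")  # hypothetical model UID

    # `chat_template_kwargs` may be a dict or a JSON string; here it disables
    # the "thinking" block that a hybrid reasoning model emits by default.
    completion = model.chat(
        messages=[{"role": "user", "content": "Hello"}],
        generate_config={"chat_template_kwargs": {"enable_thinking": False}},
    )
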
@@ -257,7 +295,7 @@ class ChatModelMixin:
                 and "delta" in choices[0]
             ):
                 if choices[0]["finish_reason"] is None:
-                    if reasoning_parser is not None:
+                    if reasoning_parser and reasoning_parser.check_content_parser():
                         # process parsing reasoning content
                         assert previous_texts is not None
                         delta = choices[0]["delta"]  # type: ignore
@@ -274,7 +312,7 @@ class ChatModelMixin:
                 delta = choices[0]["delta"]  # type: ignore
                 if "content" not in delta:
                     delta["content"] = ""  # type: ignore
-                if reasoning_parser is not None:
+                if reasoning_parser and reasoning_parser.check_content_parser():
                     delta["reasoning_content"] = None  # type: ignore
             # Already a ChatCompletionChunk, we don't need to convert chunk.
             return cast(ChatCompletionChunk, chunk)
@@ -283,7 +321,7 @@ class ChatModelMixin:
         for i, choice in enumerate(choices):  # type: ignore
             delta = ChatCompletionChunkDelta()
             if "text" in choice and choice["finish_reason"] is None:
-                if reasoning_parser is None:
+                if not reasoning_parser or not reasoning_parser.check_content_parser():
                     delta["content"] = choice["text"]
                 else:
                     assert previous_texts is not None
@@ -296,7 +334,7 @@ class ChatModelMixin:
                     previous_texts[-1] = current_text
             elif "text" in choice and choice["finish_reason"] is not None:
                 delta["content"] = choice["text"]
-                if reasoning_parser is not None:
+                if reasoning_parser and reasoning_parser.check_content_parser():
                     delta["reasoning_content"] = None
             elif "tool_calls" in choice:
                 delta["tool_calls"] = choice["tool_calls"]
@@ -310,7 +348,9 @@ class ChatModelMixin:
         assert choices is not None
         usage = (
             chunk["usage"]
-            if choices[0]["finish_reason"] is not None and reasoning_parser is not None
+            if choices[0]["finish_reason"] is not None
+            and reasoning_parser
+            and reasoning_parser.check_content_parser()
             else None
         )
         chat_chunk = {
@@ -328,28 +368,32 @@ class ChatModelMixin:
         cls,
         chunk: CompletionChunk,
         reasoning_parser: Optional[ReasoningParser] = None,
-    ) -> ChatCompletionChunk:
-        choices_list = []
+    ) -> List[ChatCompletionChunk]:
+        choices_list: List[ChatCompletionChunkChoice] = []
+        chunks: List[ChatCompletionChunk] = []
         for i, choice in enumerate(chunk["choices"]):
             delta = ChatCompletionChunkDelta(role="assistant", content="")
-            if reasoning_parser is not None:
+            if reasoning_parser and reasoning_parser.check_content_parser():
                 delta["content"] = None
                 delta["reasoning_content"] = ""
             choices_list.append(
-                {
-                    "index": i,
-                    "delta": delta,
-                    "finish_reason": None,
-                }
+                ChatCompletionChunkChoice(
+                    index=i,
+                    delta=delta,
+                    finish_reason=None,
+                )
             )
-        chat_chunk = {
-            "id": "chat" + chunk["id"],
-            "model": chunk["model"],
-            "created": chunk["created"],
-            "object": "chat.completion.chunk",
-            "choices": choices_list,
-        }
-        return cast(ChatCompletionChunk, chat_chunk)
+        chat_chunk = ChatCompletionChunk(
+            id="chat" + chunk["id"],
+            model=chunk["model"],
+            created=chunk["created"],
+            object="chat.completion.chunk",
+            choices=choices_list,
+        )
+        chunks.append(chat_chunk)
+        if reasoning_parser:
+            chunks.extend(reasoning_parser.prepare_first_reasoning_content_chunk(chunk))
+        return chunks
 
     @classmethod
     def _get_final_chat_completion_chunk(
@@ -374,6 +418,8 @@ class ChatModelMixin:
         reasoning_parse: Optional[ReasoningParser] = None,
     ) -> Iterator[ChatCompletionChunk]:
         previous_texts = [""]
+        if reasoning_parse:
+            chunks = reasoning_parse.prepare_reasoning_content_sync(chunks)
         for _, chunk in enumerate(chunks):
             # usage
             choices = chunk.get("choices")
@@ -421,6 +467,9 @@ class ChatModelMixin:
         reasoning_parser: Optional[ReasoningParser] = None,
     ) -> AsyncGenerator[ChatCompletionChunk, None]:
         previous_texts = [""]
+        # Process chunks
+        if reasoning_parser:
+            chunks = reasoning_parser.prepare_reasoning_content_streaming(chunks)
         async for chunk in chunks:
             choices = chunk.get("choices")
             if not choices:
@@ -436,19 +485,25 @@ class ChatModelMixin:
     def _to_chat_completion(
         completion: Completion, reasoning_parser: Optional[ReasoningParser] = None
     ) -> ChatCompletion:
+        # prepare reasoning content
+        if reasoning_parser:
+            completion = reasoning_parser.prepare_reasoning_content(completion)
+
         if completion.get("object") == "chat.completion" and completion.get("choices"):
             # Already a ChatCompletion
-            if reasoning_parser is not None:
-                for choice in completion["choices"]:
-                    message = choice["message"]  # type: ignore
-                    text = message["content"]
+            for choice in completion["choices"]:
+                message = choice["message"]  # type: ignore
+                text = message["content"]  # Original content from the message
+
+                if reasoning_parser and reasoning_parser.check_content_parser():
+                    # Parse into reasoning and content parts
                     (
-                        reasoning_content,
-                        content,
+                        reasoning_val,
+                        content_val,
                     ) = reasoning_parser.extract_reasoning_content(text)
-                    message["content"] = content
-                    if reasoning_content is not None:
-                        message["reasoning_content"] = reasoning_content
+                    message["content"] = content_val
+                    if reasoning_val is not None:
+                        message["reasoning_content"] = reasoning_val
             return cast(ChatCompletion, completion)
 
         choices = []
@@ -456,7 +511,7 @@ class ChatModelMixin:
             content = choice["text"]
             reasoning_content = None
 
-            if reasoning_parser is not None:
+            if reasoning_parser and reasoning_parser.check_content_parser():
                 reasoning_content, content = reasoning_parser.extract_reasoning_content(  # type: ignore
                     choice
                 )
@@ -653,20 +708,12 @@ class ChatModelMixin:
                 failed_contents.append(content)
         finish_reason = "tool_calls" if tool_calls else "stop"
 
-        reasoning_content = None
         content = ". ".join(failed_contents) if failed_contents else None
-        if reasoning_parser is not None:
-            reasoning_content, content = reasoning_parser.extract_reasoning_content(  # type: ignore
-                content
-            )
         d = {
             "role": "assistant",
            "content": content,
            "tool_calls": tool_calls,
         }
-        # add only reasoning_content is None
-        if reasoning_content is not None:
-            d["reasoning_content"] = reasoning_content
 
         try:
             usage = c.get("usage")
@@ -701,7 +748,17 @@ class ChatModelMixin:
         c,
         reasoning_parser: Optional[ReasoningParser] = None,
     ):
+        if reasoning_parser:
+            c = reasoning_parser.prepare_reasoning_content(c)
         _id = str(uuid.uuid4())
+        reasoning_content = None
+        if reasoning_parser and reasoning_parser.check_content_parser():
+            text = c["choices"][0]["text"]
+            reasoning_content, content = reasoning_parser.extract_reasoning_content(
+                text
+            )
+            c["choices"][0]["text"] = content
+
         tool_result = cls._eval_tool_arguments(model_family, c)
 
         tool_calls = []
@@ -722,12 +779,6 @@ class ChatModelMixin:
                 failed_contents.append(content)
         finish_reason = "tool_calls" if tool_calls else "stop"
 
-        reasoning_content = None
-        content = ". ".join(failed_contents) if failed_contents else None
-        if reasoning_parser is not None:
-            reasoning_content, content = reasoning_parser.extract_reasoning_content(  # type: ignore
-                content
-            )
         m = {
             "role": "assistant",
             "content": content,
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import asyncio
+import importlib.util
 import itertools
 import json
 import logging
@@ -50,7 +51,7 @@ from ....types import (
     LoRA,
 )
 from .. import LLM, LLMFamilyV1, LLMSpecV1
-from ..llm_family import CustomLLMFamilyV1
+from ..llm_family import CustomLLMFamilyV1, cache_model_tokenizer_and_config
 from ..utils import (
     DEEPSEEK_TOOL_CALL_FAMILY,
     QWEN_TOOL_CALL_FAMILY,
@@ -169,6 +170,7 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.3.0":
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-instruct")
     VLLM_SUPPORTED_MODELS.append("qwen2.5-coder")
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-coder-instruct")
+    VLLM_SUPPORTED_CHAT_MODELS.append("XiYanSQL-QwenCoder-2504")
     VLLM_SUPPORTED_CHAT_MODELS.append("QwQ-32B-Preview")
     VLLM_SUPPORTED_CHAT_MODELS.append("QwQ-32B")
     VLLM_SUPPORTED_CHAT_MODELS.append("marco-o1")
@@ -176,6 +178,9 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.3.0":
     VLLM_SUPPORTED_CHAT_MODELS.append("fin-r1")
     VLLM_SUPPORTED_CHAT_MODELS.append("seallms-v3")
     VLLM_SUPPORTED_CHAT_MODELS.append("skywork-or1-preview")
+    VLLM_SUPPORTED_CHAT_MODELS.append("skywork-or1")
+    VLLM_SUPPORTED_CHAT_MODELS.append("HuatuoGPT-o1-Qwen2.5")
+    VLLM_SUPPORTED_CHAT_MODELS.append("DianJin-R1")
 
 if VLLM_INSTALLED and vllm.__version__ >= "0.3.2":
     VLLM_SUPPORTED_CHAT_MODELS.append("gemma-it")
@@ -206,6 +211,7 @@ if VLLM_INSTALLED and vllm.__version__ > "0.5.3":
     VLLM_SUPPORTED_CHAT_MODELS.append("llama-3.1-instruct")
     VLLM_SUPPORTED_CHAT_MODELS.append("llama-3.3-instruct")
     VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-r1-distill-llama")
+    VLLM_SUPPORTED_CHAT_MODELS.append("HuatuoGPT-o1-LLaMA-3.1")
 
 if VLLM_INSTALLED and vllm.__version__ >= "0.6.1":
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("internvl2")
@@ -239,6 +245,9 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.8.0":
 if VLLM_INSTALLED and vllm.__version__ >= "0.8.4":
     VLLM_SUPPORTED_CHAT_MODELS.append("glm4-0414")
 
+if VLLM_INSTALLED and vllm.__version__ >= "0.8.5":
+    VLLM_SUPPORTED_CHAT_MODELS.append("qwen3")
+
 
 class VLLMModel(LLM):
     def __init__(
@@ -330,8 +339,10 @@ class VLLMModel(LLM):
 
             raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
 
-        if vllm.__version__ >= "0.3.1":
-            # from vllm v0.3.1, it uses cupy as NCCL backend
+        from ..llm_family import LlamaCppLLMSpecV1
+
+        if "0.3.1" <= vllm.__version__ <= "0.3.3":
+            # from vllm v0.3.1 to v0.3.3, it uses cupy as NCCL backend
             # in which cupy will fork a process
             # only for xoscar >= 0.3.0, new process is allowed in subpool
             # besides, xinference set start method as forkserver for unix
@@ -341,8 +352,17 @@ class VLLMModel(LLM):
         self._device_count = self._get_cuda_count()
         self._model_config = self._sanitize_model_config(self._model_config)
         reasoning_content = self._model_config.pop("reasoning_content")
+        enable_thinking = self._model_config.pop("enable_thinking", False)
+        self.prepare_parse_reasoning_content(
+            reasoning_content, enable_thinking=enable_thinking
+        )
 
-        self.prepare_parse_reasoning_content(reasoning_content)
+        if (
+            isinstance(self.model_spec, LlamaCppLLMSpecV1)
+            and self.model_spec.model_format == "ggufv2"
+        ):
+            # gguf
+            self._preprocess_load_gguf()
 
         if self.lora_modules is None:
             self.lora_requests = []
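
On the vLLM side, `reasoning_content` and the new `enable_thinking` flag are now popped from the model config when the engine loads, so reasoning parsing is configured once at launch time rather than per request. A hedged sketch (endpoint, model name, and engine string are placeholders; extra keyword arguments to `launch_model` are what end up in this model config):

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")  # hypothetical endpoint
    model_uid = client.launch_model(
        model_name="qwen3",
        model_engine="vLLM",
        # consumed by VLLMModel.load() via self._model_config.pop(...)
        reasoning_content=True,
        enable_thinking=True,
    )
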
@@ -482,6 +502,45 @@ class VLLMModel(LLM):
             _, err, tb = self._loading_error
             raise err.with_traceback(tb)
 
+    def _preprocess_load_gguf(self):
+        # check if it is multi gguf files
+        if (
+            not os.path.isfile(self.model_path)
+            and self.model_spec.quantization_parts
+            and self.quantization in self.model_spec.quantization_parts
+        ):
+            raise RuntimeError(
+                "vllm does not support multiple gguf files, please merge them first and "
+                "provide `model_path` with merged file"
+            )
+
+        if "tokenizer" not in self._model_config:
+            # find pytorch format without quantization
+            non_quant_spec = next(
+                spec
+                for spec in self.model_family.model_specs
+                if spec.model_format == "pytorch"
+                and "none" in spec.quantizations
+                and spec.model_size_in_billions
+                == self.model_spec.model_size_in_billions
+            )
+
+            path = cache_model_tokenizer_and_config(self.model_family, non_quant_spec)
+            # other than gguf file, vllm requires to provide tokenizer and hf_config_path
+            self._model_config["tokenizer"] = self._model_config[
+                "hf_config_path"
+            ] = path
+
+        if not os.path.isfile(self.model_path):
+            self.model_path = os.path.realpath(
+                os.path.join(
+                    self.model_path,
+                    self.model_spec.model_file_name_template.format(
+                        quantization=self.quantization
+                    ),
+                )
+            )
+
     def stop(self):
         # though the vLLM engine will shutdown when deleted,
         # but some issue e.g. GH#1682 reported
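
Together with the `ggufv2` branch added to `VLLMChatModel.match_json` further down, `_preprocess_load_gguf` lets the vLLM engine serve a single-file GGUF quantization (vLLM >= 0.8.2): multi-part GGUF downloads must be merged first, and the tokenizer plus HF config are cached from the matching unquantized pytorch spec. A hedged launch sketch (model name and quantization are illustrative placeholders):

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")  # hypothetical endpoint
    model_uid = client.launch_model(
        model_name="qwen2.5-instruct",
        model_engine="vLLM",
        model_format="ggufv2",  # routed to vLLM via the new match_json branch
        quantization="q4_k_m",  # placeholder quantization name
    )
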
@@ -642,7 +701,11 @@ class VLLMModel(LLM):
         return sanitized
 
     @classmethod
-    def match(
+    def check_lib(cls) -> bool:
+        return importlib.util.find_spec("vllm") is not None
+
+    @classmethod
+    def match_json(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if not cls._has_cuda_device():
@@ -755,10 +818,6 @@ class VLLMModel(LLM):
             raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
 
         sanitized_generate_config = self._sanitize_generate_config(generate_config)
-        if self.reasoning_parser:
-            # For reasoning model, the </think> we be split into multiple words,
-            # if `stop` param is passed, so we pop it from config.
-            sanitized_generate_config.pop("stop")
         logger.debug(
             "Enter generate, prompt: %s, generate config: %s", prompt, generate_config
         )
@@ -935,10 +994,10 @@
 
 class VLLMChatModel(VLLMModel, ChatModelMixin):
     @classmethod
-    def match(
+    def match_json(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8"]:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8", "ggufv2"]:
             return False
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):
@@ -954,6 +1013,9 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
             else:
                 if "4" not in quantization:
                     return False
+        if llm_spec.model_format == "ggufv2":
+            if not (VLLM_INSTALLED and vllm.__version__ >= "0.8.2"):
+                return False
         if isinstance(llm_family, CustomLLMFamilyV1):
             if llm_family.model_family not in VLLM_SUPPORTED_CHAT_MODELS:
                 return False
@@ -970,13 +1032,19 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
     ) -> Dict:
         if not generate_config:
             generate_config = {}
-        if not generate_config.get("stop") and self.model_family.stop:
-            generate_config["stop"] = self.model_family.stop.copy()
-        if (
-            not generate_config.get("stop_token_ids")
-            and self.model_family.stop_token_ids
-        ):
-            generate_config["stop_token_ids"] = self.model_family.stop_token_ids.copy()
+        if "reasoning" in getattr(self.model_family, "model_ability", []):
+            generate_config.pop("stop", None)
+            generate_config.pop("stop_token_ids", None)
+        else:
+            if not generate_config.get("stop") and self.model_family.stop:
+                generate_config["stop"] = self.model_family.stop.copy()
+            if (
+                not generate_config.get("stop_token_ids")
+                and self.model_family.stop_token_ids
+            ):
+                generate_config[
+                    "stop_token_ids"
+                ] = self.model_family.stop_token_ids.copy()
         return generate_config
 
     @staticmethod
@@ -988,11 +1056,15 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         chunks: AsyncGenerator[CompletionChunk, None],
     ) -> AsyncGenerator[ChatCompletionChunk, None]:
         i = 0
+        previous_texts = [""]
+        if self.reasoning_parser:
+            chunks = self.reasoning_parser.prepare_reasoning_content(chunks)
         async for chunk in chunks:
             if i == 0:
-                yield self._get_first_chat_completion_chunk(
+                for first_chunk in self._get_first_chat_completion_chunk(
                     chunk, self.reasoning_parser
-                )
+                ):
+                    yield first_chunk
             # usage
             choices = chunk.get("choices")
             if not choices:
@@ -1006,7 +1078,9 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
                     reasoning_parser=self.reasoning_parser,
                 )
             else:
-                yield self._to_chat_completion_chunk(chunk, self.reasoning_parser)
+                yield self._to_chat_completion_chunk(
+                    chunk, self.reasoning_parser, previous_texts
+                )
             i += 1
 
     @vllm_check
@@ -1018,7 +1092,12 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
     ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]:
         tools = generate_config.pop("tools", []) if generate_config else None
         model_family = self.model_family.model_family or self.model_family.model_name
-        full_context_kwargs = {}
+        full_context_kwargs = (
+            self._get_chat_template_kwargs_from_generate_config(
+                generate_config, self.reasoning_parser
+            )
+            or {}
+        )
         if tools:
             if (
                 model_family in QWEN_TOOL_CALL_FAMILY
@@ -1055,7 +1134,7 @@
 
 class VLLMVisionModel(VLLMModel, ChatModelMixin):
     @classmethod
-    def match(
+    def match_json(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if not cls._has_cuda_device():
@@ -1136,7 +1215,12 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
         if "internvl2" not in model_family.lower():
             from qwen_vl_utils import process_vision_info
 
-        full_context_kwargs = {}
+        full_context_kwargs = (
+            self._get_chat_template_kwargs_from_generate_config(
+                generate_config, self.reasoning_parser
+            )
+            or {}
+        )
         if tools and model_family in QWEN_TOOL_CALL_FAMILY:
             full_context_kwargs["tools"] = tools
         assert self.model_family.chat_template is not None
@@ -54,13 +54,14 @@ class WorkerActor(xo.StatelessActor):
         return f"VllmWorker_{rank}"
 
     def execute_method(self, method: Union[str, Callable], *args, **kwargs):
-        logger.debug(
-            "Calling method %s in vllm worker %s, args: %s, kwargs: %s",
-            method,
-            self.uid,
-            args,
-            kwargs,
-        )
+        # NOTE: too many logs, but useful for debug
+        # logger.debug(
+        #     "Calling method %s in vllm worker %s, args: %s, kwargs: %s",
+        #     method,
+        #     self.uid,
+        #     args,
+        #     kwargs,
+        # )
         if isinstance(method, str):
             return getattr(self._worker, method)(*args, **kwargs)
         else:
@@ -24,7 +24,7 @@ from .block import XavierPrefixCachingBlockAllocator
 class XavierCpuGpuBlockAllocator(CpuGpuBlockAllocator):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self._xavier_config: Optional[Dict[str, Any]] = None
+        self._xavier_config: Optional[Dict[str, Any]] = None  # type: ignore
 
     @property
     def xavier_config(self):
@@ -30,7 +30,7 @@ class XavierBlockManager(SelfAttnBlockSpaceManager):
         # Monkey patch
         CpuGpuBlockAllocator.create = XavierCpuGpuBlockAllocator.create
         super().__init__(*args, **kwargs)
-        self._xavier_config: Optional[Dict[str, Any]] = None
+        self._xavier_config: Optional[Dict[str, Any]] = None  # type: ignore
         logger.debug("Init xavier block manager done.")
 
     @property
@@ -25,10 +25,10 @@ class VLLMBlockTracker(xo.StatelessActor):
     def __init__(self):
         super().__init__()
         # engine -> hash -> (rank, block_id)
-        self._hash_to_rank_and_block_id: Dict[int, Dict[int, Set[Tuple[int, int]]]] = {}
+        self._hash_to_rank_and_block_id: Dict[int, Dict[int, Set[Tuple[int, int]]]] = {}  # type: ignore
         # engine -> rank -> (hash, block_id)
-        self._rank_to_hash_and_block_id: Dict[int, Dict[int, Set[Tuple[int, int]]]] = {}
-        self._unavailable_ranks: Set[int] = set()
+        self._rank_to_hash_and_block_id: Dict[int, Dict[int, Set[Tuple[int, int]]]] = {}  # type: ignore
+        self._unavailable_ranks: Set[int] = set()  # type: ignore
 
     def register_blocks(
         self, virtual_engine: int, block_infos: List[Tuple[int, int]], rank: int
@@ -38,7 +38,7 @@ class XavierExecutor(MultiprocessingDistributedExecutor):
         Retrieve the necessary transmission information from the `cache_engine`.
         """
         transfer_ref = await self._get_transfer_ref()
-        ref_cache_engine: CacheEngine = self.driver_worker.cache_engine[0]
+        ref_cache_engine: CacheEngine = self.driver_worker.cache_engine[0]  # type: ignore
         buffer_dtype = ref_cache_engine.dtype
         buffer_device = "cpu"
         buffer_pin_memory = is_pin_memory_available()
@@ -11,8 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import os
-import sys
 
 import pytest
 import xoscar as xo
@@ -30,14 +28,7 @@ class ExtendedBlockTracker(VLLMBlockTracker):
 
 @pytest.fixture
 async def actor_pool_context():
-    start_method = (
-        os.environ.get("POOL_START_METHOD", "forkserver")
-        if sys.platform != "win32"
-        else None
-    )
-    pool = await xo.create_actor_pool(
-        "127.0.0.1", n_process=2, subprocess_start_method=start_method
-    )
+    pool = await xo.create_actor_pool("127.0.0.1", n_process=2)
     async with pool:
        yield pool
 
@@ -46,7 +37,7 @@ async def actor_pool_context():
 async def test_block_tracker(actor_pool_context):
     actor_pool = actor_pool_context
     addr = actor_pool.external_address
-    tracker_ref: xo.ActorRefType[ExtendedBlockTracker] = await xo.create_actor(
+    tracker_ref: xo.ActorRefType[ExtendedBlockTracker] = await xo.create_actor(  # type: ignore
         ExtendedBlockTracker,
         address=addr,
         uid=VLLMBlockTracker.default_uid(),