xinference 1.5.1__py3-none-any.whl → 1.6.0.post1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.

Note: this version of xinference has been flagged as a potentially problematic release.

Files changed (96)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +97 -8
  3. xinference/client/restful/restful_client.py +51 -11
  4. xinference/core/media_interface.py +758 -0
  5. xinference/core/model.py +49 -9
  6. xinference/core/worker.py +31 -37
  7. xinference/deploy/utils.py +0 -3
  8. xinference/model/audio/__init__.py +16 -27
  9. xinference/model/audio/core.py +1 -0
  10. xinference/model/audio/cosyvoice.py +4 -2
  11. xinference/model/audio/model_spec.json +20 -3
  12. xinference/model/audio/model_spec_modelscope.json +18 -1
  13. xinference/model/embedding/__init__.py +16 -24
  14. xinference/model/image/__init__.py +15 -25
  15. xinference/model/llm/__init__.py +37 -110
  16. xinference/model/llm/core.py +15 -6
  17. xinference/model/llm/llama_cpp/core.py +25 -353
  18. xinference/model/llm/llm_family.json +613 -89
  19. xinference/model/llm/llm_family.py +9 -1
  20. xinference/model/llm/llm_family_modelscope.json +540 -90
  21. xinference/model/llm/mlx/core.py +6 -3
  22. xinference/model/llm/reasoning_parser.py +281 -5
  23. xinference/model/llm/sglang/core.py +16 -3
  24. xinference/model/llm/transformers/chatglm.py +2 -2
  25. xinference/model/llm/transformers/cogagent.py +1 -1
  26. xinference/model/llm/transformers/cogvlm2.py +1 -1
  27. xinference/model/llm/transformers/core.py +9 -3
  28. xinference/model/llm/transformers/glm4v.py +1 -1
  29. xinference/model/llm/transformers/minicpmv26.py +1 -1
  30. xinference/model/llm/transformers/qwen-omni.py +6 -0
  31. xinference/model/llm/transformers/qwen_vl.py +1 -1
  32. xinference/model/llm/utils.py +68 -45
  33. xinference/model/llm/vllm/core.py +38 -18
  34. xinference/model/llm/vllm/xavier/test/test_xavier.py +1 -10
  35. xinference/model/rerank/__init__.py +13 -24
  36. xinference/model/video/__init__.py +15 -25
  37. xinference/model/video/core.py +3 -3
  38. xinference/model/video/diffusers.py +133 -16
  39. xinference/model/video/model_spec.json +54 -0
  40. xinference/model/video/model_spec_modelscope.json +56 -0
  41. xinference/thirdparty/cosyvoice/bin/average_model.py +5 -4
  42. xinference/thirdparty/cosyvoice/bin/export_jit.py +50 -20
  43. xinference/thirdparty/cosyvoice/bin/export_onnx.py +136 -51
  44. xinference/thirdparty/cosyvoice/bin/inference.py +15 -5
  45. xinference/thirdparty/cosyvoice/bin/train.py +7 -2
  46. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +72 -52
  47. xinference/thirdparty/cosyvoice/cli/frontend.py +58 -58
  48. xinference/thirdparty/cosyvoice/cli/model.py +140 -155
  49. xinference/thirdparty/cosyvoice/dataset/processor.py +9 -5
  50. xinference/thirdparty/cosyvoice/flow/decoder.py +656 -54
  51. xinference/thirdparty/cosyvoice/flow/flow.py +69 -11
  52. xinference/thirdparty/cosyvoice/flow/flow_matching.py +167 -63
  53. xinference/thirdparty/cosyvoice/flow/length_regulator.py +1 -0
  54. xinference/thirdparty/cosyvoice/hifigan/discriminator.py +91 -1
  55. xinference/thirdparty/cosyvoice/hifigan/f0_predictor.py +4 -1
  56. xinference/thirdparty/cosyvoice/hifigan/generator.py +4 -1
  57. xinference/thirdparty/cosyvoice/hifigan/hifigan.py +2 -2
  58. xinference/thirdparty/cosyvoice/llm/llm.py +198 -18
  59. xinference/thirdparty/cosyvoice/transformer/embedding.py +12 -4
  60. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +124 -21
  61. xinference/thirdparty/cosyvoice/utils/class_utils.py +13 -0
  62. xinference/thirdparty/cosyvoice/utils/common.py +1 -1
  63. xinference/thirdparty/cosyvoice/utils/file_utils.py +40 -2
  64. xinference/thirdparty/cosyvoice/utils/frontend_utils.py +7 -0
  65. xinference/thirdparty/cosyvoice/utils/mask.py +4 -0
  66. xinference/thirdparty/cosyvoice/utils/train_utils.py +5 -1
  67. xinference/thirdparty/matcha/hifigan/xutils.py +3 -3
  68. xinference/types.py +0 -71
  69. xinference/web/ui/build/asset-manifest.json +3 -3
  70. xinference/web/ui/build/index.html +1 -1
  71. xinference/web/ui/build/static/js/main.ae579a97.js +3 -0
  72. xinference/web/ui/build/static/js/main.ae579a97.js.map +1 -0
  73. xinference/web/ui/node_modules/.cache/babel-loader/0196a4b09e3264614e54360d5f832c46b31d964ec58296765ebff191ace6adbf.json +1 -0
  74. xinference/web/ui/node_modules/.cache/babel-loader/12e02ee790dbf57ead09a241a93bb5f893393aa36628ca741d44390e836a103f.json +1 -0
  75. xinference/web/ui/node_modules/.cache/babel-loader/18fa271456b31cded36c05c4c71c6b2b1cf4e4128c1e32f0e45d8b9f21764397.json +1 -0
  76. xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +1 -0
  77. xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +1 -0
  78. xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +1 -0
  79. xinference/web/ui/src/locales/en.json +6 -4
  80. xinference/web/ui/src/locales/zh.json +6 -4
  81. {xinference-1.5.1.dist-info → xinference-1.6.0.post1.dist-info}/METADATA +59 -39
  82. {xinference-1.5.1.dist-info → xinference-1.6.0.post1.dist-info}/RECORD +87 -87
  83. {xinference-1.5.1.dist-info → xinference-1.6.0.post1.dist-info}/WHEEL +1 -1
  84. xinference/core/image_interface.py +0 -377
  85. xinference/thirdparty/cosyvoice/bin/export_trt.sh +0 -9
  86. xinference/web/ui/build/static/js/main.91e77b5c.js +0 -3
  87. xinference/web/ui/build/static/js/main.91e77b5c.js.map +0 -1
  88. xinference/web/ui/node_modules/.cache/babel-loader/0f0adb2283a8f469d097a7a0ebb754624fa52414c83b83696c41f2e6a737ceda.json +0 -1
  89. xinference/web/ui/node_modules/.cache/babel-loader/5e6edb0fb87e3798f142e9abf8dd2dc46bab33a60d31dff525797c0c99887097.json +0 -1
  90. xinference/web/ui/node_modules/.cache/babel-loader/6087820be1bd5c02c42dff797e7df365448ef35ab26dd5d6bd33e967e05cbfd4.json +0 -1
  91. xinference/web/ui/node_modules/.cache/babel-loader/8157db83995c671eb57abc316c337f867d1dc63fb83520bb4ff351fee57dcce2.json +0 -1
  92. xinference/web/ui/node_modules/.cache/babel-loader/f04f666b77b44d7be3e16034d6b0074de2ba9c254f1fae15222b3148608fa8b3.json +0 -1
  93. /xinference/web/ui/build/static/js/{main.91e77b5c.js.LICENSE.txt → main.ae579a97.js.LICENSE.txt} +0 -0
  94. {xinference-1.5.1.dist-info → xinference-1.6.0.post1.dist-info}/entry_points.txt +0 -0
  95. {xinference-1.5.1.dist-info → xinference-1.6.0.post1.dist-info}/licenses/LICENSE +0 -0
  96. {xinference-1.5.1.dist-info → xinference-1.6.0.post1.dist-info}/top_level.txt +0 -0
xinference/model/llm/utils.py

@@ -42,6 +42,7 @@ from ...types import (
  ChatCompletion,
  ChatCompletionChoice,
  ChatCompletionChunk,
+ ChatCompletionChunkChoice,
  ChatCompletionChunkDelta,
  ChatCompletionMessage,
  Completion,
@@ -68,8 +69,11 @@ QWEN_TOOL_CALL_FAMILY = [
  "qwen2-moe-instruct",
  "qwen2.5-instruct",
  "qwen2.5-coder-instruct",
+ "XiYanSQL-QwenCoder-2504",
  "QwQ-32B",
  "qwen3",
+ "HuatuoGPT-o1-Qwen2.5",
+ "DianJin-R1",
  ]

  GLM4_TOOL_CALL_FAMILY = [
@@ -79,6 +83,7 @@ GLM4_TOOL_CALL_FAMILY = [

  LLAMA3_TOOL_CALL_FAMILY = [
  "llama-3.1-instruct",
+ "HuatuoGPT-o1-LLaMA-3.1",
  ]

  DEEPSEEK_TOOL_CALL_FAMILY = [
@@ -160,7 +165,12 @@ class ChatModelMixin:
  @staticmethod
  def _get_chat_template_kwargs_from_generate_config(
  generate_config: Optional[Union[dict, Any]],
+ reasoning_parser: Optional[ReasoningParser] = None,
  ) -> Optional[dict]:
+ if reasoning_parser and not reasoning_parser.enable_thinking:
+ # hybrid model like qwen3,
+ # disabled thinking
+ return {"enable_thinking": False}
  if not generate_config:
  return None
  if "chat_template_kwargs" in generate_config:
@@ -285,7 +295,7 @@ class ChatModelMixin:
  and "delta" in choices[0]
  ):
  if choices[0]["finish_reason"] is None:
- if reasoning_parser is not None:
+ if reasoning_parser and reasoning_parser.check_content_parser():
  # process parsing reasoning content
  assert previous_texts is not None
  delta = choices[0]["delta"] # type: ignore
@@ -302,7 +312,7 @@ class ChatModelMixin:
  delta = choices[0]["delta"] # type: ignore
  if "content" not in delta:
  delta["content"] = "" # type: ignore
- if reasoning_parser is not None:
+ if reasoning_parser and reasoning_parser.check_content_parser():
  delta["reasoning_content"] = None # type: ignore
  # Already a ChatCompletionChunk, we don't need to convert chunk.
  return cast(ChatCompletionChunk, chunk)
@@ -311,7 +321,7 @@ class ChatModelMixin:
  for i, choice in enumerate(choices): # type: ignore
  delta = ChatCompletionChunkDelta()
  if "text" in choice and choice["finish_reason"] is None:
- if reasoning_parser is None:
+ if not reasoning_parser or not reasoning_parser.check_content_parser():
  delta["content"] = choice["text"]
  else:
  assert previous_texts is not None
@@ -324,7 +334,7 @@ class ChatModelMixin:
  previous_texts[-1] = current_text
  elif "text" in choice and choice["finish_reason"] is not None:
  delta["content"] = choice["text"]
- if reasoning_parser is not None:
+ if reasoning_parser and reasoning_parser.check_content_parser():
  delta["reasoning_content"] = None
  elif "tool_calls" in choice:
  delta["tool_calls"] = choice["tool_calls"]
@@ -338,7 +348,9 @@ class ChatModelMixin:
  assert choices is not None
  usage = (
  chunk["usage"]
- if choices[0]["finish_reason"] is not None and reasoning_parser is not None
+ if choices[0]["finish_reason"] is not None
+ and reasoning_parser
+ and reasoning_parser.check_content_parser()
  else None
  )
  chat_chunk = {
@@ -356,28 +368,32 @@ class ChatModelMixin:
  cls,
  chunk: CompletionChunk,
  reasoning_parser: Optional[ReasoningParser] = None,
- ) -> ChatCompletionChunk:
- choices_list = []
+ ) -> List[ChatCompletionChunk]:
+ choices_list: List[ChatCompletionChunkChoice] = []
+ chunks: List[ChatCompletionChunk] = []
  for i, choice in enumerate(chunk["choices"]):
  delta = ChatCompletionChunkDelta(role="assistant", content="")
- if reasoning_parser is not None:
+ if reasoning_parser and reasoning_parser.check_content_parser():
  delta["content"] = None
  delta["reasoning_content"] = ""
  choices_list.append(
- {
- "index": i,
- "delta": delta,
- "finish_reason": None,
- }
+ ChatCompletionChunkChoice(
+ index=i,
+ delta=delta,
+ finish_reason=None,
+ )
  )
- chat_chunk = {
- "id": "chat" + chunk["id"],
- "model": chunk["model"],
- "created": chunk["created"],
- "object": "chat.completion.chunk",
- "choices": choices_list,
- }
- return cast(ChatCompletionChunk, chat_chunk)
+ chat_chunk = ChatCompletionChunk(
+ id="chat" + chunk["id"],
+ model=chunk["model"],
+ created=chunk["created"],
+ object="chat.completion.chunk",
+ choices=choices_list,
+ )
+ chunks.append(chat_chunk)
+ if reasoning_parser:
+ chunks.extend(reasoning_parser.prepare_first_reasoning_content_chunk(chunk))
+ return chunks

  @classmethod
  def _get_final_chat_completion_chunk(
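
_get_first_chat_completion_chunk now returns a list instead of a single chunk, so the parser's prepare_first_reasoning_content_chunk hook can append additional chunks; callers iterate over the result (see the VLLMChatModel hunk further down). For reference, the role-priming chunk the loop above builds for a single choice, when the parser's content parser is active, has this shape (the id, model, and timestamp values here are made up):

    priming_chunk = {
        "id": "chat" + "cmpl-xxxx",            # "chat" prefix as in the hunk
        "model": "qwen3",                      # example model name
        "created": 1700000000,                 # example timestamp
        "object": "chat.completion.chunk",
        "choices": [
            {
                "index": 0,
                "delta": {
                    "role": "assistant",
                    "content": None,           # content withheld for now
                    "reasoning_content": "",   # reasoning streams first
                },
                "finish_reason": None,
            }
        ],
    }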
@@ -402,6 +418,8 @@ class ChatModelMixin:
  reasoning_parse: Optional[ReasoningParser] = None,
  ) -> Iterator[ChatCompletionChunk]:
  previous_texts = [""]
+ if reasoning_parse:
+ chunks = reasoning_parse.prepare_reasoning_content_sync(chunks)
  for _, chunk in enumerate(chunks):
  # usage
  choices = chunk.get("choices")
@@ -449,6 +467,9 @@ class ChatModelMixin:
  reasoning_parser: Optional[ReasoningParser] = None,
  ) -> AsyncGenerator[ChatCompletionChunk, None]:
  previous_texts = [""]
+ # Process chunks
+ if reasoning_parser:
+ chunks = reasoning_parser.prepare_reasoning_content_streaming(chunks)
  async for chunk in chunks:
  choices = chunk.get("choices")
  if not choices:
@@ -464,19 +485,25 @@ class ChatModelMixin:
  def _to_chat_completion(
  completion: Completion, reasoning_parser: Optional[ReasoningParser] = None
  ) -> ChatCompletion:
+ # prepare reasoning content
+ if reasoning_parser:
+ completion = reasoning_parser.prepare_reasoning_content(completion)
+
  if completion.get("object") == "chat.completion" and completion.get("choices"):
  # Already a ChatCompletion
- if reasoning_parser is not None:
- for choice in completion["choices"]:
- message = choice["message"] # type: ignore
- text = message["content"]
+ for choice in completion["choices"]:
+ message = choice["message"] # type: ignore
+ text = message["content"] # Original content from the message
+
+ if reasoning_parser and reasoning_parser.check_content_parser():
+ # Parse into reasoning and content parts
  (
- reasoning_content,
- content,
+ reasoning_val,
+ content_val,
  ) = reasoning_parser.extract_reasoning_content(text)
- message["content"] = content
- if reasoning_content is not None:
- message["reasoning_content"] = reasoning_content
+ message["content"] = content_val
+ if reasoning_val is not None:
+ message["reasoning_content"] = reasoning_val
  return cast(ChatCompletion, completion)

  choices = []
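
_to_chat_completion now normalizes every choice and only splits reasoning out of the message when the parser's content parser is active. The real extraction lives in reasoning_parser.py (heavily extended in this release, +281 lines per the file list); a toy stand-in for extract_reasoning_content, assuming <think>...</think> delimiters, illustrates the contract:

    from typing import Optional, Tuple

    def split_reasoning(text: str) -> Tuple[Optional[str], str]:
        # Toy substitute for ReasoningParser.extract_reasoning_content:
        # returns (reasoning_content, content); the delimiters are an
        # assumption for illustration.
        start, end = "<think>", "</think>"
        if start in text and end in text:
            head, _, rest = text.partition(start)
            reasoning, _, tail = rest.partition(end)
            return reasoning.strip(), (head + tail).strip()
        return None, text

    assert split_reasoning("<think>2 + 2 = 4</think>The answer is 4.") == (
        "2 + 2 = 4",
        "The answer is 4.",
    )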
@@ -484,7 +511,7 @@ class ChatModelMixin:
  content = choice["text"]
  reasoning_content = None

- if reasoning_parser is not None:
+ if reasoning_parser and reasoning_parser.check_content_parser():
  reasoning_content, content = reasoning_parser.extract_reasoning_content( # type: ignore
  choice
  )
@@ -681,20 +708,12 @@ class ChatModelMixin:
  failed_contents.append(content)
  finish_reason = "tool_calls" if tool_calls else "stop"

- reasoning_content = None
  content = ". ".join(failed_contents) if failed_contents else None
- if reasoning_parser is not None:
- reasoning_content, content = reasoning_parser.extract_reasoning_content( # type: ignore
- content
- )
  d = {
  "role": "assistant",
  "content": content,
  "tool_calls": tool_calls,
  }
- # add only reasoning_content is None
- if reasoning_content is not None:
- d["reasoning_content"] = reasoning_content

  try:
  usage = c.get("usage")
@@ -729,7 +748,17 @@ class ChatModelMixin:
  c,
  reasoning_parser: Optional[ReasoningParser] = None,
  ):
+ if reasoning_parser:
+ c = reasoning_parser.prepare_reasoning_content(c)
  _id = str(uuid.uuid4())
+ reasoning_content = None
+ if reasoning_parser and reasoning_parser.check_content_parser():
+ text = c["choices"][0]["text"]
+ reasoning_content, content = reasoning_parser.extract_reasoning_content(
+ text
+ )
+ c["choices"][0]["text"] = content
+
  tool_result = cls._eval_tool_arguments(model_family, c)

  tool_calls = []
@@ -750,12 +779,6 @@ class ChatModelMixin:
  failed_contents.append(content)
  finish_reason = "tool_calls" if tool_calls else "stop"

- reasoning_content = None
- content = ". ".join(failed_contents) if failed_contents else None
- if reasoning_parser is not None:
- reasoning_content, content = reasoning_parser.extract_reasoning_content( # type: ignore
- content
- )
  m = {
  "role": "assistant",
  "content": content,
xinference/model/llm/vllm/core.py

@@ -170,6 +170,7 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.3.0":
  VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-instruct")
  VLLM_SUPPORTED_MODELS.append("qwen2.5-coder")
  VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-coder-instruct")
+ VLLM_SUPPORTED_CHAT_MODELS.append("XiYanSQL-QwenCoder-2504")
  VLLM_SUPPORTED_CHAT_MODELS.append("QwQ-32B-Preview")
  VLLM_SUPPORTED_CHAT_MODELS.append("QwQ-32B")
  VLLM_SUPPORTED_CHAT_MODELS.append("marco-o1")
@@ -177,6 +178,9 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.3.0":
  VLLM_SUPPORTED_CHAT_MODELS.append("fin-r1")
  VLLM_SUPPORTED_CHAT_MODELS.append("seallms-v3")
  VLLM_SUPPORTED_CHAT_MODELS.append("skywork-or1-preview")
+ VLLM_SUPPORTED_CHAT_MODELS.append("skywork-or1")
+ VLLM_SUPPORTED_CHAT_MODELS.append("HuatuoGPT-o1-Qwen2.5")
+ VLLM_SUPPORTED_CHAT_MODELS.append("DianJin-R1")

  if VLLM_INSTALLED and vllm.__version__ >= "0.3.2":
  VLLM_SUPPORTED_CHAT_MODELS.append("gemma-it")
@@ -207,6 +211,7 @@ if VLLM_INSTALLED and vllm.__version__ > "0.5.3":
  VLLM_SUPPORTED_CHAT_MODELS.append("llama-3.1-instruct")
  VLLM_SUPPORTED_CHAT_MODELS.append("llama-3.3-instruct")
  VLLM_SUPPORTED_CHAT_MODELS.append("deepseek-r1-distill-llama")
+ VLLM_SUPPORTED_CHAT_MODELS.append("HuatuoGPT-o1-LLaMA-3.1")

  if VLLM_INSTALLED and vllm.__version__ >= "0.6.1":
  VLLM_SUPPORTED_VISION_MODEL_LIST.append("internvl2")
@@ -347,8 +352,10 @@ class VLLMModel(LLM):
  self._device_count = self._get_cuda_count()
  self._model_config = self._sanitize_model_config(self._model_config)
  reasoning_content = self._model_config.pop("reasoning_content")
-
- self.prepare_parse_reasoning_content(reasoning_content)
+ enable_thinking = self._model_config.pop("enable_thinking", False)
+ self.prepare_parse_reasoning_content(
+ reasoning_content, enable_thinking=enable_thinking
+ )

  if (
  isinstance(self.model_spec, LlamaCppLLMSpecV1)
@@ -811,10 +818,6 @@ class VLLMModel(LLM):
  raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")

  sanitized_generate_config = self._sanitize_generate_config(generate_config)
- if self.reasoning_parser:
- # For reasoning model, the </think> we be split into multiple words,
- # if `stop` param is passed, so we pop it from config.
- sanitized_generate_config.pop("stop")
  logger.debug(
  "Enter generate, prompt: %s, generate config: %s", prompt, generate_config
  )
@@ -1029,13 +1032,19 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
  ) -> Dict:
  if not generate_config:
  generate_config = {}
- if not generate_config.get("stop") and self.model_family.stop:
- generate_config["stop"] = self.model_family.stop.copy()
- if (
- not generate_config.get("stop_token_ids")
- and self.model_family.stop_token_ids
- ):
- generate_config["stop_token_ids"] = self.model_family.stop_token_ids.copy()
+ if "reasoning" in getattr(self.model_family, "model_ability", []):
+ generate_config.pop("stop", None)
+ generate_config.pop("stop_token_ids", None)
+ else:
+ if not generate_config.get("stop") and self.model_family.stop:
+ generate_config["stop"] = self.model_family.stop.copy()
+ if (
+ not generate_config.get("stop_token_ids")
+ and self.model_family.stop_token_ids
+ ):
+ generate_config[
+ "stop_token_ids"
+ ] = self.model_family.stop_token_ids.copy()
  return generate_config

  @staticmethod
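
The stop-word handling moves here from generate(): instead of unconditionally popping `stop` whenever a reasoning parser exists (the deleted block in the previous hunk, whose comment notes that a stop string can cut `</think>` into pieces), _sanitize_chat_config now drops both `stop` and `stop_token_ids` only for families whose model_ability includes "reasoning", and otherwise falls back to the family defaults. A compact sketch of the new rule (model_ability and family_stop stand in for the real model-family fields):

    def sanitize_chat_config(generate_config: dict, model_ability, family_stop):
        # Reasoning models: never honor stop words, so </think> survives.
        if "reasoning" in model_ability:
            generate_config.pop("stop", None)
            generate_config.pop("stop_token_ids", None)
        # Everyone else: fall back to the family's default stop words.
        elif not generate_config.get("stop") and family_stop:
            generate_config["stop"] = list(family_stop)
        return generate_config

    assert "stop" not in sanitize_chat_config(
        {"stop": ["\n\n"]},
        model_ability=["chat", "reasoning"],
        family_stop=["<|im_end|>"],
    )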
@@ -1047,11 +1056,15 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
  chunks: AsyncGenerator[CompletionChunk, None],
  ) -> AsyncGenerator[ChatCompletionChunk, None]:
  i = 0
+ previous_texts = [""]
+ if self.reasoning_parser:
+ chunks = self.reasoning_parser.prepare_reasoning_content(chunks)
  async for chunk in chunks:
  if i == 0:
- yield self._get_first_chat_completion_chunk(
+ for first_chunk in self._get_first_chat_completion_chunk(
  chunk, self.reasoning_parser
- )
+ ):
+ yield first_chunk
  # usage
  choices = chunk.get("choices")
  if not choices:
@@ -1065,7 +1078,9 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
  reasoning_parser=self.reasoning_parser,
  )
  else:
- yield self._to_chat_completion_chunk(chunk, self.reasoning_parser)
+ yield self._to_chat_completion_chunk(
+ chunk, self.reasoning_parser, previous_texts
+ )
  i += 1

  @vllm_check
@@ -1078,7 +1093,10 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
  tools = generate_config.pop("tools", []) if generate_config else None
  model_family = self.model_family.model_family or self.model_family.model_name
  full_context_kwargs = (
- self._get_chat_template_kwargs_from_generate_config(generate_config) or {}
+ self._get_chat_template_kwargs_from_generate_config(
+ generate_config, self.reasoning_parser
+ )
+ or {}
  )
  if tools:
  if (
@@ -1198,7 +1216,9 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
  from qwen_vl_utils import process_vision_info

  full_context_kwargs = (
- self._get_chat_template_kwargs_from_generate_config(generate_config)
+ self._get_chat_template_kwargs_from_generate_config(
+ generate_config, self.reasoning_parser
+ )
  or {}
  )
  if tools and model_family in QWEN_TOOL_CALL_FAMILY:
xinference/model/llm/vllm/xavier/test/test_xavier.py

@@ -11,8 +11,6 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
- import os
- import sys

  import pytest
  import xoscar as xo
@@ -30,14 +28,7 @@ class ExtendedBlockTracker(VLLMBlockTracker):

  @pytest.fixture
  async def actor_pool_context():
- start_method = (
- os.environ.get("POOL_START_METHOD", "forkserver")
- if sys.platform != "win32"
- else None
- )
- pool = await xo.create_actor_pool(
- "127.0.0.1", n_process=2, subprocess_start_method=start_method
- )
+ pool = await xo.create_actor_pool("127.0.0.1", n_process=2)
  async with pool:
  yield pool
xinference/model/rerank/__init__.py

@@ -56,29 +56,8 @@ def register_custom_model():


  def _install():
- _model_spec_json = os.path.join(os.path.dirname(__file__), "model_spec.json")
- _model_spec_modelscope_json = os.path.join(
- os.path.dirname(__file__), "model_spec_modelscope.json"
- )
- BUILTIN_RERANK_MODELS.update(
- dict(
- (spec["model_name"], RerankModelSpec(**spec))
- for spec in json.load(codecs.open(_model_spec_json, "r", encoding="utf-8"))
- )
- )
- for model_name, model_spec in BUILTIN_RERANK_MODELS.items():
- MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
-
- MODELSCOPE_RERANK_MODELS.update(
- dict(
- (spec["model_name"], RerankModelSpec(**spec))
- for spec in json.load(
- codecs.open(_model_spec_modelscope_json, "r", encoding="utf-8")
- )
- )
- )
- for model_name, model_spec in MODELSCOPE_RERANK_MODELS.items():
- MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
+ load_model_family_from_json("model_spec.json", BUILTIN_RERANK_MODELS)
+ load_model_family_from_json("model_spec_modelscope.json", MODELSCOPE_RERANK_MODELS)

  # register model description after recording model revision
  for model_spec_info in [BUILTIN_RERANK_MODELS, MODELSCOPE_RERANK_MODELS]:
@@ -94,5 +73,15 @@ def _install():
  for ud_rerank in get_user_defined_reranks():
  RERANK_MODEL_DESCRIPTIONS.update(generate_rerank_description(ud_rerank))

+
+ def load_model_family_from_json(json_filename, target_families):
+ _model_spec_json = os.path.join(os.path.dirname(__file__), json_filename)
+ target_families.update(
+ dict(
+ (spec["model_name"], RerankModelSpec(**spec))
+ for spec in json.load(codecs.open(_model_spec_json, "r", encoding="utf-8"))
+ )
+ )
+ for model_name, model_spec in target_families.items():
+ MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
  del _model_spec_json
- del _model_spec_modelscope_json
xinference/model/video/__init__.py

@@ -30,29 +30,8 @@ from .core import (


  def _install():
- _model_spec_json = os.path.join(os.path.dirname(__file__), "model_spec.json")
- _model_spec_modelscope_json = os.path.join(
- os.path.dirname(__file__), "model_spec_modelscope.json"
- )
- BUILTIN_VIDEO_MODELS.update(
- dict(
- (spec["model_name"], VideoModelFamilyV1(**spec))
- for spec in json.load(codecs.open(_model_spec_json, "r", encoding="utf-8"))
- )
- )
- for model_name, model_spec in BUILTIN_VIDEO_MODELS.items():
- MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
-
- MODELSCOPE_VIDEO_MODELS.update(
- dict(
- (spec["model_name"], VideoModelFamilyV1(**spec))
- for spec in json.load(
- codecs.open(_model_spec_modelscope_json, "r", encoding="utf-8")
- )
- )
- )
- for model_name, model_spec in MODELSCOPE_VIDEO_MODELS.items():
- MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
+ load_model_family_from_json("model_spec.json", BUILTIN_VIDEO_MODELS)
+ load_model_family_from_json("model_spec_modelscope.json", MODELSCOPE_VIDEO_MODELS)

  # register model description
  for model_name, model_spec in chain(
@@ -60,5 +39,16 @@ def _install():
  ):
  VIDEO_MODEL_DESCRIPTIONS.update(generate_video_description(model_spec))

- del _model_spec_json
- del _model_spec_modelscope_json
+
+ def load_model_family_from_json(json_filename, target_families):
+ json_path = os.path.join(os.path.dirname(__file__), json_filename)
+ target_families.update(
+ dict(
+ (spec["model_name"], VideoModelFamilyV1(**spec))
+ for spec in json.load(codecs.open(json_path, "r", encoding="utf-8"))
+ )
+ )
+ for model_name, model_spec in target_families.items():
+ MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
+
+ del json_path
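
The rerank and video registries each replace their duplicated JSON-loading blocks with a local load_model_family_from_json helper defined inside _install(); judging by the similar +/- counts in the file list, the audio, embedding, image, and llm registries receive the same treatment. A generalized form of the helper, with the spec class and revision map lifted into parameters (the in-tree helpers instead close over RerankModelSpec/VideoModelFamilyV1 and MODEL_NAME_TO_REVISION):

    import codecs
    import json
    import os

    def load_model_family_from_json(json_filename, target_families, spec_cls, revisions):
        # Load a spec file that sits next to this module and index it by name.
        json_path = os.path.join(os.path.dirname(__file__), json_filename)
        target_families.update(
            (spec["model_name"], spec_cls(**spec))
            for spec in json.load(codecs.open(json_path, "r", encoding="utf-8"))
        )
        # Record each model's pinned revision; `revisions` is expected to be
        # a defaultdict(list), like MODEL_NAME_TO_REVISION.
        for model_name, model_spec in target_families.items():
            revisions[model_name].append(model_spec.model_revision)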
xinference/model/video/core.py

@@ -19,7 +19,7 @@ from typing import Any, Dict, List, Literal, Optional, Tuple
  from ...constants import XINFERENCE_CACHE_DIR
  from ..core import CacheableModelSpec, ModelDescription, VirtualEnvSettings
  from ..utils import valid_model_revision
- from .diffusers import DiffUsersVideoModel
+ from .diffusers import DiffusersVideoModel

  logger = logging.getLogger(__name__)

@@ -169,13 +169,13 @@ def create_video_model_instance(
  ] = None,
  model_path: Optional[str] = None,
  **kwargs,
- ) -> Tuple[DiffUsersVideoModel, VideoModelDescription]:
+ ) -> Tuple[DiffusersVideoModel, VideoModelDescription]:
  model_spec = match_diffusion(model_name, download_hub)
  if not model_path:
  model_path = cache(model_spec)
  assert model_path is not None

- model = DiffUsersVideoModel(
+ model = DiffusersVideoModel(
  model_uid,
  model_path,
  model_spec,