xinference 1.7.0__py3-none-any.whl → 1.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic; see the package page for details.

Files changed (83)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +3 -4
  3. xinference/client/__init__.py +2 -0
  4. xinference/client/common.py +49 -2
  5. xinference/client/handlers.py +18 -0
  6. xinference/client/restful/async_restful_client.py +1760 -0
  7. xinference/client/restful/restful_client.py +74 -78
  8. xinference/core/media_interface.py +3 -1
  9. xinference/core/model.py +5 -4
  10. xinference/core/supervisor.py +10 -5
  11. xinference/core/worker.py +15 -14
  12. xinference/deploy/local.py +51 -9
  13. xinference/deploy/worker.py +5 -3
  14. xinference/device_utils.py +22 -3
  15. xinference/model/audio/fish_speech.py +23 -34
  16. xinference/model/audio/model_spec.json +4 -2
  17. xinference/model/audio/model_spec_modelscope.json +4 -2
  18. xinference/model/audio/utils.py +2 -2
  19. xinference/model/core.py +1 -0
  20. xinference/model/embedding/__init__.py +8 -8
  21. xinference/model/embedding/custom.py +6 -1
  22. xinference/model/embedding/embed_family.py +0 -41
  23. xinference/model/embedding/model_spec.json +10 -1
  24. xinference/model/embedding/model_spec_modelscope.json +10 -1
  25. xinference/model/embedding/sentence_transformers/core.py +30 -15
  26. xinference/model/flexible/core.py +1 -1
  27. xinference/model/flexible/launchers/__init__.py +2 -0
  28. xinference/model/flexible/launchers/image_process_launcher.py +1 -1
  29. xinference/model/flexible/launchers/modelscope_launcher.py +47 -0
  30. xinference/model/flexible/launchers/transformers_launcher.py +5 -5
  31. xinference/model/flexible/launchers/yolo_launcher.py +62 -0
  32. xinference/model/llm/__init__.py +7 -0
  33. xinference/model/llm/core.py +18 -1
  34. xinference/model/llm/llama_cpp/core.py +1 -1
  35. xinference/model/llm/llm_family.json +43 -3
  36. xinference/model/llm/llm_family.py +6 -0
  37. xinference/model/llm/llm_family_modelscope.json +45 -3
  38. xinference/model/llm/mlx/core.py +271 -18
  39. xinference/model/llm/mlx/distributed_models/__init__.py +13 -0
  40. xinference/model/llm/mlx/distributed_models/core.py +164 -0
  41. xinference/model/llm/mlx/distributed_models/deepseek_v3.py +75 -0
  42. xinference/model/llm/mlx/distributed_models/qwen2.py +82 -0
  43. xinference/model/llm/mlx/distributed_models/qwen3.py +82 -0
  44. xinference/model/llm/mlx/distributed_models/qwen3_moe.py +76 -0
  45. xinference/model/llm/reasoning_parser.py +12 -6
  46. xinference/model/llm/sglang/core.py +8 -4
  47. xinference/model/llm/transformers/chatglm.py +4 -1
  48. xinference/model/llm/transformers/core.py +4 -2
  49. xinference/model/llm/transformers/multimodal/cogagent.py +10 -4
  50. xinference/model/llm/transformers/multimodal/intern_vl.py +1 -1
  51. xinference/model/llm/utils.py +36 -17
  52. xinference/model/llm/vllm/core.py +142 -34
  53. xinference/model/llm/vllm/distributed_executor.py +96 -21
  54. xinference/model/llm/vllm/xavier/transfer.py +2 -2
  55. xinference/model/rerank/core.py +26 -9
  56. xinference/model/rerank/model_spec.json +3 -3
  57. xinference/model/rerank/model_spec_modelscope.json +3 -3
  58. xinference/web/ui/build/asset-manifest.json +3 -3
  59. xinference/web/ui/build/index.html +1 -1
  60. xinference/web/ui/build/static/js/main.9b12b7f9.js +3 -0
  61. xinference/web/ui/build/static/js/main.9b12b7f9.js.map +1 -0
  62. xinference/web/ui/node_modules/.cache/babel-loader/0fd4820d93f99509e80d8702dc3f6f8272424acab5608fa7c0e82cb1d3250a87.json +1 -0
  63. xinference/web/ui/node_modules/.cache/babel-loader/1460361af6975e63576708039f1cb732faf9c672d97c494d4055fc6331460be0.json +1 -0
  64. xinference/web/ui/node_modules/.cache/babel-loader/4efd8dda58fda83ed9546bf2f587df67f8d98e639117bee2d9326a9a1d9bebb2.json +1 -0
  65. xinference/web/ui/node_modules/.cache/babel-loader/5b2dafe5aa9e1105e0244a2b6751807342fa86aa0144b4e84d947a1686102715.json +1 -0
  66. xinference/web/ui/node_modules/.cache/babel-loader/f75545479c17fdfe2a00235fa4a0e9da1ae95e6b3caafba87ded92de6b0240e4.json +1 -0
  67. xinference/web/ui/src/locales/en.json +3 -0
  68. xinference/web/ui/src/locales/ja.json +3 -0
  69. xinference/web/ui/src/locales/ko.json +3 -0
  70. xinference/web/ui/src/locales/zh.json +3 -0
  71. {xinference-1.7.0.dist-info → xinference-1.7.1.dist-info}/METADATA +4 -3
  72. {xinference-1.7.0.dist-info → xinference-1.7.1.dist-info}/RECORD +77 -67
  73. xinference/web/ui/build/static/js/main.8a9e3ba0.js +0 -3
  74. xinference/web/ui/build/static/js/main.8a9e3ba0.js.map +0 -1
  75. xinference/web/ui/node_modules/.cache/babel-loader/26b8c9f34b0bed789b3a833767672e39302d1e0c09b4276f4d58d1df7b6bd93b.json +0 -1
  76. xinference/web/ui/node_modules/.cache/babel-loader/34cfbfb7836e136ba3261cfd411cc554bf99ba24b35dcceebeaa4f008cb3c9dc.json +0 -1
  77. xinference/web/ui/node_modules/.cache/babel-loader/c5c7c2cd1b863ce41adff2c4737bba06eef3a1acf28288cb83d992060f6b8923.json +0 -1
  78. xinference/web/ui/node_modules/.cache/babel-loader/cc97b49285d7717c63374766c789141a4329a04582ab32756d7e0e614d4c5c7f.json +0 -1
  79. /xinference/web/ui/build/static/js/{main.8a9e3ba0.js.LICENSE.txt → main.9b12b7f9.js.LICENSE.txt} +0 -0
  80. {xinference-1.7.0.dist-info → xinference-1.7.1.dist-info}/WHEEL +0 -0
  81. {xinference-1.7.0.dist-info → xinference-1.7.1.dist-info}/entry_points.txt +0 -0
  82. {xinference-1.7.0.dist-info → xinference-1.7.1.dist-info}/licenses/LICENSE +0 -0
  83. {xinference-1.7.0.dist-info → xinference-1.7.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/mlx/distributed_models/qwen2.py (new file)
@@ -0,0 +1,82 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from typing import Any, Optional
+
+import mlx.core as mx
+import mlx.nn as nn
+from mlx_lm.models.base import create_attention_mask
+from mlx_lm.models.qwen2 import Model as _Model
+from mlx_lm.models.qwen2 import ModelArgs
+from mlx_lm.models.qwen2 import Qwen2Model as _Qwen2Model
+
+from .core import DistributedModelMixin
+
+logger = logging.getLogger(__name__)
+
+
+class Qwen2Model(_Qwen2Model, DistributedModelMixin):
+    def __init__(self, *args, **kwargs):
+        _Qwen2Model.__init__(self, *args, **kwargs)
+        DistributedModelMixin.__init__(self)
+
+    def __call__(
+        self,
+        x: mx.array,
+        mask: Optional[mx.array] = None,
+        cache: Optional[Any] = None,
+        input_embeddings: Optional[mx.array] = None,
+    ) -> mx.array:
+        if input_embeddings is not None:
+            h = input_embeddings
+        else:
+            h = self.embed_tokens(x)
+
+        pipeline_rank = self.rank
+        pipeline_size = self.world_size
+        if mask is None:
+            mask = create_attention_mask(h, cache)
+
+        if cache is None:
+            cache = [None] * self.num_layers
+
+        # Receive from the previous process in the pipeline
+
+        if pipeline_rank < pipeline_size - 1:
+            # wait for previous result
+            h = self._wait_prev_stage_result()
+
+        for i in range(self.num_layers):
+            h = self.layers[self.start_idx + i](h, mask, cache[i])
+        mx.eval(h)
+
+        # Send to the next process in the pipeline
+        if pipeline_rank != 0:
+            self._send_stage_result(h)
+            h = self._get_result()
+        else:
+            self._broadcast_result(h)
+
+        return self.norm(h)
+
+
+class Model(_Model):
+    def __init__(self, args: ModelArgs):
+        nn.Module.__init__(self)
+        self.args = args
+        self.model_type = args.model_type
+        self.model = Qwen2Model(args)
+        if not args.tie_word_embeddings:
+            self.lm_head = nn.Linear(args.hidden_size, args.vocab_size, bias=False)
xinference/model/llm/mlx/distributed_models/qwen3.py (new file)
@@ -0,0 +1,82 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from typing import Any, Optional
+
+import mlx.core as mx
+import mlx.nn as nn
+from mlx_lm.models.base import create_attention_mask
+from mlx_lm.models.qwen3 import Model as _Model
+from mlx_lm.models.qwen3 import ModelArgs
+from mlx_lm.models.qwen3 import Qwen3Model as _Qwen3Model
+
+from .core import DistributedModelMixin
+
+logger = logging.getLogger(__name__)
+
+
+class Qwen3Model(_Qwen3Model, DistributedModelMixin):
+    def __init__(self, *args, **kwargs):
+        _Qwen3Model.__init__(self, *args, **kwargs)
+        DistributedModelMixin.__init__(self)
+
+    def __call__(
+        self,
+        x: mx.array,
+        mask: Optional[mx.array] = None,
+        cache: Optional[Any] = None,
+        input_embeddings: Optional[mx.array] = None,
+    ) -> mx.array:
+        if input_embeddings is not None:
+            h = input_embeddings
+        else:
+            h = self.embed_tokens(x)
+
+        pipeline_rank = self.rank
+        pipeline_size = self.world_size
+        if mask is None:
+            mask = create_attention_mask(h, cache)
+
+        if cache is None:
+            cache = [None] * self.num_layers
+
+        # Receive from the previous process in the pipeline
+
+        if pipeline_rank < pipeline_size - 1:
+            # wait for previous result
+            h = self._wait_prev_stage_result()
+
+        for i in range(self.num_layers):
+            h = self.layers[self.start_idx + i](h, mask, cache[i])
+        mx.eval(h)
+
+        # Send to the next process in the pipeline
+        if pipeline_rank != 0:
+            self._send_stage_result(h)
+            h = self._get_result()
+        else:
+            self._broadcast_result(h)
+
+        return self.norm(h)
+
+
+class Model(_Model):
+    def __init__(self, args: ModelArgs):
+        nn.Module.__init__(self)
+        self.args = args
+        self.model_type = args.model_type
+        self.model = Qwen3Model(args)
+        if not args.tie_word_embeddings:
+            self.lm_head = nn.Linear(args.hidden_size, args.vocab_size, bias=False)
xinference/model/llm/mlx/distributed_models/qwen3_moe.py (new file)
@@ -0,0 +1,76 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+
+import mlx.core as mx
+import mlx.nn as nn
+from mlx_lm.models.base import create_attention_mask
+from mlx_lm.models.qwen3_moe import Model as _Model
+from mlx_lm.models.qwen3_moe import ModelArgs
+from mlx_lm.models.qwen3_moe import Qwen3MoeModel as _Qwen3MoeModel
+
+from .core import DistributedModelMixin
+
+logger = logging.getLogger(__name__)
+
+
+class Qwen3MoeModel(_Qwen3MoeModel, DistributedModelMixin):
+    def __init__(self, *args, **kwargs):
+        _Qwen3MoeModel.__init__(self, *args, **kwargs)
+        DistributedModelMixin.__init__(self)
+
+    def __call__(
+        self,
+        inputs: mx.array,
+        mask: mx.array = None,
+        cache=None,
+    ):
+        h = self.embed_tokens(inputs)
+
+        pipeline_rank = self.rank
+        pipeline_size = self.world_size
+        if mask is None:
+            mask = create_attention_mask(h, cache)
+
+        if cache is None:
+            cache = [None] * self.num_layers
+
+        # Receive from the previous process in the pipeline
+
+        if pipeline_rank < pipeline_size - 1:
+            # wait for previous result
+            h = self._wait_prev_stage_result()
+
+        for i in range(self.num_layers):
+            h = self.layers[self.start_idx + i](h, mask, cache[i])
+        mx.eval(h)
+
+        # Send to the next process in the pipeline
+        if pipeline_rank != 0:
+            self._send_stage_result(h)
+            h = self._get_result()
+        else:
+            self._broadcast_result(h)
+
+        return self.norm(h)
+
+
+class Model(_Model):
+    def __init__(self, args: ModelArgs):
+        nn.Module.__init__(self)
+        self.args = args
+        self.model_type = args.model_type
+        self.model = Qwen3MoeModel(args)
+        self.lm_head = nn.Linear(args.hidden_size, args.vocab_size, bias=False)
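
All three new distributed_models modules follow the same pipeline-parallel scheme: each rank runs only its own slice of decoder layers (self.start_idx through self.start_idx + self.num_layers), ranks below world_size - 1 first block on the hidden states from the preceding stage, ranks other than 0 then send their output along and wait for the final result, and rank 0 broadcasts the final hidden states so every rank returns the same tensor. The mixin they rely on lives in distributed_models/core.py (+164 lines, not included in this excerpt); the sketch below is only a hypothetical reconstruction of the interface these call sites imply, not the actual implementation.

```python
# Hypothetical sketch of the DistributedModelMixin contract implied by the
# call sites above; the real implementation is in distributed_models/core.py.
from abc import ABC, abstractmethod


class DistributedModelMixin(ABC):
    def __init__(self) -> None:
        # Filled in by the launcher when the model is sharded across workers.
        self.rank: int = 0          # this worker's pipeline rank
        self.world_size: int = 1    # number of pipeline stages
        self.start_idx: int = 0     # first decoder layer owned by this rank
        self.num_layers: int = 0    # number of decoder layers owned by this rank

    @abstractmethod
    def _wait_prev_stage_result(self):
        """Block until the preceding stage delivers its hidden states."""

    @abstractmethod
    def _send_stage_result(self, h):
        """Ship this stage's hidden states to the next stage."""

    @abstractmethod
    def _get_result(self):
        """Block until rank 0 broadcasts the final hidden states."""

    @abstractmethod
    def _broadcast_result(self, h):
        """Rank 0 only: broadcast the final hidden states to all ranks."""
```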
xinference/model/llm/reasoning_parser.py
@@ -222,6 +222,12 @@ class ReasoningParser:
             ],
         )
 
+    def is_enable_thinking(self):
+        from .core import chat_context_var
+
+        context = chat_context_var.get({})
+        return context.get("enable_thinking", self.enable_thinking)
+
     async def prepare_reasoning_content_streaming(
         self, chunks: AsyncGenerator[CompletionChunk, None]
     ):
@@ -237,7 +243,7 @@ class ReasoningParser:
 
         # If reasoning_start_tag is not set, or disable thinking for hybrid model like qwen3,
         # yield chunks as is
-        if not self.reasoning_start_tag or not self.enable_thinking:
+        if not self.reasoning_start_tag or not self.is_enable_thinking():
             async for chunk in chunks:
                 yield chunk
             return
@@ -266,7 +272,7 @@ class ReasoningParser:
                     continue
                 assert isinstance(delta, dict)
                 text = delta.get("content")
-                if text is None:
+                if not text:
                     continue
                 # If the first chunk doesn't contain the reasoning_start_tag
                 if self.reasoning_start_tag not in text:
@@ -277,7 +283,7 @@ class ReasoningParser:
             else:
                 # For standard completion chunks
                 text = choices[0].get("text")
-                if text is None:
+                if not text:
                     continue
                 # If the first chunk doesn't contain the reasoning_start_tag
                 if self.reasoning_start_tag not in text:
@@ -304,7 +310,7 @@ class ReasoningParser:
         """
         # If reasoning_start_tag is not set, or disable thinking for hybrid model like qwen3,
         # yield chunks as is
-        if not self.reasoning_start_tag or not self.enable_thinking:
+        if not self.reasoning_start_tag or not self.is_enable_thinking():
            for chunk in chunks:
                yield chunk
            return
@@ -365,7 +371,7 @@ class ReasoningParser:
            completion: The completion object containing model output,
                which can be either a chat completion or a standard completion.
        """
-        if not self.reasoning_start_tag or not self.enable_thinking:
+        if not self.reasoning_start_tag or not self.is_enable_thinking():
            return completion
 
        if completion.get("object") == "chat.completion" and completion.get("choices"):
@@ -399,7 +405,7 @@ class ReasoningParser:
                or an empty list if no modification is needed
        """
        chunks: List[ChatCompletionChunk] = []
-        if not self.reasoning_start_tag or not self.enable_thinking:
+        if not self.reasoning_start_tag or not self.is_enable_thinking():
            return chunks
 
        choices = chunk.get("choices")
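
Throughout this release the per-request chat_template_kwargs are stashed in chat_context_var before the prompt is rendered (see the sglang, chatglm, transformers and cogagent hunks below), and the reasoning parser reads them back through the new is_enable_thinking(), falling back to its constructor-level default. chat_context_var itself is defined in xinference/model/llm/core.py (+18 -1 in this diff, not shown here); assuming it is a plain contextvars.ContextVar, the round trip looks roughly like this sketch (render_chat is a made-up stand-in for the various chat entry points):

```python
# Hedged sketch of the per-request override pattern; chat_context_var is
# presumably a contextvars.ContextVar defined in xinference/model/llm/core.py.
from contextvars import ContextVar
from typing import Any, Dict

chat_context_var: ContextVar[Dict[str, Any]] = ContextVar("chat_context")


def render_chat(generate_config: Dict[str, Any]) -> None:
    # The chat entry points stash the request's chat_template_kwargs...
    chat_template_kwargs = generate_config.get("chat_template_kwargs", {})
    chat_context_var.set(chat_template_kwargs)


def is_enable_thinking(default: bool = True) -> bool:
    # ...and the reasoning parser reads the override back, falling back to
    # the parser-level default when the request did not specify one.
    context = chat_context_var.get({})
    return context.get("enable_thinking", default)


render_chat({"chat_template_kwargs": {"enable_thinking": False}})
assert is_enable_thinking() is False
```

The practical effect is that a hybrid model launched with thinking enabled or disabled can be overridden per request via chat_template_kwargs instead of only at launch time.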
xinference/model/llm/sglang/core.py
@@ -33,6 +33,7 @@ from ....types import (
     CompletionUsage,
 )
 from .. import LLM, LLMFamilyV1, LLMSpecV1
+from ..core import chat_context_var
 from ..llm_family import CustomLLMFamilyV1
 from ..utils import ChatModelMixin, generate_completion_chunk
 
@@ -582,16 +583,17 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin):
         request_id: Optional[str] = None,
     ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]:
         assert self.model_family.chat_template is not None
-        full_context_kwargs = (
+        chat_template_kwargs = (
             self._get_chat_template_kwargs_from_generate_config(
                 generate_config, self.reasoning_parser
             )
             or {}
         )
+        chat_context_var.set(chat_template_kwargs)
+        full_context_kwargs = chat_template_kwargs.copy()
         full_prompt = self.get_full_context(
             messages, self.model_family.chat_template, **full_context_kwargs
         )
-
         generate_config = self._sanitize_chat_config(generate_config)
         stream = generate_config.get("stream", None)
         if stream:
@@ -656,14 +658,16 @@ class SGLANGVisionModel(SGLANGModel, ChatModelMixin):
         chat_template: str = (
             self.model_family.chat_template if self.model_family.chat_template else ""
         )
-
-        full_context_kwargs = (
+        chat_template_kwargs = (
             self._get_chat_template_kwargs_from_generate_config(
                 generate_config, self.reasoning_parser
             )
             or {}
         )
+        chat_context_var.set(chat_template_kwargs)
+        full_context_kwargs = chat_template_kwargs.copy()
         prompt = self.get_full_context(messages, chat_template, **full_context_kwargs)
+
         images, video_inputs = process_vision_info(messages)
         if video_inputs:
             raise ValueError("Not support video input now.")
xinference/model/llm/transformers/chatglm.py
@@ -22,6 +22,7 @@ import torch
 
 from ....core.scheduler import InferenceRequest
 from ....types import ChatCompletion, ChatCompletionChunk, LoRA, PytorchGenerateConfig
+from ..core import chat_context_var
 from ..llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
 from ..utils import (
     GLM4_TOOL_CALL_FAMILY,
@@ -464,12 +465,14 @@ class ChatglmPytorchChatModel(PytorchChatModel):
             tools = list(tools) if tools is not None else None
             tool_choice = r.generate_config.get("tool_choice", "none")
 
-            full_context_kwargs = (
+            chat_template_kwargs = (
                 self._get_chat_template_kwargs_from_generate_config(
                     r.generate_config, self.reasoning_parser
                 )
                 or {}
            )
+            chat_context_var.set(chat_template_kwargs)
+            full_context_kwargs = chat_template_kwargs.copy()
             r.prompt = self._process_messages(
                 r.prompt, tools=tools, tool_choice=tool_choice
             )
xinference/model/llm/transformers/core.py
@@ -37,7 +37,7 @@ from ....types import (
     PytorchModelConfig,
 )
 from ...utils import select_device
-from ..core import LLM
+from ..core import LLM, chat_context_var
 from ..llm_family import LLMFamilyV1, LLMSpecV1
 from ..utils import (
     DEEPSEEK_TOOL_CALL_FAMILY,
@@ -725,12 +725,14 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
 
     def _get_full_prompt(self, messages: List[Dict], tools, generate_config: dict):
         model_family = self.model_family.model_family or self.model_family.model_name
-        full_context_kwargs = (
+        chat_template_kwargs = (
             self._get_chat_template_kwargs_from_generate_config(
                 generate_config, self.reasoning_parser
             )
             or {}
         )
+        chat_context_var.set(chat_template_kwargs)
+        full_context_kwargs = chat_template_kwargs.copy()
         if (
             tools
             and model_family in QWEN_TOOL_CALL_FAMILY
xinference/model/llm/transformers/multimodal/cogagent.py
@@ -20,6 +20,7 @@ from typing import Any, Dict, Iterator, List, Literal, Optional, Tuple, Union
 import torch
 
 from .....model.utils import select_device
+from ...core import chat_context_var
 from ...llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
 from ...utils import _decode_image, parse_messages
 from ..core import register_non_default_model
@@ -33,8 +34,8 @@ logger = logging.getLogger(__name__)
 class CogAgentChatModel(PytorchMultiModalModel):
     def __init__(self, *args, **kws):
         super().__init__(*args, **kws)
-        self._platform: Optional[Literal["Mac", "WIN", "Mobile"]] = "Mac"
-        self._format: Optional[
+        self._platform: Optional[Literal["Mac", "WIN", "Mobile"]] = "Mac"  # type: ignore
+        self._format: Optional[  # type: ignore
             Literal[
                 "(Answer in Action-Operation-Sensitive format.)",
                 "(Answer in Status-Plan-Action-Operation format.)",
@@ -187,9 +188,14 @@ class CogAgentChatModel(PytorchMultiModalModel):
             "return_tensors": "pt",
             "return_dict": True,
         }
-        full_context_kwargs.update(
-            self._get_chat_template_kwargs_from_generate_config(generate_config, self.reasoning_parser) or {}  # type: ignore
+        chat_template_kwargs = (
+            self._get_chat_template_kwargs_from_generate_config(
+                generate_config, self.reasoning_parser
+            )
+            or {}
         )
+        chat_context_var.set(chat_template_kwargs)
+        full_context_kwargs.update(chat_template_kwargs)
         assert self.model_family.chat_template is not None
         inputs = self.get_full_context(
             [{"role": "user", "image": image, "content": query}],
xinference/model/llm/transformers/multimodal/intern_vl.py
@@ -83,7 +83,7 @@ class InternVLChatModel(PytorchMultiModalModel):
     def load_multimodal_model(self):
         from transformers import AutoModel
 
-        kwargs: Dict[str, Any] = {
+        kwargs: Dict[str, Any] = {  # type: ignore
             "torch_dtype": torch.bfloat16,
             "low_cpu_mem_usage": True,
             "trust_remote_code": True,
xinference/model/llm/utils.py
@@ -167,13 +167,7 @@ class ChatModelMixin:
         generate_config: Optional[Union[dict, Any]],
         reasoning_parser: Optional[ReasoningParser] = None,
     ) -> Optional[dict]:
-        if reasoning_parser and not reasoning_parser.enable_thinking:
-            # hybrid model like qwen3,
-            # disabled thinking
-            return {"enable_thinking": False}
-        if not generate_config:
-            return None
-        if "chat_template_kwargs" in generate_config:
+        if generate_config and "chat_template_kwargs" in generate_config:
             kwargs = generate_config["chat_template_kwargs"]
             if isinstance(kwargs, str):
                 try:
@@ -190,6 +184,10 @@ class ChatModelMixin:
                         f"`chat_template_kwargs` but be a JSON parsable str "
                         f"or dict, got: {kwargs}"
                     )
+        elif reasoning_parser and not reasoning_parser.enable_thinking:
+            # hybrid model like qwen3,
+            # disabled thinking
+            return {"enable_thinking": False}
         return None
 
     @staticmethod
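
The reordering above changes precedence: in 1.7.0 a reasoning parser with thinking disabled short-circuited and always returned {"enable_thinking": False}, discarding any chat_template_kwargs in the request; in 1.7.1 explicit chat_template_kwargs win and the parser default only applies when the request provides none. A minimal stand-alone sketch of the new ordering, using a stand-in parser object rather than the real ReasoningParser:

```python
# Sketch of the 1.7.1 precedence in _get_chat_template_kwargs_from_generate_config;
# _Parser is a made-up stand-in for a ReasoningParser launched with thinking off.
import json
from typing import Any, Optional


def get_chat_template_kwargs(
    generate_config: Optional[dict], reasoning_parser: Optional[Any] = None
) -> Optional[dict]:
    # Explicit per-request kwargs win...
    if generate_config and "chat_template_kwargs" in generate_config:
        kwargs = generate_config["chat_template_kwargs"]
        return json.loads(kwargs) if isinstance(kwargs, str) else kwargs
    # ...and the parser-level "thinking disabled" default is only a fallback.
    elif reasoning_parser and not reasoning_parser.enable_thinking:
        return {"enable_thinking": False}
    return None


class _Parser:
    enable_thinking = False  # hybrid model launched with thinking disabled


# 1.7.0 returned {"enable_thinking": False} here regardless of the request;
# 1.7.1 lets the request re-enable thinking.
print(get_chat_template_kwargs({"chat_template_kwargs": {"enable_thinking": True}}, _Parser()))
# -> {'enable_thinking': True}
print(get_chat_template_kwargs(None, _Parser()))
# -> {'enable_thinking': False}
```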
@@ -220,7 +218,7 @@ class ChatModelMixin:
         _messages = [x for x in messages]  # copy for not modifying the origin messages
         _messages.append({"role": "assistant", "content": ""})
 
-        if model_family == "internvl2":
+        if "internvl" in model_family.lower():
             system_prompt = (
                 messages[0]["content"] if messages[0]["role"] == "system" else ""
             )
@@ -558,14 +556,24 @@ class ChatModelMixin:
     @classmethod
     def _handle_qwen_tool_result(cls, text: str) -> List[Tuple]:
         text: str = text.strip()  # type: ignore
-        contents: List[str] = text.split(QWEN_TOOL_CALL_SYMBOLS[1])
+
+        def split_into_blocks(text: str) -> list[str]:
+            # Match blocks starting with <think> or <tool_call> and ending with </think> or </tool_call>
+            pattern = r"(<(think|tool_call)>.*?</\2>)"
+            blocks = re.findall(pattern, text, re.DOTALL)
+            return [match[0] for match in blocks]
+
+        contents = split_into_blocks(text)
         results: List[Tuple] = []
         for content in contents:
             content = content.strip()
             if content:
-                pos = content.find(QWEN_TOOL_CALL_SYMBOLS[0])
-                if pos != -1:
-                    content = content[pos + len(QWEN_TOOL_CALL_SYMBOLS[0]) :]
+                pos1 = content.find(QWEN_TOOL_CALL_SYMBOLS[0])
+                if pos1 != -1:
+                    content = content[pos1 + len(QWEN_TOOL_CALL_SYMBOLS[0]) :]
+                pos2 = content.find(QWEN_TOOL_CALL_SYMBOLS[1])
+                if pos2 != -1:
+                    content = content[:pos2]
                 content = content.strip()
                 try:
                     res = json.loads(content)
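
Instead of splitting on the closing tool-call symbol, _handle_qwen_tool_result now extracts complete <think>…</think> and <tool_call>…</tool_call> blocks and trims both symbols before JSON-decoding the payload, so interleaved reasoning text no longer gets parsed as a tool call. A self-contained demonstration of the regex follows; the sample text is invented, and QWEN_TOOL_CALL_SYMBOLS is assumed to be ("<tool_call>", "</tool_call>") as the surrounding code implies:

```python
# Standalone demonstration of the block-splitting logic added above.
import json
import re

QWEN_TOOL_CALL_SYMBOLS = ("<tool_call>", "</tool_call>")  # assumed values


def split_into_blocks(text: str) -> list[str]:
    # Match complete <think>...</think> or <tool_call>...</tool_call> blocks.
    pattern = r"(<(think|tool_call)>.*?</\2>)"
    blocks = re.findall(pattern, text, re.DOTALL)
    return [match[0] for match in blocks]


sample = (
    "<think>The user wants the weather.</think>\n"
    '<tool_call>{"name": "get_weather", "arguments": {"city": "Paris"}}</tool_call>'
)

for block in split_into_blocks(sample):
    inner = block
    pos1 = inner.find(QWEN_TOOL_CALL_SYMBOLS[0])
    if pos1 != -1:
        inner = inner[pos1 + len(QWEN_TOOL_CALL_SYMBOLS[0]):]
    pos2 = inner.find(QWEN_TOOL_CALL_SYMBOLS[1])
    if pos2 != -1:
        inner = inner[:pos2]
    inner = inner.strip()
    try:
        print(json.loads(inner))           # the tool call parses to a dict
    except json.JSONDecodeError:
        print("not a tool call:", inner)   # e.g. the <think> block
```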
@@ -580,8 +588,12 @@ class ChatModelMixin:
         return results
 
     @classmethod
-    def _eval_qwen_chat_arguments(cls, c) -> List[Tuple]:
+    def _eval_qwen_chat_arguments(
+        cls, c, tool_call_text: Optional[str] = None
+    ) -> List[Tuple]:
         text = c["choices"][0]["text"]
+        if tool_call_text:
+            text = tool_call_text
         return cls._handle_qwen_tool_result(text)
 
     @classmethod
@@ -662,12 +674,14 @@ class ChatModelMixin:
         return results
 
     @classmethod
-    def _eval_tool_arguments(cls, model_family, c):
+    def _eval_tool_arguments(
+        cls, model_family, c, tool_call_text: Optional[str] = None
+    ):
         family = model_family.model_family or model_family.model_name
         if family in GLM4_TOOL_CALL_FAMILY:
             result = cls._eval_glm_chat_arguments(c)
         elif family in QWEN_TOOL_CALL_FAMILY:
-            result = cls._eval_qwen_chat_arguments(c)
+            result = cls._eval_qwen_chat_arguments(c, tool_call_text)
         elif family in LLAMA3_TOOL_CALL_FAMILY:
             result = cls._eval_llama3_chat_arguments(c)
         elif family in DEEPSEEK_TOOL_CALL_FAMILY:
@@ -687,15 +701,17 @@ class ChatModelMixin:
         c,
         chunk_id=None,
         reasoning_parser: Optional[ReasoningParser] = None,
+        tool_call_text: Optional[str] = None,
     ):
         _id = chunk_id if chunk_id is not None else str(uuid.uuid4())
-        tool_result = cls._eval_tool_arguments(model_family, c)
+        tool_result = cls._eval_tool_arguments(model_family, c, tool_call_text)
         tool_calls = []
         failed_contents = []
         for content, func, args in tool_result:
             if func:
                 tool_calls.append(
                     {
+                        "index": 0,
                         "id": f"call_{_id}",
                         "type": "function",
                         "function": {
@@ -782,9 +798,12 @@ class ChatModelMixin:
                     }
                 )
             else:
-                failed_contents.append(content)
+                if content:
+                    failed_contents.append(content)
         finish_reason = "tool_calls" if tool_calls else "stop"
 
+        content = ". ".join(failed_contents) if failed_contents else None
+
         # fix: qwen tool_call content field return null
         family = model_family.model_family or model_family.model_name
         if tool_calls and family in QWEN_TOOL_CALL_FAMILY and content is None: