xinference 1.11.0.post1__py3-none-any.whl → 1.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (39)
  1. xinference/__init__.py +8 -0
  2. xinference/_version.py +3 -3
  3. xinference/api/oauth2/utils.py +26 -5
  4. xinference/core/model.py +1 -10
  5. xinference/device_utils.py +11 -1
  6. xinference/model/embedding/model_spec.json +70 -0
  7. xinference/model/image/core.py +20 -10
  8. xinference/model/image/model_spec.json +55 -3
  9. xinference/model/image/ocr/__init__.py +5 -0
  10. xinference/model/image/ocr/deepseek_ocr.py +958 -0
  11. xinference/model/llm/core.py +2 -0
  12. xinference/model/llm/llama_cpp/core.py +2 -0
  13. xinference/model/llm/llm_family.json +319 -6
  14. xinference/model/llm/lmdeploy/core.py +2 -0
  15. xinference/model/llm/sglang/core.py +2 -0
  16. xinference/model/llm/transformers/core.py +2 -0
  17. xinference/model/llm/transformers/multimodal/qwen-omni.py +60 -11
  18. xinference/model/llm/transformers/multimodal/qwen2_vl.py +2 -2
  19. xinference/model/llm/vllm/core.py +2 -0
  20. xinference/model/rerank/model_spec.json +368 -252
  21. xinference/model/rerank/sentence_transformers/core.py +10 -2
  22. xinference/thirdparty/indextts/gpt/transformers_generation_utils.py +71 -5
  23. xinference/thirdparty/indextts/gpt/transformers_gpt2.py +51 -1
  24. xinference/ui/gradio/media_interface.py +469 -4
  25. xinference/ui/gradio/utils/__init__.py +19 -0
  26. xinference/ui/gradio/utils/latex.py +342 -0
  27. xinference/ui/web/ui/build/asset-manifest.json +3 -3
  28. xinference/ui/web/ui/build/index.html +1 -1
  29. xinference/ui/web/ui/build/static/js/{main.e4d9a9e1.js → main.87d6859b.js} +3 -3
  30. xinference/ui/web/ui/build/static/js/{main.e4d9a9e1.js.map → main.87d6859b.js.map} +1 -1
  31. xinference/ui/web/ui/node_modules/.cache/babel-loader/412a6b414a8267c7a349d9beda4593cdf218abf32edaaf339e6a230df40397b8.json +1 -0
  32. {xinference-1.11.0.post1.dist-info → xinference-1.12.0.dist-info}/METADATA +10 -11
  33. {xinference-1.11.0.post1.dist-info → xinference-1.12.0.dist-info}/RECORD +38 -35
  34. xinference/ui/web/ui/node_modules/.cache/babel-loader/bb4e8722d2d41d87f1fce3661bc8937bffe9448e231fc5f0462630849e851592.json +0 -1
  35. /xinference/ui/web/ui/build/static/js/{main.e4d9a9e1.js.LICENSE.txt → main.87d6859b.js.LICENSE.txt} +0 -0
  36. {xinference-1.11.0.post1.dist-info → xinference-1.12.0.dist-info}/WHEEL +0 -0
  37. {xinference-1.11.0.post1.dist-info → xinference-1.12.0.dist-info}/entry_points.txt +0 -0
  38. {xinference-1.11.0.post1.dist-info → xinference-1.12.0.dist-info}/licenses/LICENSE +0 -0
  39. {xinference-1.11.0.post1.dist-info → xinference-1.12.0.dist-info}/top_level.txt +0 -0

xinference/model/llm/core.py
@@ -45,6 +45,8 @@ def get_llm_version_infos():
 
 
 class LLM(abc.ABC):
+    allow_batch = False
+
     def __init__(
         self,
         replica_model_uid: str,
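
The `allow_batch` attribute introduced here (False on the abstract base class, True on the concrete backends later in this diff) reads as a class-level capability switch that callers can inspect before grouping requests. The dispatcher below is a hypothetical sketch of that reading, not xinference code; only the attribute name comes from the diff.

    # Hypothetical sketch: only the `allow_batch` attribute name is taken from the diff above.
    from typing import Any, List

    def dispatch(model: Any, requests: List[dict]) -> List[Any]:
        if getattr(type(model), "allow_batch", False):
            # Backends that declare batch support could receive grouped requests.
            return model.handle_batch(requests)  # hypothetical method
        # Fall back to sequential handling for backends that do not.
        return [model.handle(req) for req in requests]  # hypothetical method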

xinference/model/llm/llama_cpp/core.py
@@ -40,6 +40,8 @@ class _Error:
 
 
 class XllamaCppModel(LLM, ChatModelMixin):
+    allow_batch = True
+
     def __init__(
         self,
         model_uid: str,

xinference/model/llm/llm_family.json
@@ -22085,6 +22085,208 @@
             "model_id": "cpatonn-mirror/Qwen3-VL-30B-A3B-Instruct-AWQ-{quantization}"
           }
         }
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 32,
+        "model_src": {
+          "huggingface": {
+            "quantizations": [
+              "none"
+            ],
+            "model_id": "Qwen/Qwen3-VL-32B-Instruct"
+          },
+          "modelscope": {
+            "quantizations": [
+              "none"
+            ],
+            "model_id": "Qwen/Qwen3-VL-32B-Instruct"
+          }
+        }
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": 32,
+        "model_src": {
+          "huggingface": {
+            "quantizations": [
+              "fp8"
+            ],
+            "model_id": "Qwen/Qwen3-VL-32B-Instruct-FP8"
+          },
+          "modelscope": {
+            "quantizations": [
+              "fp8"
+            ],
+            "model_id": "Qwen/Qwen3-VL-32B-Instruct-FP8"
+          }
+        }
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 32,
+        "model_src": {
+          "huggingface": {
+            "quantizations": [
+              "Int4"
+            ],
+            "model_id": "QuantTrio/Qwen3-VL-32B-Instruct-AWQ"
+          },
+          "modelscope": {
+            "quantizations": [
+              "Int4"
+            ],
+            "model_id": "tclf90/Qwen3-VL-32B-Instruct-AWQ"
+          }
+        }
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 8,
+        "model_src": {
+          "huggingface": {
+            "quantizations": [
+              "none"
+            ],
+            "model_id": "Qwen/Qwen3-VL-8B-Instruct"
+          },
+          "modelscope": {
+            "quantizations": [
+              "none"
+            ],
+            "model_id": "Qwen/Qwen3-VL-8B-Instruct"
+          }
+        }
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": 8,
+        "model_src": {
+          "huggingface": {
+            "quantizations": [
+              "fp8"
+            ],
+            "model_id": "Qwen/Qwen3-VL-8B-Instruct-FP8"
+          },
+          "modelscope": {
+            "quantizations": [
+              "fp8"
+            ],
+            "model_id": "Qwen/Qwen3-VL-8B-Instruct-FP8"
+          }
+        }
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 8,
+        "model_src": {
+          "huggingface": {
+            "quantizations": [
+              "4bit",
+              "8bit"
+            ],
+            "model_id": "cpatonn/Qwen3-VL-8B-Instruct-AWQ-{quantization}"
+          },
+          "modelscope": {
+            "quantizations": [
+              "4bit",
+              "8bit"
+            ],
+            "model_id": "cpatonn-mirror/Qwen3-VL-8B-Instruct-AWQ-{quantization}"
+          }
+        }
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 4,
+        "model_src": {
+          "huggingface": {
+            "quantizations": [
+              "none"
+            ],
+            "model_id": "Qwen/Qwen3-VL-4B-Instruct"
+          },
+          "modelscope": {
+            "quantizations": [
+              "none"
+            ],
+            "model_id": "Qwen/Qwen3-VL-4B-Instruct"
+          }
+        }
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": 4,
+        "model_src": {
+          "huggingface": {
+            "quantizations": [
+              "fp8"
+            ],
+            "model_id": "Qwen/Qwen3-VL-4B-Instruct-FP8"
+          },
+          "modelscope": {
+            "quantizations": [
+              "fp8"
+            ],
+            "model_id": "Qwen/Qwen3-VL-4B-Instruct-FP8"
+          }
+        }
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 4,
+        "model_src": {
+          "huggingface": {
+            "quantizations": [
+              "4bit",
+              "8bit"
+            ],
+            "model_id": "cpatonn/Qwen3-VL-4B-Instruct-AWQ-{quantization}"
+          },
+          "modelscope": {
+            "quantizations": [
+              "4bit",
+              "8bit"
+            ],
+            "model_id": "cpatonn-mirror/Qwen3-VL-4B-Instruct-AWQ-{quantization}"
+          }
+        }
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 2,
+        "model_src": {
+          "huggingface": {
+            "quantizations": [
+              "none"
+            ],
+            "model_id": "Qwen/Qwen3-VL-2B-Instruct"
+          },
+          "modelscope": {
+            "quantizations": [
+              "none"
+            ],
+            "model_id": "Qwen/Qwen3-VL-2B-Instruct"
+          }
+        }
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": 2,
+        "model_src": {
+          "huggingface": {
+            "quantizations": [
+              "fp8"
+            ],
+            "model_id": "Qwen/Qwen3-VL-2B-Instruct-FP8"
+          },
+          "modelscope": {
+            "quantizations": [
+              "fp8"
+            ],
+            "model_id": "Qwen/Qwen3-VL-2B-Instruct-FP8"
+          }
+        }
       }
     ],
  "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {%- if messages[0].content is string %}\n {{- messages[0].content }}\n {%- else %}\n {%- for content in messages[0].content %}\n {%- if 'text' in content %}\n {{- content.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].content is string %}\n {{- messages[0].content }}\n {%- else %}\n {%- for content in messages[0].content %}\n {%- if 'text' in content %}\n {{- content.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set image_count = namespace(value=0) %}\n{%- set video_count = namespace(value=0) %}\n{%- for message in messages %}\n {%- if message.role == \"user\" %}\n {{- '<|im_start|>' + message.role + '\\n' }}\n {%- if message.content is string %}\n {{- message.content }}\n {%- else %}\n {%- for content in message.content %}\n {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}\n {%- set image_count.value = image_count.value + 1 %}\n {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}\n <|vision_start|><|image_pad|><|vision_end|>\n {%- elif content.type == 'video' or 'video' in content %}\n {%- set video_count.value = video_count.value + 1 %}\n {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}\n <|vision_start|><|video_pad|><|vision_end|>\n {%- elif 'text' in content %}\n {{- content.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role + '\\n' }}\n {%- if message.content is string %}\n {{- message.content }}\n {%- else %}\n {%- for content_item in message.content %}\n {%- if 'text' in content_item %}\n {{- content_item.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and message.content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {%- if message.content is string %}\n {{- message.content }}\n {%- else %}\n {%- for content in message.content %}\n {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}\n {%- set image_count.value = image_count.value + 1 %}\n {%- if 
add_vision_id %}Picture {{ image_count.value }}: {% endif -%}\n <|vision_start|><|image_pad|><|vision_end|>\n {%- elif content.type == 'video' or 'video' in content %}\n {%- set video_count.value = video_count.value + 1 %}\n {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}\n <|vision_start|><|video_pad|><|vision_end|>\n {%- elif 'text' in content %}\n {{- content.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
@@ -22579,14 +22781,14 @@
           "huggingface": {
             "quantizations": [
               "4bit",
-              "8bit"
+              "8bit"
             ],
             "model_id": "cpatonn/Qwen3-Omni-30B-A3B-Thinking-AWQ-{quantization}"
           },
           "modelscope": {
             "quantizations": [
               "4bit",
-              "8bit"
+              "8bit"
             ],
             "model_id": "cpatonn-mirror/Qwen3-Omni-30B-A3B-Thinking-AWQ-{quantization}"
           }
@@ -22604,7 +22806,15 @@
     ],
     "reasoning_start_tag": "<think>",
     "reasoning_end_tag": "</think>",
-    "tool_parser":"qwen"
+    "tool_parser": "qwen",
+    "virtualenv": {
+      "packages": [
+        "transformers==4.57.1",
+        "#system_numpy#",
+        "qwen_omni_utils",
+        "soundfile"
+      ]
+    }
   },
   {
     "version": 2,
@@ -22650,14 +22860,14 @@
           "huggingface": {
             "quantizations": [
               "4bit",
-              "8bit"
+              "8bit"
            ],
             "model_id": "cpatonn/Qwen3-Omni-30B-A3B-Instruct-AWQ-{quantization}"
           },
           "modelscope": {
             "quantizations": [
               "4bit",
-              "8bit"
+              "8bit"
             ],
             "model_id": "cpatonn-mirror/Qwen3-Omni-30B-A3B-Instruct-AWQ-{quantization}"
           }
@@ -22673,6 +22883,109 @@
       "<|endoftext|>",
       "<|im_end|>"
     ],
-    "tool_parser":"qwen"
+    "tool_parser": "qwen",
+    "virtualenv": {
+      "packages": [
+        "transformers==4.57.1",
+        "#system_numpy#",
+        "qwen_omni_utils",
+        "soundfile"
+      ]
+    }
+  },
+  {
+    "model_name": "MiniMax-M2",
+    "model_description": "MiniMax-M2, a Mini model built for Max coding & agentic workflows.",
+    "context_length": 196608,
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "tools",
+      "reasoning"
+    ],
+    "model_specs": [
+      {
+        "model_size_in_billions": 230,
+        "activated_size_in_billions": 3,
+        "model_format": "pytorch",
+        "model_src": {
+          "huggingface": {
+            "model_id": "MiniMaxAI/MiniMax-M2",
+            "quantizations": [
+              "none"
+            ]
+          },
+          "modelscope": {
+            "model_id": "MiniMax/MiniMax-M2",
+            "quantizations": [
+              "none"
+            ]
+          }
+        }
+      },
+      {
+        "model_size_in_billions": 230,
+        "activated_size_in_billions": 3,
+        "model_format": "awq",
+        "model_src": {
+          "huggingface": {
+            "model_id": "QuantTrio/MiniMax-M2-AWQ",
+            "quantizations": [
+              "Int4"
+            ]
+          },
+          "modelscope": {
+            "model_id": "tclf90/MiniMax-M2-AWQ",
+            "quantizations": [
+              "Int4"
+            ]
+          }
+        }
+      },
+      {
+        "model_size_in_billions": 230,
+        "activated_size_in_billions": 3,
+        "model_format": "mlx",
+        "model_src": {
+          "huggingface": {
+            "model_id": "mlx-community/MiniMax-M2-{quantization}",
+            "quantizations": [
+              "3bit",
+              "4bit",
+              "5bit",
+              "6bit",
+              "8bit"
+            ]
+          },
+          "modelscope": {
+            "model_id": "mlx-community/MiniMax-M2-{quantization}",
+            "quantizations": [
+              "3bit",
+              "4bit",
+              "5bit",
+              "6bit",
+              "8bit"
+            ]
+          }
+        }
+      }
+    ],
+ "chat_template": "{# ----------‑‑‑ special token variables ‑‑‑---------- #}\n{%- set toolcall_begin_token = '<minimax:tool_call>' -%}\n{%- set toolcall_end_token = '</minimax:tool_call>' -%}\n{#- Tool Rendering Functions ============================================== -#}\n{%- macro render_tool_namespace(namespace_name, tool_list) -%}\n{%- for tool in tool_list -%}\n<tool>{{ tool.function | tojson(ensure_ascii=False) }}</tool>\n{% endfor -%}\n{%- endmacro -%}\n{%- macro visible_text(content) -%}\n {%- if content is string -%}\n {{ content }}\n {%- elif content is iterable and content is not mapping -%}\n {%- for item in content -%}\n {%- if item is mapping and item.type == 'text' -%}\n {{- item.text }}\n {%- elif item is string -%}\n {{- item }}\n {%- endif -%}\n {%- endfor -%}\n {%- else -%}\n {{- content }}\n {%- endif -%}\n{%- endmacro -%}\n{#- System Message Construction ============================================ -#}\n{%- macro build_system_message(system_message) -%}\n {%- if system_message and system_message.content -%}\n {{- visible_text(system_message.content) }}\n {%- else -%}\n {%- if model_identity is not defined -%}\n {%- set model_identity = \"You are a helpful assistant.\" -%}\n {%- endif -%}\n {{- model_identity }}\n {%- endif -%}\n \n {#- Handle current_date -#}\n {%- if system_message and system_message.current_date -%}\n {{- '\\n' ~ 'Current date: ' + system_message.current_date }}\n {%- endif -%}\n {#- Handle current_location -#}\n {%- if system_message and system_message.current_location -%}\n {{- '\\n' ~ 'Current location: ' + system_message.current_location }}\n {%- endif -%}\n{%- endmacro -%}\n{#- Main Template Logic ================================================= -#}\n{#- Extract system message (only first message if it's system) -#}\n{%- set system_message = none -%}\n{%- set conversation_messages = messages -%}\n{%- if messages and messages[0].role == \"system\" -%}\n {%- set system_message = messages[0] -%}\n {%- set conversation_messages = messages[1:] -%}\n{%- endif -%}\n{#- Get the last user message turn, for interleved thinking -#}\n{%- set ns = namespace(last_user_index=-1) %}\n{% for m in conversation_messages %}\n {%- if m.role == 'user' %}\n {% set ns.last_user_index = loop.index0 -%}\n {%- endif %}\n{%- endfor %}\n{#- Render system message -#}\n{{- ']~!b[' ~ ']~b]system' ~ '\\n' }}\n{{- build_system_message(system_message) }}\n{#- Render tools if available -#}\n{%- if tools -%}\n {{- '\\n\\n' ~ '# Tools' ~ '\\n' ~ 'You may call one or more tools to assist with the user query.\\nHere are the tools available in JSONSchema format:' ~ '\\n' }}\n {{- '\\n' ~ '<tools>' ~ '\\n' }}\n {{- render_tool_namespace(\"functions\", tools) }}\n {{- '</tools>' ~ '\\n\\n' }}\n{{- 'When making tool calls, use XML format to invoke tools and pass parameters:' ~ '\\n' }}\n{{- '\\n' ~ toolcall_begin_token }}\n<invoke name=\"tool-name-1\">\n<parameter name=\"param-key-1\">param-value-1</parameter>\n<parameter name=\"param-key-2\">param-value-2</parameter>\n...\n</invoke>\n{{- '\\n' ~ toolcall_end_token }}\n{%- endif -%}\n{{- '[e~[\\n' }}\n\n{#- Render messages -#}\n{%- set last_tool_call = namespace(name=none) -%}\n{%- for message in conversation_messages -%}\n {%- if message.role == 'assistant' -%}\n {#- Only render reasoning_content if no user message follows -#}\n {{- ']~b]ai' ~ '\\n' }}\n\n {%- set reasoning_content = '' %}\n {%- set content = visible_text(message.content) %}\n {%- if message.reasoning_content is string %}\n {%- set reasoning_content = 
message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in content %}\n {%- set reasoning_content = content.split('</think>')[0].strip('\\n').split('<think>')[-1].strip('\\n') %}\n {%- set content = content.split('</think>')[-1].strip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if reasoning_content and loop.index0 > ns.last_user_index -%}\n {{- '<think>' ~ '\\n' ~ reasoning_content ~ '\\n' ~ '</think>' ~ '\\n\\n' }}\n {%- endif -%}\n {%- if content -%}\n {{- content }}\n {%- endif -%}\n {%- if message.tool_calls -%}\n {{- '\\n' ~ toolcall_begin_token ~ '\\n' }}\n\n {%- for tool_call in message.tool_calls -%}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<invoke name=\"' + tool_call.name + '\">' }}\n {% set _args = tool_call.arguments %}\n {%- for k, v in _args.items() %}\n {{- '<parameter name=\"' + k + '\">' }}\n {{- v | tojson(ensure_ascii=False) if v is not string else v }}\n {{- '</parameter>' }}\n {% endfor %}\n {{- '</invoke>' ~ '\\n' }}\n {%- endfor -%}\n \n {{- toolcall_end_token}}\n {%- set last_tool_call.name = message.tool_calls[-1].name -%}\n {%- else -%}\n {%- set last_tool_call.name = none -%}\n {%- endif -%}\n {{- '[e~[' ~ '\\n' }}\n \n {%- elif message.role == 'tool' -%}\n {%- if last_tool_call.name is none -%}\n {{- raise_exception(\"Message has tool role, but there was no previous assistant message with a tool call!\") }}\n {%- endif -%}\n {%- if loop.first or (conversation_messages[loop.index0 - 1].role != 'tool') -%}\n {{- ']~b]tool' }}\n {%- endif -%}\n {%- if message.content is string -%}\n {{- '\\n<response>' }}\n {{- message.content }}\n {{- '</response>' }}\n {%- else -%}\n {%- for tr in message.content -%}\n {{- '\\n<response>' }}\n {{- tr.output if tr.output is defined else (tr.text if tr.type == 'text' and tr.text is defined else tr) }}\n {{- '\\n</response>' }}\n {%- endfor -%}\n {%- endif -%}\n {%- if loop.last or (conversation_messages[loop.index0 + 1].role != 'tool') -%}\n {{- '[e~[\\n' -}}\n {%- endif -%}\n \n {%- elif message.role == 'user' -%}\n {{- ']~b]user' ~ '\\n' }}\n {{- visible_text(message.content) }}\n {{- '[e~[' ~ '\\n' }}\n {%- endif -%}\n{%- endfor -%}\n\n{#- Generation prompt -#}\n{%- if add_generation_prompt -%}\n{{- ']~b]ai' ~ '\\n' ~ '<think>' ~ '\\n' }}\n{%- endif -%}",
+    "stop_token_ids": [
+      200020
+    ],
+    "stop": [
+      "[e~["
+    ],
+    "reasoning_start_tag": "<think>",
+    "reasoning_end_tag": "</think>",
+    "tool_parser": "minimax",
+    "version": 2,
+    "virtualenv": {
+      "packages": []
+    }
   }
 ]
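
For reference, a newly registered entry such as MiniMax-M2 above would typically be started through the client API. The sketch below is illustrative only: the model name, formats, and quantizations come from the JSON above, while the endpoint, engine choice, and the presence of a running server are assumptions.

    # Illustrative sketch; assumes a running Xinference server at the default endpoint.
    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")
    uid = client.launch_model(
        model_name="MiniMax-M2",   # name from the new llm_family.json entry
        model_format="awq",        # or "pytorch" / "mlx", per the specs above
        quantization="Int4",
        model_engine="vllm",       # engine choice is an assumption
    )
    model = client.get_model(uid)
    print(model.chat(messages=[{"role": "user", "content": "Hello"}]))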

xinference/model/llm/lmdeploy/core.py
@@ -73,6 +73,8 @@ class LMDeployGenerateConfig(TypedDict, total=False):
 
 
 class LMDeployModel(LLM):
+    allow_batch = True
+
     def __init__(
         self,
         model_uid: str,

xinference/model/llm/sglang/core.py
@@ -137,6 +137,8 @@ SGLANG_SUPPORTED_VISION_MODEL_LIST = [
 
 
 class SGLANGModel(LLM):
+    allow_batch = True
+
     def __init__(
         self,
         model_uid: str,

xinference/model/llm/transformers/core.py
@@ -91,6 +91,8 @@ def register_non_default_model(*model_names: str):
 
 
 class PytorchModel(LLM):
+    allow_batch = True
+
     def __init__(
         self,
         model_uid: str,

xinference/model/llm/transformers/multimodal/qwen-omni.py
@@ -19,6 +19,8 @@ import uuid
 from threading import Thread
 from typing import Any, Dict, Iterator, List, Optional, Tuple
 
+import torch
+
 from .....types import (
     ChatCompletion,
     ChatCompletionAudio,
@@ -35,12 +37,20 @@ logger = logging.getLogger(__name__)
 
 @register_transformer
 @register_non_default_model("qwen2.5-omni")
-class Qwen2_5OmniChatModel(PytorchMultiModalModel):
+@register_non_default_model("Qwen3-Omni-Thinking")
+@register_non_default_model("Qwen3-Omni-Instruct")
+class QwenOmniChatModel(PytorchMultiModalModel):
     DEFAULT_SYSTEM_PROMPT = (
         "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, "
        "capable of perceiving auditory and visual inputs, as well as generating text and speech."
     )
 
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # 2.5 or 3
+        model_family = self.model_family.model_family or self.model_family.model_name
+        self._omni_version = "2.5" if "2.5" in model_family else "3"
+
     @classmethod
     def match_json(
         cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
@@ -48,7 +58,10 @@ class Qwen2_5OmniChatModel(PytorchMultiModalModel):
         if model_spec.model_format not in ["pytorch", "gptq", "awq", "bnb"]:
             return False
         llm_family = model_family.model_family or model_family.model_name
-        if "qwen2.5-omni".lower() in llm_family.lower():
+        if (
+            "qwen2.5-omni".lower() in llm_family.lower()
+            or "qwen3-omni".lower() in llm_family.lower()
+        ):
             return True
         return False
 
@@ -58,15 +71,25 @@ class Qwen2_5OmniChatModel(PytorchMultiModalModel):
         self._device = device
 
     def load_processor(self):
-        from transformers import Qwen2_5OmniProcessor
+        if self._omni_version == "2.5":
+            from transformers import Qwen2_5OmniProcessor as QwenOminiProcessor
+        else:
+            from transformers import Qwen3OmniMoeProcessor as QwenOminiProcessor
 
-        self._processor = Qwen2_5OmniProcessor.from_pretrained(
+        self._processor = QwenOminiProcessor.from_pretrained(
             self.model_path, trust_remote_code=True
         )
         self._tokenizer = self._processor.tokenizer
 
     def load_multimodal_model(self):
-        from transformers import Qwen2_5OmniForConditionalGeneration
+        if self._omni_version == "2.5":
+            from transformers import (
+                Qwen2_5OmniForConditionalGeneration as QwenOmniForConditionalGeneration,
+            )
+        else:
+            from transformers import (
+                Qwen3OmniMoeForConditionalGeneration as QwenOmniForConditionalGeneration,
+            )
 
         # for multiple GPU, set back to auto to make multiple devices work
         device = "auto" if self._device == "cuda" else self._device
@@ -79,7 +102,7 @@ class Qwen2_5OmniChatModel(PytorchMultiModalModel):
         kwargs = self.apply_bnb_quantization(kwargs)
         logger.debug("Loading model with extra kwargs: %s", kwargs)
 
-        self._model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
+        self._model = QwenOmniForConditionalGeneration.from_pretrained(
             self.model_path,
             torch_dtype="auto",
             device_map=device,
@@ -181,11 +204,37 @@ class Qwen2_5OmniChatModel(PytorchMultiModalModel):
         inputs = self.build_inputs_from_messages(messages, generate_config)  # type: ignore
         use_audio_in_video = generate_config.get("use_audio_in_video", True)
         gen_kwargs = dict(**inputs, **config, use_audio_in_video=use_audio_in_video)
-        generated_ids, audio = self._model.generate(**gen_kwargs)
-        generated_ids_trimmed = [
-            out_ids[len(in_ids) :]
-            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-        ]
+        # === Run model.generate() (handle both (ids, audio) and ids-only cases) ===
+        result = self._model.generate(**gen_kwargs)
+        if isinstance(result, tuple) and len(result) == 2:
+            # Qwen2.5-Omni returns (generated_ids, audio)
+            generated_ids, audio = result
+        else:
+            # Qwen3-Omni returns only generated_ids
+            generated_ids, audio = result, None
+            if hasattr(generated_ids, "sequences"):
+                generated_ids = generated_ids.sequences
+
+        # === Handle text decoding ===
+        input_len = inputs.input_ids.shape[1]
+        # Ensure we have a consistent 2D structure
+        # Normalize to list[list[int]]
+        if isinstance(generated_ids, torch.Tensor):
+            generated_ids = generated_ids.tolist()
+        elif isinstance(generated_ids, list) and all(
+            isinstance(x, int) for x in generated_ids
+        ):
+            # Single sequence as flat list of ints
+            generated_ids = [generated_ids]
+        elif isinstance(generated_ids, list) and all(
+            isinstance(x, list) for x in generated_ids
+        ):
+            pass  # already correct
+        else:
+            raise TypeError(f"Unexpected generated_ids type: {type(generated_ids)}")
+
+        # Remove prompt tokens
+        generated_ids_trimmed = [out_ids[input_len:] for out_ids in generated_ids]
         output_text = self._processor.batch_decode(
             generated_ids_trimmed,
             skip_special_tokens=True,
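
The rewritten generate path above still honors the `use_audio_in_video` switch taken from `generate_config`. A minimal client-side sketch of passing it through is shown below; the model name follows the decorator strings above, and the endpoint, engine choice, and exact built-in family names are assumptions to verify against the registration list.

    # Illustrative sketch; "Qwen3-Omni-Instruct" mirrors the decorator names above,
    # the endpoint and a running server are assumptions.
    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")
    uid = client.launch_model(model_name="Qwen3-Omni-Instruct", model_engine="transformers")
    model = client.get_model(uid)
    reply = model.chat(
        messages=[{"role": "user", "content": "Describe this clip."}],
        generate_config={"use_audio_in_video": False, "max_tokens": 256},
    )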

xinference/model/llm/transformers/multimodal/qwen2_vl.py
@@ -102,9 +102,9 @@ class Qwen2VLChatModel(PytorchMultiModalModel):
 
         kwargs = self.apply_bnb_quantization()
         llm_family = self.model_family.model_family or self.model_family.model_name
-        if "qwen2.5" in llm_family:
+        if "qwen2.5" in llm_family.lower():
             model_cls = Qwen2_5_VLForConditionalGeneration
-        elif "qwen3" in llm_family:
+        elif "qwen3" in llm_family.lower():
             model_cls = AutoModelForImageTextToText
         else:
             model_cls = Qwen2VLForConditionalGeneration

xinference/model/llm/vllm/core.py
@@ -302,6 +302,8 @@ if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.11.0"):
 
 
 class VLLMModel(LLM):
+    allow_batch = True
+
     def __init__(
         self,
         model_uid: str,