xinference 1.11.0.post1__py3-none-any.whl → 1.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (39)
  1. xinference/__init__.py +8 -0
  2. xinference/_version.py +3 -3
  3. xinference/api/oauth2/utils.py +26 -5
  4. xinference/core/model.py +1 -10
  5. xinference/device_utils.py +11 -1
  6. xinference/model/embedding/model_spec.json +70 -0
  7. xinference/model/image/core.py +20 -10
  8. xinference/model/image/model_spec.json +55 -3
  9. xinference/model/image/ocr/__init__.py +5 -0
  10. xinference/model/image/ocr/deepseek_ocr.py +958 -0
  11. xinference/model/llm/core.py +2 -0
  12. xinference/model/llm/llama_cpp/core.py +2 -0
  13. xinference/model/llm/llm_family.json +319 -6
  14. xinference/model/llm/lmdeploy/core.py +2 -0
  15. xinference/model/llm/sglang/core.py +2 -0
  16. xinference/model/llm/transformers/core.py +2 -0
  17. xinference/model/llm/transformers/multimodal/qwen-omni.py +60 -11
  18. xinference/model/llm/transformers/multimodal/qwen2_vl.py +2 -2
  19. xinference/model/llm/vllm/core.py +2 -0
  20. xinference/model/rerank/model_spec.json +368 -252
  21. xinference/model/rerank/sentence_transformers/core.py +10 -2
  22. xinference/thirdparty/indextts/gpt/transformers_generation_utils.py +71 -5
  23. xinference/thirdparty/indextts/gpt/transformers_gpt2.py +51 -1
  24. xinference/ui/gradio/media_interface.py +469 -4
  25. xinference/ui/gradio/utils/__init__.py +19 -0
  26. xinference/ui/gradio/utils/latex.py +342 -0
  27. xinference/ui/web/ui/build/asset-manifest.json +3 -3
  28. xinference/ui/web/ui/build/index.html +1 -1
  29. xinference/ui/web/ui/build/static/js/{main.e4d9a9e1.js → main.87d6859b.js} +3 -3
  30. xinference/ui/web/ui/build/static/js/{main.e4d9a9e1.js.map → main.87d6859b.js.map} +1 -1
  31. xinference/ui/web/ui/node_modules/.cache/babel-loader/412a6b414a8267c7a349d9beda4593cdf218abf32edaaf339e6a230df40397b8.json +1 -0
  32. {xinference-1.11.0.post1.dist-info → xinference-1.12.0.dist-info}/METADATA +10 -11
  33. {xinference-1.11.0.post1.dist-info → xinference-1.12.0.dist-info}/RECORD +38 -35
  34. xinference/ui/web/ui/node_modules/.cache/babel-loader/bb4e8722d2d41d87f1fce3661bc8937bffe9448e231fc5f0462630849e851592.json +0 -1
  35. /xinference/ui/web/ui/build/static/js/{main.e4d9a9e1.js.LICENSE.txt → main.87d6859b.js.LICENSE.txt} +0 -0
  36. {xinference-1.11.0.post1.dist-info → xinference-1.12.0.dist-info}/WHEEL +0 -0
  37. {xinference-1.11.0.post1.dist-info → xinference-1.12.0.dist-info}/entry_points.txt +0 -0
  38. {xinference-1.11.0.post1.dist-info → xinference-1.12.0.dist-info}/licenses/LICENSE +0 -0
  39. {xinference-1.11.0.post1.dist-info → xinference-1.12.0.dist-info}/top_level.txt +0 -0

xinference/model/llm/core.py
@@ -45,6 +45,8 @@ def get_llm_version_infos():
 
 
 class LLM(abc.ABC):
+    allow_batch = False
+
     def __init__(
         self,
         replica_model_uid: str,
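
The `allow_batch` attribute introduced here (False on the abstract base class, True on the concrete backends later in this diff) reads as a class-level capability switch that callers can inspect before grouping requests. The dispatcher below is a hypothetical sketch of that reading, not xinference code; only the attribute name comes from the diff.

    # Hypothetical sketch: only the `allow_batch` attribute name is taken from the diff above.
    from typing import Any, List

    def dispatch(model: Any, requests: List[dict]) -> List[Any]:
        if getattr(type(model), "allow_batch", False):
            # Backends that declare batch support could receive grouped requests.
            return model.handle_batch(requests)  # hypothetical method
        # Fall back to sequential handling for backends that do not.
        return [model.handle(req) for req in requests]  # hypothetical method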

xinference/model/llm/llama_cpp/core.py
@@ -40,6 +40,8 @@ class _Error:
 
 
 class XllamaCppModel(LLM, ChatModelMixin):
+    allow_batch = True
+
     def __init__(
         self,
         model_uid: str,

xinference/model/llm/llm_family.json
@@ -22085,6 +22085,208 @@
             "model_id": "cpatonn-mirror/Qwen3-VL-30B-A3B-Instruct-AWQ-{quantization}"
           }
         }
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 32,
+        "model_src": {
+          "huggingface": {
+            "quantizations": [
+              "none"
+            ],
+            "model_id": "Qwen/Qwen3-VL-32B-Instruct"
+          },
+          "modelscope": {
+            "quantizations": [
+              "none"
+            ],
+            "model_id": "Qwen/Qwen3-VL-32B-Instruct"
+          }
+        }
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": 32,
+        "model_src": {
+          "huggingface": {
+            "quantizations": [
+              "fp8"
+            ],
+            "model_id": "Qwen/Qwen3-VL-32B-Instruct-FP8"
+          },
+          "modelscope": {
+            "quantizations": [
+              "fp8"
+            ],
+            "model_id": "Qwen/Qwen3-VL-32B-Instruct-FP8"
+          }
+        }
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 32,
+        "model_src": {
+          "huggingface": {
+            "quantizations": [
+              "Int4"
+            ],
+            "model_id": "QuantTrio/Qwen3-VL-32B-Instruct-AWQ"
+          },
+          "modelscope": {
+            "quantizations": [
+              "Int4"
+            ],
+            "model_id": "tclf90/Qwen3-VL-32B-Instruct-AWQ"
+          }
+        }
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 8,
+        "model_src": {
+          "huggingface": {
+            "quantizations": [
+              "none"
+            ],
+            "model_id": "Qwen/Qwen3-VL-8B-Instruct"
+          },
+          "modelscope": {
+            "quantizations": [
+              "none"
+            ],
+            "model_id": "Qwen/Qwen3-VL-8B-Instruct"
+          }
+        }
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": 8,
+        "model_src": {
+          "huggingface": {
+            "quantizations": [
+              "fp8"
+            ],
+            "model_id": "Qwen/Qwen3-VL-8B-Instruct-FP8"
+          },
+          "modelscope": {
+            "quantizations": [
+              "fp8"
+            ],
+            "model_id": "Qwen/Qwen3-VL-8B-Instruct-FP8"
+          }
+        }
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 8,
+        "model_src": {
+          "huggingface": {
+            "quantizations": [
+              "4bit",
+              "8bit"
+            ],
+            "model_id": "cpatonn/Qwen3-VL-8B-Instruct-AWQ-{quantization}"
+          },
+          "modelscope": {
+            "quantizations": [
+              "4bit",
+              "8bit"
+            ],
+            "model_id": "cpatonn-mirror/Qwen3-VL-8B-Instruct-AWQ-{quantization}"
+          }
+        }
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 4,
+        "model_src": {
+          "huggingface": {
+            "quantizations": [
+              "none"
+            ],
+            "model_id": "Qwen/Qwen3-VL-4B-Instruct"
+          },
+          "modelscope": {
+            "quantizations": [
+              "none"
+            ],
+            "model_id": "Qwen/Qwen3-VL-4B-Instruct"
+          }
+        }
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": 4,
+        "model_src": {
+          "huggingface": {
+            "quantizations": [
+              "fp8"
+            ],
+            "model_id": "Qwen/Qwen3-VL-4B-Instruct-FP8"
+          },
+          "modelscope": {
+            "quantizations": [
+              "fp8"
+            ],
+            "model_id": "Qwen/Qwen3-VL-4B-Instruct-FP8"
+          }
+        }
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 4,
+        "model_src": {
+          "huggingface": {
+            "quantizations": [
+              "4bit",
+              "8bit"
+            ],
+            "model_id": "cpatonn/Qwen3-VL-4B-Instruct-AWQ-{quantization}"
+          },
+          "modelscope": {
+            "quantizations": [
+              "4bit",
+              "8bit"
+            ],
+            "model_id": "cpatonn-mirror/Qwen3-VL-4B-Instruct-AWQ-{quantization}"
+          }
+        }
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 2,
+        "model_src": {
+          "huggingface": {
+            "quantizations": [
+              "none"
+            ],
+            "model_id": "Qwen/Qwen3-VL-2B-Instruct"
+          },
+          "modelscope": {
+            "quantizations": [
+              "none"
+            ],
+            "model_id": "Qwen/Qwen3-VL-2B-Instruct"
+          }
+        }
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": 2,
+        "model_src": {
+          "huggingface": {
+            "quantizations": [
+              "fp8"
+            ],
+            "model_id": "Qwen/Qwen3-VL-2B-Instruct-FP8"
+          },
+          "modelscope": {
+            "quantizations": [
+              "fp8"
+            ],
+            "model_id": "Qwen/Qwen3-VL-2B-Instruct-FP8"
+          }
+        }
       }
     ],
  "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {%- if messages[0].content is string %}\n {{- messages[0].content }}\n {%- else %}\n {%- for content in messages[0].content %}\n {%- if 'text' in content %}\n {{- content.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].content is string %}\n {{- messages[0].content }}\n {%- else %}\n {%- for content in messages[0].content %}\n {%- if 'text' in content %}\n {{- content.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set image_count = namespace(value=0) %}\n{%- set video_count = namespace(value=0) %}\n{%- for message in messages %}\n {%- if message.role == \"user\" %}\n {{- '<|im_start|>' + message.role + '\\n' }}\n {%- if message.content is string %}\n {{- message.content }}\n {%- else %}\n {%- for content in message.content %}\n {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}\n {%- set image_count.value = image_count.value + 1 %}\n {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}\n <|vision_start|><|image_pad|><|vision_end|>\n {%- elif content.type == 'video' or 'video' in content %}\n {%- set video_count.value = video_count.value + 1 %}\n {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}\n <|vision_start|><|video_pad|><|vision_end|>\n {%- elif 'text' in content %}\n {{- content.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role + '\\n' }}\n {%- if message.content is string %}\n {{- message.content }}\n {%- else %}\n {%- for content_item in message.content %}\n {%- if 'text' in content_item %}\n {{- content_item.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and message.content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {%- if message.content is string %}\n {{- message.content }}\n {%- else %}\n {%- for content in message.content %}\n {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}\n {%- set image_count.value = image_count.value + 1 %}\n {%- if 
add_vision_id %}Picture {{ image_count.value }}: {% endif -%}\n <|vision_start|><|image_pad|><|vision_end|>\n {%- elif content.type == 'video' or 'video' in content %}\n {%- set video_count.value = video_count.value + 1 %}\n {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}\n <|vision_start|><|video_pad|><|vision_end|>\n {%- elif 'text' in content %}\n {{- content.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
@@ -22579,14 +22781,14 @@
           "huggingface": {
             "quantizations": [
               "4bit",
-              "8bit"
+              "8bit"
             ],
             "model_id": "cpatonn/Qwen3-Omni-30B-A3B-Thinking-AWQ-{quantization}"
           },
           "modelscope": {
             "quantizations": [
               "4bit",
-              "8bit"
+              "8bit"
             ],
             "model_id": "cpatonn-mirror/Qwen3-Omni-30B-A3B-Thinking-AWQ-{quantization}"
           }
@@ -22604,7 +22806,15 @@
     ],
     "reasoning_start_tag": "<think>",
     "reasoning_end_tag": "</think>",
-    "tool_parser":"qwen"
+    "tool_parser": "qwen",
+    "virtualenv": {
+      "packages": [
+        "transformers==4.57.1",
+        "#system_numpy#",
+        "qwen_omni_utils",
+        "soundfile"
+      ]
+    }
   },
   {
     "version": 2,
@@ -22650,14 +22860,14 @@
           "huggingface": {
             "quantizations": [
               "4bit",
-              "8bit"
+              "8bit"
            ],
             "model_id": "cpatonn/Qwen3-Omni-30B-A3B-Instruct-AWQ-{quantization}"
           },
           "modelscope": {
             "quantizations": [
               "4bit",
-              "8bit"
+              "8bit"
             ],
             "model_id": "cpatonn-mirror/Qwen3-Omni-30B-A3B-Instruct-AWQ-{quantization}"
           }
@@ -22673,6 +22883,109 @@
       "<|endoftext|>",
       "<|im_end|>"
     ],
-    "tool_parser":"qwen"
+    "tool_parser": "qwen",
+    "virtualenv": {
+      "packages": [
+        "transformers==4.57.1",
+        "#system_numpy#",
+        "qwen_omni_utils",
+        "soundfile"
+      ]
+    }
+  },
+  {
+    "model_name": "MiniMax-M2",
+    "model_description": "MiniMax-M2, a Mini model built for Max coding & agentic workflows.",
+    "context_length": 196608,
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "tools",
+      "reasoning"
+    ],
+    "model_specs": [
+      {
+        "model_size_in_billions": 230,
+        "activated_size_in_billions": 3,
+        "model_format": "pytorch",
+        "model_src": {
+          "huggingface": {
+            "model_id": "MiniMaxAI/MiniMax-M2",
+            "quantizations": [
+              "none"
+            ]
+          },
+          "modelscope": {
+            "model_id": "MiniMax/MiniMax-M2",
+            "quantizations": [
+              "none"
+            ]
+          }
+        }
+      },
+      {
+        "model_size_in_billions": 230,
+        "activated_size_in_billions": 3,
+        "model_format": "awq",
+        "model_src": {
+          "huggingface": {
+            "model_id": "QuantTrio/MiniMax-M2-AWQ",
+            "quantizations": [
+              "Int4"
+            ]
+          },
+          "modelscope": {
+            "model_id": "tclf90/MiniMax-M2-AWQ",
+            "quantizations": [
+              "Int4"
+            ]
+          }
+        }
+      },
+      {
+        "model_size_in_billions": 230,
+        "activated_size_in_billions": 3,
+        "model_format": "mlx",
+        "model_src": {
+          "huggingface": {
+            "model_id": "mlx-community/MiniMax-M2-{quantization}",
+            "quantizations": [
+              "3bit",
+              "4bit",
+              "5bit",
+              "6bit",
+              "8bit"
+            ]
+          },
+          "modelscope": {
+            "model_id": "mlx-community/MiniMax-M2-{quantization}",
+            "quantizations": [
+              "3bit",
+              "4bit",
+              "5bit",
+              "6bit",
+              "8bit"
+            ]
+          }
+        }
+      }
+    ],
+ "chat_template": "{# ----------‑‑‑ special token variables ‑‑‑---------- #}\n{%- set toolcall_begin_token = '<minimax:tool_call>' -%}\n{%- set toolcall_end_token = '</minimax:tool_call>' -%}\n{#- Tool Rendering Functions ============================================== -#}\n{%- macro render_tool_namespace(namespace_name, tool_list) -%}\n{%- for tool in tool_list -%}\n<tool>{{ tool.function | tojson(ensure_ascii=False) }}</tool>\n{% endfor -%}\n{%- endmacro -%}\n{%- macro visible_text(content) -%}\n {%- if content is string -%}\n {{ content }}\n {%- elif content is iterable and content is not mapping -%}\n {%- for item in content -%}\n {%- if item is mapping and item.type == 'text' -%}\n {{- item.text }}\n {%- elif item is string -%}\n {{- item }}\n {%- endif -%}\n {%- endfor -%}\n {%- else -%}\n {{- content }}\n {%- endif -%}\n{%- endmacro -%}\n{#- System Message Construction ============================================ -#}\n{%- macro build_system_message(system_message) -%}\n {%- if system_message and system_message.content -%}\n {{- visible_text(system_message.content) }}\n {%- else -%}\n {%- if model_identity is not defined -%}\n {%- set model_identity = \"You are a helpful assistant.\" -%}\n {%- endif -%}\n {{- model_identity }}\n {%- endif -%}\n \n {#- Handle current_date -#}\n {%- if system_message and system_message.current_date -%}\n {{- '\\n' ~ 'Current date: ' + system_message.current_date }}\n {%- endif -%}\n {#- Handle current_location -#}\n {%- if system_message and system_message.current_location -%}\n {{- '\\n' ~ 'Current location: ' + system_message.current_location }}\n {%- endif -%}\n{%- endmacro -%}\n{#- Main Template Logic ================================================= -#}\n{#- Extract system message (only first message if it's system) -#}\n{%- set system_message = none -%}\n{%- set conversation_messages = messages -%}\n{%- if messages and messages[0].role == \"system\" -%}\n {%- set system_message = messages[0] -%}\n {%- set conversation_messages = messages[1:] -%}\n{%- endif -%}\n{#- Get the last user message turn, for interleved thinking -#}\n{%- set ns = namespace(last_user_index=-1) %}\n{% for m in conversation_messages %}\n {%- if m.role == 'user' %}\n {% set ns.last_user_index = loop.index0 -%}\n {%- endif %}\n{%- endfor %}\n{#- Render system message -#}\n{{- ']~!b[' ~ ']~b]system' ~ '\\n' }}\n{{- build_system_message(system_message) }}\n{#- Render tools if available -#}\n{%- if tools -%}\n {{- '\\n\\n' ~ '# Tools' ~ '\\n' ~ 'You may call one or more tools to assist with the user query.\\nHere are the tools available in JSONSchema format:' ~ '\\n' }}\n {{- '\\n' ~ '<tools>' ~ '\\n' }}\n {{- render_tool_namespace(\"functions\", tools) }}\n {{- '</tools>' ~ '\\n\\n' }}\n{{- 'When making tool calls, use XML format to invoke tools and pass parameters:' ~ '\\n' }}\n{{- '\\n' ~ toolcall_begin_token }}\n<invoke name=\"tool-name-1\">\n<parameter name=\"param-key-1\">param-value-1</parameter>\n<parameter name=\"param-key-2\">param-value-2</parameter>\n...\n</invoke>\n{{- '\\n' ~ toolcall_end_token }}\n{%- endif -%}\n{{- '[e~[\\n' }}\n\n{#- Render messages -#}\n{%- set last_tool_call = namespace(name=none) -%}\n{%- for message in conversation_messages -%}\n {%- if message.role == 'assistant' -%}\n {#- Only render reasoning_content if no user message follows -#}\n {{- ']~b]ai' ~ '\\n' }}\n\n {%- set reasoning_content = '' %}\n {%- set content = visible_text(message.content) %}\n {%- if message.reasoning_content is string %}\n {%- set reasoning_content = 
message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in content %}\n {%- set reasoning_content = content.split('</think>')[0].strip('\\n').split('<think>')[-1].strip('\\n') %}\n {%- set content = content.split('</think>')[-1].strip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if reasoning_content and loop.index0 > ns.last_user_index -%}\n {{- '<think>' ~ '\\n' ~ reasoning_content ~ '\\n' ~ '</think>' ~ '\\n\\n' }}\n {%- endif -%}\n {%- if content -%}\n {{- content }}\n {%- endif -%}\n {%- if message.tool_calls -%}\n {{- '\\n' ~ toolcall_begin_token ~ '\\n' }}\n\n {%- for tool_call in message.tool_calls -%}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<invoke name=\"' + tool_call.name + '\">' }}\n {% set _args = tool_call.arguments %}\n {%- for k, v in _args.items() %}\n {{- '<parameter name=\"' + k + '\">' }}\n {{- v | tojson(ensure_ascii=False) if v is not string else v }}\n {{- '</parameter>' }}\n {% endfor %}\n {{- '</invoke>' ~ '\\n' }}\n {%- endfor -%}\n \n {{- toolcall_end_token}}\n {%- set last_tool_call.name = message.tool_calls[-1].name -%}\n {%- else -%}\n {%- set last_tool_call.name = none -%}\n {%- endif -%}\n {{- '[e~[' ~ '\\n' }}\n \n {%- elif message.role == 'tool' -%}\n {%- if last_tool_call.name is none -%}\n {{- raise_exception(\"Message has tool role, but there was no previous assistant message with a tool call!\") }}\n {%- endif -%}\n {%- if loop.first or (conversation_messages[loop.index0 - 1].role != 'tool') -%}\n {{- ']~b]tool' }}\n {%- endif -%}\n {%- if message.content is string -%}\n {{- '\\n<response>' }}\n {{- message.content }}\n {{- '</response>' }}\n {%- else -%}\n {%- for tr in message.content -%}\n {{- '\\n<response>' }}\n {{- tr.output if tr.output is defined else (tr.text if tr.type == 'text' and tr.text is defined else tr) }}\n {{- '\\n</response>' }}\n {%- endfor -%}\n {%- endif -%}\n {%- if loop.last or (conversation_messages[loop.index0 + 1].role != 'tool') -%}\n {{- '[e~[\\n' -}}\n {%- endif -%}\n \n {%- elif message.role == 'user' -%}\n {{- ']~b]user' ~ '\\n' }}\n {{- visible_text(message.content) }}\n {{- '[e~[' ~ '\\n' }}\n {%- endif -%}\n{%- endfor -%}\n\n{#- Generation prompt -#}\n{%- if add_generation_prompt -%}\n{{- ']~b]ai' ~ '\\n' ~ '<think>' ~ '\\n' }}\n{%- endif -%}",
+    "stop_token_ids": [
+      200020
+    ],
+    "stop": [
+      "[e~["
+    ],
+    "reasoning_start_tag": "<think>",
+    "reasoning_end_tag": "</think>",
+    "tool_parser": "minimax",
+    "version": 2,
+    "virtualenv": {
+      "packages": []
+    }
   }
 ]
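
For reference, a newly registered entry such as MiniMax-M2 above would typically be started through the client API. The sketch below is illustrative only: the model name, formats, and quantizations come from the JSON above, while the endpoint, engine choice, and the presence of a running server are assumptions.

    # Illustrative sketch; assumes a running Xinference server at the default endpoint.
    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")
    uid = client.launch_model(
        model_name="MiniMax-M2",   # name from the new llm_family.json entry
        model_format="awq",        # or "pytorch" / "mlx", per the specs above
        quantization="Int4",
        model_engine="vllm",       # engine choice is an assumption
    )
    model = client.get_model(uid)
    print(model.chat(messages=[{"role": "user", "content": "Hello"}]))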

xinference/model/llm/lmdeploy/core.py
@@ -73,6 +73,8 @@ class LMDeployGenerateConfig(TypedDict, total=False):
 
 
 class LMDeployModel(LLM):
+    allow_batch = True
+
     def __init__(
         self,
         model_uid: str,

xinference/model/llm/sglang/core.py
@@ -137,6 +137,8 @@ SGLANG_SUPPORTED_VISION_MODEL_LIST = [
 
 
 class SGLANGModel(LLM):
+    allow_batch = True
+
     def __init__(
         self,
         model_uid: str,

xinference/model/llm/transformers/core.py
@@ -91,6 +91,8 @@ def register_non_default_model(*model_names: str):
 
 
 class PytorchModel(LLM):
+    allow_batch = True
+
     def __init__(
         self,
         model_uid: str,

xinference/model/llm/transformers/multimodal/qwen-omni.py
@@ -19,6 +19,8 @@ import uuid
 from threading import Thread
 from typing import Any, Dict, Iterator, List, Optional, Tuple
 
+import torch
+
 from .....types import (
     ChatCompletion,
     ChatCompletionAudio,
@@ -35,12 +37,20 @@ logger = logging.getLogger(__name__)
 
 @register_transformer
 @register_non_default_model("qwen2.5-omni")
-class Qwen2_5OmniChatModel(PytorchMultiModalModel):
+@register_non_default_model("Qwen3-Omni-Thinking")
+@register_non_default_model("Qwen3-Omni-Instruct")
+class QwenOmniChatModel(PytorchMultiModalModel):
     DEFAULT_SYSTEM_PROMPT = (
         "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, "
        "capable of perceiving auditory and visual inputs, as well as generating text and speech."
     )
 
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # 2.5 or 3
+        model_family = self.model_family.model_family or self.model_family.model_name
+        self._omni_version = "2.5" if "2.5" in model_family else "3"
+
     @classmethod
     def match_json(
         cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
@@ -48,7 +58,10 @@ class Qwen2_5OmniChatModel(PytorchMultiModalModel):
         if model_spec.model_format not in ["pytorch", "gptq", "awq", "bnb"]:
             return False
         llm_family = model_family.model_family or model_family.model_name
-        if "qwen2.5-omni".lower() in llm_family.lower():
+        if (
+            "qwen2.5-omni".lower() in llm_family.lower()
+            or "qwen3-omni".lower() in llm_family.lower()
+        ):
             return True
         return False
 
@@ -58,15 +71,25 @@ class Qwen2_5OmniChatModel(PytorchMultiModalModel):
         self._device = device
 
     def load_processor(self):
-        from transformers import Qwen2_5OmniProcessor
+        if self._omni_version == "2.5":
+            from transformers import Qwen2_5OmniProcessor as QwenOminiProcessor
+        else:
+            from transformers import Qwen3OmniMoeProcessor as QwenOminiProcessor
 
-        self._processor = Qwen2_5OmniProcessor.from_pretrained(
+        self._processor = QwenOminiProcessor.from_pretrained(
             self.model_path, trust_remote_code=True
         )
         self._tokenizer = self._processor.tokenizer
 
     def load_multimodal_model(self):
-        from transformers import Qwen2_5OmniForConditionalGeneration
+        if self._omni_version == "2.5":
+            from transformers import (
+                Qwen2_5OmniForConditionalGeneration as QwenOmniForConditionalGeneration,
+            )
+        else:
+            from transformers import (
+                Qwen3OmniMoeForConditionalGeneration as QwenOmniForConditionalGeneration,
+            )
 
         # for multiple GPU, set back to auto to make multiple devices work
         device = "auto" if self._device == "cuda" else self._device
@@ -79,7 +102,7 @@ class Qwen2_5OmniChatModel(PytorchMultiModalModel):
         kwargs = self.apply_bnb_quantization(kwargs)
         logger.debug("Loading model with extra kwargs: %s", kwargs)
 
-        self._model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
+        self._model = QwenOmniForConditionalGeneration.from_pretrained(
             self.model_path,
             torch_dtype="auto",
             device_map=device,
@@ -181,11 +204,37 @@ class Qwen2_5OmniChatModel(PytorchMultiModalModel):
         inputs = self.build_inputs_from_messages(messages, generate_config)  # type: ignore
         use_audio_in_video = generate_config.get("use_audio_in_video", True)
         gen_kwargs = dict(**inputs, **config, use_audio_in_video=use_audio_in_video)
-        generated_ids, audio = self._model.generate(**gen_kwargs)
-        generated_ids_trimmed = [
-            out_ids[len(in_ids) :]
-            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-        ]
+        # === Run model.generate() (handle both (ids, audio) and ids-only cases) ===
+        result = self._model.generate(**gen_kwargs)
+        if isinstance(result, tuple) and len(result) == 2:
+            # Qwen2.5-Omni returns (generated_ids, audio)
+            generated_ids, audio = result
+        else:
+            # Qwen3-Omni returns only generated_ids
+            generated_ids, audio = result, None
+            if hasattr(generated_ids, "sequences"):
+                generated_ids = generated_ids.sequences
+
+        # === Handle text decoding ===
+        input_len = inputs.input_ids.shape[1]
+        # Ensure we have a consistent 2D structure
+        # Normalize to list[list[int]]
+        if isinstance(generated_ids, torch.Tensor):
+            generated_ids = generated_ids.tolist()
+        elif isinstance(generated_ids, list) and all(
+            isinstance(x, int) for x in generated_ids
+        ):
+            # Single sequence as flat list of ints
+            generated_ids = [generated_ids]
+        elif isinstance(generated_ids, list) and all(
+            isinstance(x, list) for x in generated_ids
+        ):
+            pass  # already correct
+        else:
+            raise TypeError(f"Unexpected generated_ids type: {type(generated_ids)}")
+
+        # Remove prompt tokens
+        generated_ids_trimmed = [out_ids[input_len:] for out_ids in generated_ids]
         output_text = self._processor.batch_decode(
             generated_ids_trimmed,
             skip_special_tokens=True,
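
The rewritten generate path above still honors the `use_audio_in_video` switch taken from `generate_config`. A minimal client-side sketch of passing it through is shown below; the model name follows the decorator strings above, and the endpoint, engine choice, and exact built-in family names are assumptions to verify against the registration list.

    # Illustrative sketch; "Qwen3-Omni-Instruct" mirrors the decorator names above,
    # the endpoint and a running server are assumptions.
    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")
    uid = client.launch_model(model_name="Qwen3-Omni-Instruct", model_engine="transformers")
    model = client.get_model(uid)
    reply = model.chat(
        messages=[{"role": "user", "content": "Describe this clip."}],
        generate_config={"use_audio_in_video": False, "max_tokens": 256},
    )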

xinference/model/llm/transformers/multimodal/qwen2_vl.py
@@ -102,9 +102,9 @@ class Qwen2VLChatModel(PytorchMultiModalModel):
 
         kwargs = self.apply_bnb_quantization()
         llm_family = self.model_family.model_family or self.model_family.model_name
-        if "qwen2.5" in llm_family:
+        if "qwen2.5" in llm_family.lower():
             model_cls = Qwen2_5_VLForConditionalGeneration
-        elif "qwen3" in llm_family:
+        elif "qwen3" in llm_family.lower():
             model_cls = AutoModelForImageTextToText
         else:
             model_cls = Qwen2VLForConditionalGeneration

xinference/model/llm/vllm/core.py
@@ -302,6 +302,8 @@ if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.11.0"):
 
 
 class VLLMModel(LLM):
+    allow_batch = True
+
     def __init__(
         self,
         model_uid: str,