xinference-1.3.1-py3-none-any.whl → xinference-1.4.0-py3-none-any.whl

This diff shows the changes between two publicly released versions of the xinference package, as they appear in their public registry, and is provided for informational purposes only.

Potentially problematic release: this version of xinference might be problematic.

Files changed (45)
  1. xinference/_version.py +3 -3
  2. xinference/core/chat_interface.py +39 -24
  3. xinference/model/llm/__init__.py +3 -0
  4. xinference/model/llm/core.py +2 -5
  5. xinference/model/llm/llama_cpp/core.py +52 -16
  6. xinference/model/llm/llm_family.json +364 -21
  7. xinference/model/llm/llm_family_modelscope.json +258 -23
  8. xinference/model/llm/mlx/core.py +15 -11
  9. xinference/model/llm/{reasoning_parsers/deepseek_r1_reasoning_parser.py → reasoning_parser.py} +19 -14
  10. xinference/model/llm/sglang/core.py +2 -0
  11. xinference/model/llm/transformers/core.py +3 -2
  12. xinference/model/llm/transformers/gemma3.py +185 -0
  13. xinference/model/llm/transformers/intern_vl.py +0 -2
  14. xinference/model/llm/utils.py +78 -32
  15. xinference/model/llm/vllm/core.py +10 -3
  16. xinference/types.py +2 -2
  17. xinference/web/ui/build/asset-manifest.json +6 -6
  18. xinference/web/ui/build/index.html +1 -1
  19. xinference/web/ui/build/static/css/main.b494ae7e.css +2 -0
  20. xinference/web/ui/build/static/css/main.b494ae7e.css.map +1 -0
  21. xinference/web/ui/build/static/js/main.3cea968e.js +3 -0
  22. xinference/web/ui/build/static/js/main.3cea968e.js.map +1 -0
  23. xinference/web/ui/node_modules/.cache/babel-loader/7f59e45e3f268ab8a4788b6fb024cf8dab088736dff22f5a3a39c122a83ab930.json +1 -0
  24. xinference/web/ui/node_modules/.cache/babel-loader/cc97b49285d7717c63374766c789141a4329a04582ab32756d7e0e614d4c5c7f.json +1 -0
  25. xinference/web/ui/node_modules/.cache/babel-loader/dcd60488509450bfff37bfff56de2c096d51de17dd00ec60d4db49c8b483ada1.json +1 -0
  26. xinference/web/ui/node_modules/.cache/babel-loader/f199e8173f6409a5802ed44acb95f218388131136504b2e9132129e150c92f9a.json +1 -0
  27. xinference/web/ui/src/locales/en.json +2 -2
  28. xinference/web/ui/src/locales/zh.json +1 -1
  29. {xinference-1.3.1.dist-info → xinference-1.4.0.dist-info}/METADATA +3 -3
  30. {xinference-1.3.1.dist-info → xinference-1.4.0.dist-info}/RECORD +35 -36
  31. xinference/model/llm/reasoning_parsers/__init__.py +0 -13
  32. xinference/model/llm/reasoning_parsers/abs_reasoning_parsers.py +0 -98
  33. xinference/web/ui/build/static/css/main.f8177338.css +0 -2
  34. xinference/web/ui/build/static/css/main.f8177338.css.map +0 -1
  35. xinference/web/ui/build/static/js/main.55b70cb7.js +0 -3
  36. xinference/web/ui/build/static/js/main.55b70cb7.js.map +0 -1
  37. xinference/web/ui/node_modules/.cache/babel-loader/2deac8d5636974533e3714f34e94fc754f9153a07c6ee11e72846cb8eae47e4b.json +0 -1
  38. xinference/web/ui/node_modules/.cache/babel-loader/db16a983bc08a05f0439cc61ca0840e49e1d8400eef678909f16c032a418a3d6.json +0 -1
  39. xinference/web/ui/node_modules/.cache/babel-loader/e23d476fcbf6fd69c8986bf82133d257d28aa8fc9a5cab231d81c1c75c58cd99.json +0 -1
  40. xinference/web/ui/node_modules/.cache/babel-loader/e7a8c37fda8725cab69c7ef8c627060bd7fc806adc67e00fe628ba148cb86d7f.json +0 -1
  41. xinference/web/ui/build/static/js/{main.55b70cb7.js.LICENSE.txt → main.3cea968e.js.LICENSE.txt} +0 -0
  42. {xinference-1.3.1.dist-info → xinference-1.4.0.dist-info}/LICENSE +0 -0
  43. {xinference-1.3.1.dist-info → xinference-1.4.0.dist-info}/WHEEL +0 -0
  44. {xinference-1.3.1.dist-info → xinference-1.4.0.dist-info}/entry_points.txt +0 -0
  45. {xinference-1.3.1.dist-info → xinference-1.4.0.dist-info}/top_level.txt +0 -0
@@ -3738,6 +3738,241 @@
3738
3738
  "<start_of_turn>"
3739
3739
  ]
3740
3740
  },
3741
+ {
3742
+ "version": 1,
3743
+ "context_length": 32768,
3744
+ "model_name": "gemma-3-1b-it",
3745
+ "model_lang": [
3746
+ "en"
3747
+ ],
3748
+ "model_ability": [
3749
+ "chat"
3750
+ ],
3751
+ "model_description": "Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models.",
3752
+ "model_specs": [
3753
+ {
3754
+ "model_format": "pytorch",
3755
+ "model_size_in_billions": 1,
3756
+ "quantizations": [
3757
+ "none",
3758
+ "4-bit",
3759
+ "8-bit"
3760
+ ],
3761
+ "model_id": "LLM-Research/gemma-3-1b-it",
3762
+ "model_hub": "modelscope"
3763
+ },
3764
+ {
3765
+ "model_format": "ggufv2",
3766
+ "model_size_in_billions": 1,
3767
+ "quantizations": [
3768
+ "Q2_K",
3769
+ "Q3_K_L",
3770
+ "Q3_K_M",
3771
+ "Q3_K_S",
3772
+ "Q4_K_L",
3773
+ "Q4_K_M",
3774
+ "Q4_K_S",
3775
+ "Q5_K_L",
3776
+ "Q5_K_M",
3777
+ "Q5_K_S",
3778
+ "Q6_K",
3779
+ "Q6_K_L",
3780
+ "Q8_0",
3781
+ "bf16"
3782
+ ],
3783
+ "model_id": "bartowski/google_gemma-3-1b-it-GGUF",
3784
+ "model_file_name_template": "google_gemma-3-1b-it-{quantization}.gguf",
3785
+ "model_hub": "modelscope"
3786
+ },
3787
+ {
3788
+ "model_format": "mlx",
3789
+ "model_size_in_billions": 1,
3790
+ "quantizations": [
3791
+ "4bit",
3792
+ "6bit",
3793
+ "8bit",
3794
+ "fp16"
3795
+ ],
3796
+ "model_id": "mlx-community/gemma-3-1b-it-{quantization}",
3797
+ "model_hub": "modelscope"
3798
+ }
3799
+ ],
3800
+ "chat_template": "{{ bos_token }}\n{%- if messages[0]['role'] == 'system' -%}\n {%- if messages[0]['content'] is string -%}\n {%- set first_user_prefix = messages[0]['content'] + '\n\n' -%}\n {%- else -%}\n {%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}\n {%- endif -%}\n {%- set loop_messages = messages[1:] -%}\n{%- else -%}\n {%- set first_user_prefix = \"\" -%}\n {%- set loop_messages = messages -%}\n{%- endif -%}\n{%- for message in loop_messages -%}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}\n {{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif -%}\n {%- if (message['role'] == 'assistant') -%}\n {%- set role = \"model\" -%}\n {%- else -%}\n {%- set role = message['role'] -%}\n {%- endif -%}\n {{ '<start_of_turn>' + role + '\n' + (first_user_prefix if loop.first else \"\") }}\n {%- if message['content'] is string -%}\n {{ message['content'] | trim }}\n {%- elif message['content'] is iterable -%}\n {%- for item in message['content'] -%}\n {%- if item['type'] == 'image' -%}\n {{ '<start_of_image>' }}\n {%- elif item['type'] == 'text' -%}\n {{ item['text'] | trim }}\n {%- endif -%}\n {%- endfor -%}\n {%- else -%}\n {{ raise_exception(\"Invalid content type\") }}\n {%- endif -%}\n {{ '<end_of_turn>\n' }}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{'<start_of_turn>model\n'}}\n{%- endif -%}\n",
3801
+ "stop_token_ids": [
3802
+ 1,
3803
+ 106,
3804
+ 107
3805
+ ],
3806
+ "stop": [
3807
+ "<eos>",
3808
+ "<end_of_turn>",
3809
+ "<start_of_turn>"
3810
+ ]
3811
+ },
3812
+ {
3813
+ "version": 1,
3814
+ "context_length": 131072,
3815
+ "model_name": "gemma-3-it",
3816
+ "model_lang": [
3817
+ "en"
3818
+ ],
3819
+ "model_ability": [
3820
+ "chat",
3821
+ "vision"
3822
+ ],
3823
+ "model_description": "Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models.",
3824
+ "model_specs": [
3825
+ {
3826
+ "model_format": "pytorch",
3827
+ "model_size_in_billions": 4,
3828
+ "quantizations": [
3829
+ "none",
3830
+ "4-bit",
3831
+ "8-bit"
3832
+ ],
3833
+ "model_id": "LLM-Research/gemma-3-4b-it",
3834
+ "model_hub": "modelscope"
3835
+ },
3836
+ {
3837
+ "model_format": "pytorch",
3838
+ "model_size_in_billions": 12,
3839
+ "quantizations": [
3840
+ "none",
3841
+ "4-bit",
3842
+ "8-bit"
3843
+ ],
3844
+ "model_id": "LLM-Research/gemma-3-12b-it",
3845
+ "model_hub": "modelscope"
3846
+ },
3847
+ {
3848
+ "model_format": "pytorch",
3849
+ "model_size_in_billions": 27,
3850
+ "quantizations": [
3851
+ "none",
3852
+ "4-bit",
3853
+ "8-bit"
3854
+ ],
3855
+ "model_id": "LLM-Research/gemma-3-27b-it",
3856
+ "model_hub": "modelscope"
3857
+ },
3858
+ {
3859
+ "model_format": "ggufv2",
3860
+ "model_size_in_billions": 4,
3861
+ "quantizations": [
3862
+ "Q2_K",
3863
+ "Q3_K_L",
3864
+ "Q3_K_M",
3865
+ "Q3_K_S",
3866
+ "Q4_K_L",
3867
+ "Q4_K_M",
3868
+ "Q4_K_S",
3869
+ "Q5_K_L",
3870
+ "Q5_K_M",
3871
+ "Q5_K_S",
3872
+ "Q6_K",
3873
+ "Q6_K_L",
3874
+ "Q8_0",
3875
+ "bf16"
3876
+ ],
3877
+ "model_id": "bartowski/google_gemma-3-4b-it-GGUF",
3878
+ "model_file_name_template": "google_gemma-3-4b-it-{quantization}.gguf",
3879
+ "model_hub": "modelscope"
3880
+ },
3881
+ {
3882
+ "model_format": "ggufv2",
3883
+ "model_size_in_billions": 12,
3884
+ "quantizations": [
3885
+ "Q2_K",
3886
+ "Q3_K_L",
3887
+ "Q3_K_M",
3888
+ "Q3_K_S",
3889
+ "Q4_K_L",
3890
+ "Q4_K_M",
3891
+ "Q4_K_S",
3892
+ "Q5_K_L",
3893
+ "Q5_K_M",
3894
+ "Q5_K_S",
3895
+ "Q6_K",
3896
+ "Q6_K_L",
3897
+ "Q8_0",
3898
+ "bf16"
3899
+ ],
3900
+ "model_id": "bartowski/google_gemma-3-12b-it-GGUF",
3901
+ "model_file_name_template": "google_gemma-3-12b-it-{quantization}.gguf",
3902
+ "model_hub": "modelscope"
3903
+ },
3904
+ {
3905
+ "model_format": "ggufv2",
3906
+ "model_size_in_billions": 27,
3907
+ "quantizations": [
3908
+ "Q2_K",
3909
+ "Q3_K_L",
3910
+ "Q3_K_M",
3911
+ "Q3_K_S",
3912
+ "Q4_K_L",
3913
+ "Q4_K_M",
3914
+ "Q4_K_S",
3915
+ "Q5_K_L",
3916
+ "Q5_K_M",
3917
+ "Q5_K_S",
3918
+ "Q6_K",
3919
+ "Q6_K_L",
3920
+ "Q8_0",
3921
+ "bf16"
3922
+ ],
3923
+ "model_id": "bartowski/google_gemma-3-27b-it-GGUF",
3924
+ "model_file_name_template": "google_gemma-3-27b-it-{quantization}.gguf",
3925
+ "model_hub": "modelscope"
3926
+ },
3927
+ {
3928
+ "model_format": "mlx",
3929
+ "model_size_in_billions": 4,
3930
+ "quantizations": [
3931
+ "4bit",
3932
+ "6bit",
3933
+ "8bit",
3934
+ "fp16"
3935
+ ],
3936
+ "model_id": "mlx-community/gemma-3-4b-it-{quantization}",
3937
+ "model_hub": "modelscope"
3938
+ },
3939
+ {
3940
+ "model_format": "mlx",
3941
+ "model_size_in_billions": 12,
3942
+ "quantizations": [
3943
+ "4bit",
3944
+ "6bit",
3945
+ "8bit",
3946
+ "fp16"
3947
+ ],
3948
+ "model_id": "mlx-community/gemma-3-12b-it-{quantization}",
3949
+ "model_hub": "modelscope"
3950
+ },
3951
+ {
3952
+ "model_format": "mlx",
3953
+ "model_size_in_billions": 27,
3954
+ "quantizations": [
3955
+ "4bit",
3956
+ "6bit",
3957
+ "8bit",
3958
+ "fp16"
3959
+ ],
3960
+ "model_id": "mlx-community/gemma-3-27b-it-{quantization}",
3961
+ "model_hub": "modelscope"
3962
+ }
3963
+ ],
3964
+ "chat_template": "{{ bos_token }}\n{%- if messages[0]['role'] == 'system' -%}\n {%- if messages[0]['content'] is string -%}\n {%- set first_user_prefix = messages[0]['content'] + '\n\n' -%}\n {%- else -%}\n {%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}\n {%- endif -%}\n {%- set loop_messages = messages[1:] -%}\n{%- else -%}\n {%- set first_user_prefix = \"\" -%}\n {%- set loop_messages = messages -%}\n{%- endif -%}\n{%- for message in loop_messages -%}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}\n {{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif -%}\n {%- if (message['role'] == 'assistant') -%}\n {%- set role = \"model\" -%}\n {%- else -%}\n {%- set role = message['role'] -%}\n {%- endif -%}\n {{ '<start_of_turn>' + role + '\n' + (first_user_prefix if loop.first else \"\") }}\n {%- if message['content'] is string -%}\n {{ message['content'] | trim }}\n {%- elif message['content'] is iterable -%}\n {%- for item in message['content'] -%}\n {%- if item['type'] == 'image' -%}\n {{ '<start_of_image>' }}\n {%- elif item['type'] == 'text' -%}\n {{ item['text'] | trim }}\n {%- endif -%}\n {%- endfor -%}\n {%- else -%}\n {{ raise_exception(\"Invalid content type\") }}\n {%- endif -%}\n {{ '<end_of_turn>\n' }}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{'<start_of_turn>model\n'}}\n{%- endif -%}\n",
3965
+ "stop_token_ids": [
3966
+ 1,
3967
+ 106,
3968
+ 107
3969
+ ],
3970
+ "stop": [
3971
+ "<eos>",
3972
+ "<end_of_turn>",
3973
+ "<start_of_turn>"
3974
+ ]
3975
+ },
3741
3976
  {
3742
3977
  "version":1,
3743
3978
  "context_length":2048,
@@ -4673,7 +4908,7 @@
4673
4908
  "none"
4674
4909
  ],
4675
4910
  "model_hub": "modelscope",
4676
- "model_id": "OpenGVLab/InternVL2_5-MPO-1B",
4911
+ "model_id": "OpenGVLab/InternVL2_5-1B-MPO",
4677
4912
  "model_revision": "master"
4678
4913
  },
4679
4914
  {
@@ -4685,7 +4920,7 @@
4685
4920
  "none"
4686
4921
  ],
4687
4922
  "model_hub": "modelscope",
4688
- "model_id": "OpenGVLab/InternVL2_5-MPO-2B",
4923
+ "model_id": "OpenGVLab/InternVL2_5-2B-MPO",
4689
4924
  "model_revision": "master"
4690
4925
  },
4691
4926
  {
@@ -4697,7 +4932,7 @@
4697
4932
  "none"
4698
4933
  ],
4699
4934
  "model_hub": "modelscope",
4700
- "model_id": "OpenGVLab/InternVL2_5-MPO-4B",
4935
+ "model_id": "OpenGVLab/InternVL2_5-4B-MPO",
4701
4936
  "model_revision": "master"
4702
4937
  },
4703
4938
  {
@@ -4707,7 +4942,7 @@
4707
4942
  "Int4"
4708
4943
  ],
4709
4944
  "model_hub": "modelscope",
4710
- "model_id": "OpenGVLab/InternVL2_5-MPO-4B-AWQ",
4945
+ "model_id": "OpenGVLab/InternVL2_5-4B-MPO-AWQ",
4711
4946
  "model_revision": "master"
4712
4947
  },
4713
4948
  {
@@ -4719,7 +4954,7 @@
4719
4954
  "none"
4720
4955
  ],
4721
4956
  "model_hub": "modelscope",
4722
- "model_id": "OpenGVLab/InternVL2_5-MPO-8B",
4957
+ "model_id": "OpenGVLab/InternVL2_5-8B-MPO",
4723
4958
  "model_revision": "master"
4724
4959
  },
4725
4960
  {
@@ -4729,7 +4964,7 @@
4729
4964
  "Int4"
4730
4965
  ],
4731
4966
  "model_hub": "modelscope",
4732
- "model_id": "OpenGVLab/InternVL2_5-MPO-8B-AWQ",
4967
+ "model_id": "OpenGVLab/InternVL2_5-8B-MPO-AWQ",
4733
4968
  "model_revision": "master"
4734
4969
  },
4735
4970
  {
@@ -4741,7 +4976,7 @@
4741
4976
  "none"
4742
4977
  ],
4743
4978
  "model_hub": "modelscope",
4744
- "model_id": "OpenGVLab/InternVL2_5-MPO-26B",
4979
+ "model_id": "OpenGVLab/InternVL2_5-26B-MPO",
4745
4980
  "model_revision": "master"
4746
4981
  },
4747
4982
  {
@@ -4751,7 +4986,7 @@
4751
4986
  "Int4"
4752
4987
  ],
4753
4988
  "model_hub": "modelscope",
4754
- "model_id": "OpenGVLab/InternVL2_5-MPO-26B-AWQ",
4989
+ "model_id": "OpenGVLab/InternVL2_5-26B-MPO-AWQ",
4755
4990
  "model_revision": "master"
4756
4991
  },
4757
4992
  {
@@ -4763,7 +4998,7 @@
4763
4998
  "none"
4764
4999
  ],
4765
5000
  "model_hub": "modelscope",
4766
- "model_id": "OpenGVLab/InternVL2_5-MPO-38B",
5001
+ "model_id": "OpenGVLab/InternVL2_5-38B-MPO",
4767
5002
  "model_revision": "master"
4768
5003
  },
4769
5004
  {
@@ -4773,7 +5008,7 @@
4773
5008
  "Int4"
4774
5009
  ],
4775
5010
  "model_hub": "modelscope",
4776
- "model_id": "OpenGVLab/InternVL2_5-MPO-38B-AWQ",
5011
+ "model_id": "OpenGVLab/InternVL2_5-38B-MPO-AWQ",
4777
5012
  "model_revision": "master"
4778
5013
  },
4779
5014
  {
@@ -4785,7 +5020,7 @@
4785
5020
  "none"
4786
5021
  ],
4787
5022
  "model_hub": "modelscope",
4788
- "model_id": "OpenGVLab/InternVL2_5-MPO-78B",
5023
+ "model_id": "OpenGVLab/InternVL2_5-78B-MPO",
4789
5024
  "model_revision": "master"
4790
5025
  },
4791
5026
  {
@@ -4795,7 +5030,7 @@
4795
5030
  "Int4"
4796
5031
  ],
4797
5032
  "model_hub": "modelscope",
4798
- "model_id": "OpenGVLab/InternVL2_5-MPO-78B-AWQ",
5033
+ "model_id": "OpenGVLab/InternVL2_5-78B-MPO-AWQ",
4799
5034
  "model_revision": "master"
4800
5035
  }
4801
5036
  ],
@@ -5657,7 +5892,7 @@
5657
5892
  "model_hub": "modelscope"
5658
5893
  }
5659
5894
  ],
5660
- "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\n\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁callend|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{{'<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}",
5895
+ "chat_template": "{% if messages %} {% if system or tools %} {% if system %} {{ system }} {% endif %} {% if tools %} {# Handle tools here if needed #} {% endif %} {% endif %} {% for message in messages %} {% set last = loop.index == loop.length %} {% if message.role == \"user\" %} <|User|> {% if tools and last %} Given the following functions, please respond with a JSON for a function call with its proper arguments that best answers the given prompt. Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}. Do not use variables. {{ tools }} {% endif %} {{ message.content }} {% if last %} <|Assistant|> {% endif %} {% elif message.role == \"assistant\" %} <|Assistant|> {% if message.tool_calls %} <|tool▁callsbegin|> {% for tool in message.tool_calls %} <|tool▁call▁begin|> {\"name\": \"{{ tool.function.name }}\", \"parameters\": {{ tool.function.arguments }}} <|tool▁call▁end|> {% endfor %} <|tool▁calls▁end|> {% else %} {{ message.content }} {% if not last %} <|end▁of▁sentence|> {% endif %} {% endif %} {% elif message.role == \"tool\" %} <|tool▁outputs▁begin|> <|tool▁output▁begin|> {{ message.content }} <|tool▁output▁end|> <|tool▁outputs▁end|> {% if last and message.role != \"assistant\" %} <|Assistant|> {% endif %} {% endif %} {% endfor %} {% else %} {% if system %} {{ system }} {% endif %} {% if prompt %} <|User|> {{ prompt }} {% endif %} <|Assistant|> {{ response }} {% if response %} {{ response }} {% endif %} {% endif %}",
5661
5896
  "stop_token_ids": [
5662
5897
  1
5663
5898
  ],
@@ -7217,7 +7452,7 @@
7217
7452
  ],
7218
7453
  "model_id": "AI-ModelScope/QwQ-32B-Preview-GGUF",
7219
7454
  "model_file_name_template": "QwQ-32B-Preview-{quantization}.gguf",
7220
- "model_hub": "modelscope"
7455
+ "model_hub": "modelscope"
7221
7456
  }
7222
7457
  ],
7223
7458
  "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
@@ -7234,7 +7469,7 @@
7234
7469
  },
7235
7470
  {
7236
7471
  "version": 1,
7237
- "context_length": 32768,
7472
+ "context_length": 131072,
7238
7473
  "model_name": "QwQ-32B",
7239
7474
  "model_lang": [
7240
7475
  "en",
@@ -7284,14 +7519,14 @@
7284
7519
  "model_size_in_billions": 32,
7285
7520
  "quantizations": [
7286
7521
  "fp16",
7287
- "Q2_k",
7288
- "Q3_K_M",
7289
- "Q4_0",
7290
- "Q4_K_M",
7291
- "Q5_0",
7292
- "Q5_K_M",
7293
- "Q6_K",
7294
- "Q8_0"
7522
+ "q2_k",
7523
+ "q3_k_m",
7524
+ "q4_0",
7525
+ "q4_k_m",
7526
+ "q5_0",
7527
+ "q5_k_m",
7528
+ "q6_k",
7529
+ "q8_0"
7295
7530
  ],
7296
7531
  "model_id": "Qwen/QwQ-32B-GGUF",
7297
7532
  "model_file_name_template": "qwq-32b-{quantization}.gguf",
@@ -148,11 +148,15 @@ class MLXModel(LLM):
148
148
  self._max_kv_size = kwargs.get("max_kv_size", None)
149
149
  self._prompt_cache = PromptCache()
150
150
 
151
- return load(
151
+ model, tokenizer = load(
152
152
  self.model_path,
153
153
  tokenizer_config=tokenizer_config,
154
154
  model_config=self._model_config,
155
155
  )
156
+ if stop_token_ids := self.model_family.stop_token_ids:
157
+ for stop_token_id in stop_token_ids:
158
+ tokenizer.add_eos_token(stop_token_id)
159
+ return model, tokenizer
156
160
 
157
161
  def load(self):
158
162
  reasoning_content = self._model_config.pop("reasoning_content")
@@ -260,7 +264,7 @@ class MLXModel(LLM):
260
264
  start = time.time()
261
265
  output = ""
262
266
  tokens = []
263
- for chunk_resp, i in zip(
267
+ for i, chunk_resp in enumerate(
264
268
  self._generate_stream_inner(
265
269
  prompt_token_ids=prompt_token_ids,
266
270
  max_tokens=max_tokens,
@@ -269,8 +273,7 @@ class MLXModel(LLM):
269
273
  repetition_penalty=kwargs["repetition_penalty"],
270
274
  repetition_context_size=kwargs["repetition_context_size"],
271
275
  prompt_cache=self._prompt_cache.cache if self._prompt_cache else None, # type: ignore
272
- ),
273
- range(max_tokens),
276
+ )
274
277
  ):
275
278
  token = chunk_resp.token
276
279
  tokens.append(token)
@@ -435,10 +438,11 @@ class MLXChatModel(MLXModel, ChatModelMixin):
435
438
  tools = generate_config.pop("tools", []) if generate_config else None
436
439
  full_context_kwargs = {}
437
440
  if tools:
438
- if model_family in QWEN_TOOL_CALL_FAMILY:
441
+ if (
442
+ model_family in QWEN_TOOL_CALL_FAMILY
443
+ or model_family in DEEPSEEK_TOOL_CALL_FAMILY
444
+ ):
439
445
  full_context_kwargs["tools"] = tools
440
- elif model_family in DEEPSEEK_TOOL_CALL_FAMILY:
441
- self._tools_to_messages_for_deepseek(messages, tools)
442
446
  assert self.model_family.chat_template is not None
443
447
  full_prompt = self.get_full_context(
444
448
  messages, self.model_family.chat_template, **full_context_kwargs
@@ -507,19 +511,19 @@ class MLXVisionModel(MLXModel, ChatModelMixin):
507
511
  from mlx_lm.utils import GenerationResponse
508
512
  from mlx_vlm.utils import generate_step
509
513
 
510
- inputs = kwargs["prompt_token_ids"]
514
+ inputs = kwargs.pop("prompt_token_ids")
511
515
 
512
- max_tokens = kwargs.pop("max_tokens")
516
+ extra_kwargs = kwargs.copy()
513
517
  input_ids, pixel_values, mask, kwargs = inputs
518
+ kwargs.update(extra_kwargs)
514
519
 
515
520
  tokenizer = self._processor.tokenizer
516
521
  detokenizer = self._processor.detokenizer
517
522
 
518
523
  detokenizer.reset()
519
524
  tic = time.perf_counter()
520
- for (token, logprobs), n in zip(
525
+ for n, (token, logprobs) in enumerate(
521
526
  generate_step(input_ids, self._model, pixel_values, mask, **kwargs),
522
- range(max_tokens),
523
527
  ):
524
528
  if n == 0:
525
529
  prompt_time = time.perf_counter() - tic
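
For readers unfamiliar with the iteration pattern used in the mlx/core.py hunks above, here is a standalone sketch with purely illustrative names: enumerate() numbers every chunk the generator yields, whereas zipping the stream against range(max_tokens) would also silently stop the loop after max_tokens items.

    def token_stream():
        # Stand-in for the model's streaming generator.
        for piece in ("Hel", "lo", ", ", "world", "!"):
            yield piece

    pieces = []
    for i, chunk in enumerate(token_stream()):
        if i == 0:
            pass  # e.g. record time-to-first-token here
        pieces.append(chunk)

    print("".join(pieces))  # Hello, world!
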
@@ -1,20 +1,17 @@
1
1
  import re
2
2
  from typing import Optional, Tuple, Union
3
3
 
4
- from ....types import ChatCompletionChunkDelta, CompletionChoice
5
- from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager
4
+ from ...types import ChatCompletionChunkDelta, CompletionChoice
6
5
 
7
6
 
8
- @ReasoningParserManager.register_module("deepseek-v3")
9
- @ReasoningParserManager.register_module("deepseek-r1-distill-qwen")
10
- @ReasoningParserManager.register_module("deepseek-r1-distill-llama")
11
- class DeepSeekR1ReasoningParser(ReasoningParser):
12
- """Reasoning parser for DeepSeek-R1 model."""
7
+ class ReasoningParser:
8
+ """Reasoning parser for reasoning model."""
13
9
 
14
10
  def __init__(
15
11
  self, reasoning_start_tag: str = "<think>", reasoning_end_tag: str = "</think>"
16
12
  ):
17
- super().__init__(reasoning_start_tag, reasoning_end_tag)
13
+ self.reasoning_start_tag = reasoning_start_tag
14
+ self.reasoning_end_tag = reasoning_end_tag
18
15
  self.reasoning_regex = re.compile(
19
16
  rf"{self.reasoning_start_tag}(.*?){self.reasoning_end_tag}", re.DOTALL
20
17
  )
@@ -34,9 +31,7 @@ class DeepSeekR1ReasoningParser(ReasoningParser):
34
31
  Yields:
35
32
  str: Extracted reasoning content chunks.
36
33
  """
37
- delta = ChatCompletionChunkDelta(
38
- content=delta_text,
39
- )
34
+ delta = ChatCompletionChunkDelta()
40
35
 
41
36
  # Check if <think> is present in previous or delta.
42
37
  # Keep compatibility with models that don't generate <think> tokens.
@@ -50,17 +45,21 @@ class DeepSeekR1ReasoningParser(ReasoningParser):
50
45
  delta["reasoning_content"] = reasoning_content
51
46
  if content is not None:
52
47
  delta["content"] = content
48
+ else:
49
+ delta["content"] = None
53
50
  return delta
54
51
  elif self.reasoning_end_tag in previous_text:
55
52
  # <think> in previous, </think> in previous,
56
53
  # <think> in previous, </think> in previous,
57
54
  # reasoning content ends
55
+ delta["reasoning_content"] = None
56
+ delta["content"] = delta_text
58
57
  return delta
59
58
  else:
60
59
  # <think> in previous, no </think> in previous or delta,
61
60
  # reasoning content continues
62
61
  delta["reasoning_content"] = delta_text
63
- delta["content"] = ""
62
+ delta["content"] = None
64
63
  return delta
65
64
  elif self.reasoning_start_tag in delta_text:
66
65
  if self.reasoning_end_tag in delta_text:
@@ -74,12 +73,14 @@ class DeepSeekR1ReasoningParser(ReasoningParser):
74
73
  delta["reasoning_content"] = reasoning_content
75
74
  if content is not None:
76
75
  delta["content"] = content
76
+ else:
77
+ delta["content"] = None
77
78
  return delta
78
79
  else:
79
80
  # <think> in delta, no </think> in delta,
80
81
  # reasoning content continues
81
82
  delta["reasoning_content"] = delta_text
82
- delta["content"] = ""
83
+ delta["content"] = None
83
84
  return delta
84
85
  else:
85
86
  # No <think> in previous or delta, also need to check for </think>.
@@ -94,14 +95,18 @@ class DeepSeekR1ReasoningParser(ReasoningParser):
94
95
  delta["reasoning_content"] = reasoning_content
95
96
  if content is not None:
96
97
  delta["content"] = content
98
+ else:
99
+ delta["content"] = None
97
100
  return delta
98
101
  elif self.reasoning_end_tag in previous_text:
99
102
  # </think> in previous, thinking content ends
103
+ delta["reasoning_content"] = None
104
+ delta["content"] = delta_text
100
105
  return delta
101
106
  else:
102
107
  # no </think> in previous or delta, reasoning content continues
103
108
  delta["reasoning_content"] = delta_text
104
- delta["content"] = ""
109
+ delta["content"] = None
105
110
  return delta
106
111
 
107
112
  def extract_reasoning_content(
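
To make the behaviour of the consolidated ReasoningParser above concrete, here is a minimal self-contained sketch of the tag-splitting idea; it reuses the regex shown in the hunk but omits the class, the streaming bookkeeping, and the ChatCompletionChunkDelta plumbing.

    import re

    reasoning_start_tag, reasoning_end_tag = "<think>", "</think>"
    reasoning_regex = re.compile(
        rf"{reasoning_start_tag}(.*?){reasoning_end_tag}", re.DOTALL
    )

    text = "<think>The user greeted me; answer briefly.</think>Hello! How can I help?"
    match = reasoning_regex.search(text)
    reasoning_content = match.group(1) if match else None
    content = text[match.end():] if match else text

    print(reasoning_content)  # The user greeted me; answer briefly.
    print(content)            # Hello! How can I help?
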
@@ -94,6 +94,8 @@ SGLANG_SUPPORTED_CHAT_MODELS = [
94
94
  "mixtral-instruct-v0.1",
95
95
  "gemma-it",
96
96
  "gemma-2-it",
97
+ "gemma-3-1b-it",
98
+ "gemma-3-it",
97
99
  "deepseek-v2.5",
98
100
  "deepseek-v2-chat",
99
101
  "deepseek-v2-chat-0628",
@@ -79,6 +79,8 @@ NON_DEFAULT_MODEL_LIST: List[str] = [
79
79
  "glm-edge-v",
80
80
  "QvQ-72B-Preview",
81
81
  "cogagent",
82
+ "gemma-3-1b-it",
83
+ "gemma-3-it",
82
84
  ]
83
85
 
84
86
 
@@ -691,10 +693,9 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
691
693
  tools
692
694
  and model_family in QWEN_TOOL_CALL_FAMILY
693
695
  or model_family in LLAMA3_TOOL_CALL_FAMILY
696
+ or model_family in DEEPSEEK_TOOL_CALL_FAMILY
694
697
  ):
695
698
  full_context_kwargs["tools"] = tools
696
- elif tools and model_family in DEEPSEEK_TOOL_CALL_FAMILY:
697
- self._tools_to_messages_for_deepseek(messages, tools)
698
699
  assert self.model_family.chat_template is not None
699
700
  full_prompt = self.get_full_context(
700
701
  messages,
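
Finally, a hedged usage sketch related to the tool-call change in transformers/core.py above, where DeepSeek-family models now receive tools through the chat template like the Qwen and Llama 3 families: a tool-call request can be sent through xinference's OpenAI-compatible endpoint. The base URL and model identifier are placeholders for a locally launched model.

    import openai

    client = openai.OpenAI(base_url="http://127.0.0.1:9997/v1", api_key="not-used")

    response = client.chat.completions.create(
        model="deepseek-v3",  # placeholder: name/uid of a launched DeepSeek-family model
        messages=[{"role": "user", "content": "What is the weather in Paris today?"}],
        tools=[
            {
                "type": "function",
                "function": {
                    "name": "get_weather",
                    "description": "Get the current weather for a city.",
                    "parameters": {
                        "type": "object",
                        "properties": {"city": {"type": "string"}},
                        "required": ["city"],
                    },
                },
            }
        ],
    )
    print(response.choices[0].message)
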