xinference 1.3.0.post2__py3-none-any.whl → 1.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +1 -0
- xinference/conftest.py +7 -0
- xinference/core/model.py +3 -1
- xinference/core/scheduler.py +3 -0
- xinference/core/worker.py +1 -1
- xinference/model/embedding/core.py +12 -5
- xinference/model/llm/__init__.py +2 -1
- xinference/model/llm/core.py +13 -0
- xinference/model/llm/llama_cpp/core.py +260 -3
- xinference/model/llm/llm_family.json +306 -17
- xinference/model/llm/llm_family_modelscope.json +347 -28
- xinference/model/llm/mlx/core.py +15 -4
- xinference/model/llm/reasoning_parsers/abs_reasoning_parsers.py +1 -1
- xinference/model/llm/reasoning_parsers/deepseek_r1_reasoning_parser.py +4 -5
- xinference/model/llm/sglang/core.py +7 -2
- xinference/model/llm/transformers/chatglm.py +4 -4
- xinference/model/llm/transformers/core.py +22 -5
- xinference/model/llm/transformers/intern_vl.py +2 -1
- xinference/model/llm/transformers/utils.py +1 -1
- xinference/model/llm/utils.py +103 -67
- xinference/model/llm/vllm/core.py +29 -42
- xinference/types.py +4 -0
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.55b70cb7.js +3 -0
- xinference/web/ui/build/static/js/main.55b70cb7.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0f0adb2283a8f469d097a7a0ebb754624fa52414c83b83696c41f2e6a737ceda.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2deac8d5636974533e3714f34e94fc754f9153a07c6ee11e72846cb8eae47e4b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8157db83995c671eb57abc316c337f867d1dc63fb83520bb4ff351fee57dcce2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/87a9b13f2466f375ae5c6e7c08b279cc38351d29710d7f7626bbb07a85262b79.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e23d476fcbf6fd69c8986bf82133d257d28aa8fc9a5cab231d81c1c75c58cd99.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e547bbb18abb4a474b675a8d5782d25617566bea0af8caa9b836ce5649e2250a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e7a8c37fda8725cab69c7ef8c627060bd7fc806adc67e00fe628ba148cb86d7f.json +1 -0
- xinference/web/ui/src/locales/en.json +9 -1
- xinference/web/ui/src/locales/zh.json +9 -1
- {xinference-1.3.0.post2.dist-info → xinference-1.3.1.dist-info}/METADATA +7 -3
- {xinference-1.3.0.post2.dist-info → xinference-1.3.1.dist-info}/RECORD +43 -42
- xinference/web/ui/build/static/js/main.ad42919c.js +0 -3
- xinference/web/ui/build/static/js/main.ad42919c.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/074a42304bbbaa79e1bfc3b28502457a390df55708de9006f4cc8e35c60aea87.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/279ace390216236a82b3d8995c78eca4d637ac9a523e9f521a2d9c76607a43d7.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/630a7bd592596cc6e291fc32238ce7c08238038a64ed8ccee0eb0c13c9902910.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/914c33e91c1012e3bcd3e96f3a25884cbef148290632d0266dab972b8cc1e95f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/b7939cd3a48adf12fccfdd0803019b5cc235ff7de3a297dae70ce635e0eea13e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/fecf076bcd198a458c2a6ab0e85e40dc1c99994c353164e79c469be162cb74c9.json +0 -1
- /xinference/web/ui/build/static/js/{main.ad42919c.js.LICENSE.txt → main.55b70cb7.js.LICENSE.txt} +0 -0
- {xinference-1.3.0.post2.dist-info → xinference-1.3.1.dist-info}/LICENSE +0 -0
- {xinference-1.3.0.post2.dist-info → xinference-1.3.1.dist-info}/WHEEL +0 -0
- {xinference-1.3.0.post2.dist-info → xinference-1.3.1.dist-info}/entry_points.txt +0 -0
- {xinference-1.3.0.post2.dist-info → xinference-1.3.1.dist-info}/top_level.txt +0 -0
@@ -4523,36 +4523,169 @@
         "model_id": "OpenGVLab/InternVL2_5-1B",
         "model_revision": "master"
       },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 2,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "OpenGVLab/InternVL2_5-2B",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 4,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "OpenGVLab/InternVL2_5-4B",
+        "model_revision": "master"
+      },
       {
         "model_format": "awq",
-        "model_size_in_billions":
+        "model_size_in_billions": 4,
         "quantizations": [
           "Int4"
         ],
         "model_hub": "modelscope",
-        "model_id": "OpenGVLab/InternVL2_5-
+        "model_id": "OpenGVLab/InternVL2_5-4B-AWQ",
         "model_revision": "master"
       },
       {
         "model_format": "pytorch",
-        "model_size_in_billions":
+        "model_size_in_billions": 8,
         "quantizations": [
           "4-bit",
           "8-bit",
           "none"
         ],
         "model_hub": "modelscope",
-        "model_id": "OpenGVLab/InternVL2_5-
+        "model_id": "OpenGVLab/InternVL2_5-8B",
         "model_revision": "master"
       },
       {
         "model_format": "awq",
-        "model_size_in_billions":
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "OpenGVLab/InternVL2_5-8B-AWQ",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 26,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "OpenGVLab/InternVL2_5-26B",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 26,
         "quantizations": [
           "Int4"
         ],
         "model_hub": "modelscope",
-        "model_id": "OpenGVLab/InternVL2_5-
+        "model_id": "OpenGVLab/InternVL2_5-26B-AWQ",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 38,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "OpenGVLab/InternVL2_5-38B",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 38,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "OpenGVLab/InternVL2_5-38B-AWQ",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 78,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "OpenGVLab/InternVL2_5-78B",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 78,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "OpenGVLab/InternVL2_5-78B-AWQ",
+        "model_revision": "master"
+      }
+    ],
+    "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+    "stop_token_ids": [],
+    "stop": []
+  },
+  {
+    "version": 1,
+    "context_length": 16384,
+    "model_name": "InternVL2.5-MPO",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "vision"
+    ],
+    "model_description": "InternVL 2.5 is an open-source multimodal large language model (MLLM) to bridge the capability gap between open-source and proprietary commercial models in multimodal understanding. ",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 1,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "OpenGVLab/InternVL2_5-MPO-1B",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 2,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "OpenGVLab/InternVL2_5-MPO-2B",
         "model_revision": "master"
       },
       {
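The hunk above registers ModelScope specs for InternVL2.5 from 2B through 78B in both pytorch and AWQ formats. As a minimal usage sketch (assuming a local supervisor on the default port, and assuming these specs belong to an "InternVL2.5" family, inferred from the sibling "InternVL2.5-MPO" entry), one of the new entries could be launched with the RESTful client:

# Minimal sketch, not part of the diff; endpoint and family name are assumptions.
from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")
model_uid = client.launch_model(
    model_name="InternVL2.5",      # assumed family name for the specs above
    model_engine="transformers",
    model_format="pytorch",
    model_size_in_billions=8,
    quantization="none",
)
model = client.get_model(model_uid)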
@@ -4564,7 +4697,7 @@
           "none"
         ],
         "model_hub": "modelscope",
-        "model_id": "OpenGVLab/InternVL2_5-4B",
+        "model_id": "OpenGVLab/InternVL2_5-MPO-4B",
         "model_revision": "master"
       },
       {
@@ -4574,7 +4707,7 @@
           "Int4"
         ],
         "model_hub": "modelscope",
-        "model_id": "OpenGVLab/InternVL2_5-4B-AWQ",
+        "model_id": "OpenGVLab/InternVL2_5-MPO-4B-AWQ",
         "model_revision": "master"
       },
       {
@@ -4586,7 +4719,7 @@
           "none"
         ],
         "model_hub": "modelscope",
-        "model_id": "OpenGVLab/InternVL2_5-8B",
+        "model_id": "OpenGVLab/InternVL2_5-MPO-8B",
         "model_revision": "master"
       },
       {
@@ -4596,7 +4729,7 @@
           "Int4"
         ],
         "model_hub": "modelscope",
-        "model_id": "OpenGVLab/InternVL2_5-8B-AWQ",
+        "model_id": "OpenGVLab/InternVL2_5-MPO-8B-AWQ",
         "model_revision": "master"
       },
       {
@@ -4608,7 +4741,7 @@
           "none"
         ],
         "model_hub": "modelscope",
-        "model_id": "OpenGVLab/InternVL2_5-26B",
+        "model_id": "OpenGVLab/InternVL2_5-MPO-26B",
         "model_revision": "master"
       },
       {
@@ -4618,7 +4751,7 @@
           "Int4"
         ],
         "model_hub": "modelscope",
-        "model_id": "OpenGVLab/InternVL2_5-26B-AWQ",
+        "model_id": "OpenGVLab/InternVL2_5-MPO-26B-AWQ",
         "model_revision": "master"
       },
       {
@@ -4630,7 +4763,7 @@
           "none"
         ],
         "model_hub": "modelscope",
-        "model_id": "OpenGVLab/InternVL2_5-38B",
+        "model_id": "OpenGVLab/InternVL2_5-MPO-38B",
         "model_revision": "master"
       },
       {
@@ -4640,7 +4773,7 @@
           "Int4"
         ],
         "model_hub": "modelscope",
-        "model_id": "OpenGVLab/InternVL2_5-38B-AWQ",
+        "model_id": "OpenGVLab/InternVL2_5-MPO-38B-AWQ",
         "model_revision": "master"
       },
       {
@@ -4652,7 +4785,7 @@
           "none"
         ],
         "model_hub": "modelscope",
-        "model_id": "OpenGVLab/InternVL2_5-78B",
+        "model_id": "OpenGVLab/InternVL2_5-MPO-78B",
         "model_revision": "master"
       },
       {
@@ -4662,7 +4795,7 @@
           "Int4"
         ],
         "model_hub": "modelscope",
-        "model_id": "OpenGVLab/InternVL2_5-78B-AWQ",
+        "model_id": "OpenGVLab/InternVL2_5-MPO-78B-AWQ",
         "model_revision": "master"
       }
     ],
@@ -5020,7 +5153,7 @@
           "none"
         ],
         "model_hub": "modelscope",
-        "model_id":"
+        "model_id":"Qwen/Qwen2.5-VL-3B-Instruct"
       },
       {
         "model_format":"pytorch",
@@ -5029,7 +5162,7 @@
           "none"
         ],
         "model_hub": "modelscope",
-        "model_id":"
+        "model_id":"Qwen/Qwen2.5-VL-7B-Instruct"
       },
       {
         "model_format":"pytorch",
@@ -5038,7 +5171,34 @@
           "none"
         ],
         "model_hub": "modelscope",
-        "model_id":"
+        "model_id":"Qwen/Qwen2.5-VL-72B-Instruct"
+      },
+      {
+        "model_format":"awq",
+        "model_size_in_billions":3,
+        "quantizations":[
+          "Int4"
+        ],
+        "model_hub": "modelscope",
+        "model_id":"Qwen/Qwen2.5-VL-3B-Instruct-AWQ"
+      },
+      {
+        "model_format":"awq",
+        "model_size_in_billions":7,
+        "quantizations":[
+          "Int4"
+        ],
+        "model_hub": "awq",
+        "model_id":"Qwen/Qwen2.5-VL-7B-Instruct-AWQ"
+      },
+      {
+        "model_format":"pytorch",
+        "model_size_in_billions":72,
+        "quantizations":[
+          "Int4"
+        ],
+        "model_hub": "modelscope",
+        "model_id":"Qwen/Qwen2.5-VL-72B-Instruct-AWQ"
+      },
       {
         "model_format":"mlx",
@@ -5363,8 +5523,7 @@
       "zh"
     ],
     "model_ability": [
-      "chat",
-      "reasoning"
+      "chat"
     ],
     "model_description": "DeepSeek-V3, a strong Mixture-of-Experts (MoE) language model with 671B total parameters with 37B activated for each token. ",
     "model_specs": [
@@ -5498,15 +5657,13 @@
         "model_hub": "modelscope"
       }
     ],
-    "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '
+    "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\n\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{{'<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}",
     "stop_token_ids": [
       1
     ],
     "stop": [
       "<|end▁of▁sentence|>"
-    ],
-    "reasoning_start_tag": "<think>",
-    "reasoning_end_tag": "</think>"
+    ]
   },
   {
     "version": 1,
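The replacement DeepSeek-V3 template above differs from the removed one chiefly in the new is_first_sp namespace flag: all system messages are now concatenated into one system prompt, where the old template let each system message overwrite the previous one. A tiny Python equivalent of that accumulation logic:

# Python equivalent of the is_first_sp logic in the template above:
# system messages are joined into one prompt, separated by blank lines.
messages = [
    {"role": "system", "content": "You are helpful."},
    {"role": "system", "content": "Answer in English."},
    {"role": "user", "content": "Hi"},
]
system_prompt = "\n\n".join(
    m["content"] for m in messages if m["role"] == "system"
)
assert system_prompt == "You are helpful.\n\nAnswer in English."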
@@ -5517,7 +5674,8 @@
       "zh"
     ],
     "model_ability": [
-      "chat"
+      "chat",
+      "reasoning"
     ],
     "model_description": "DeepSeek-R1, which incorporates cold-start data before RL. DeepSeek-R1 achieves performance comparable to OpenAI-o1 across math, code, and reasoning tasks.",
     "model_specs": [
@@ -5720,13 +5878,15 @@
         "model_hub": "modelscope"
       }
     ],
-    "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\\n\\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and
+    "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\\n\\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{ bos_token }}{{ ns.system_prompt }}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' in message %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls'] %}{%- if not ns.is_first %}{%- if message['content'] is none %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- else %}{{'<|Assistant|>' + message['content'] + '<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- endfor %}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' not in message %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|><think>\\n'}}{% endif %}",
     "stop_token_ids": [
       1
     ],
     "stop": [
       "<|end▁of▁sentence|>"
-    ]
+    ],
+    "reasoning_start_tag": "<think>",
+    "reasoning_end_tag": "</think>"
   },
   {
     "version": 1,
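This hunk moves the reasoning ability and the <think>/</think> tag pair from DeepSeek-V3 (earlier hunks) onto DeepSeek-R1, where they belong; the new template also strips </think> blocks from assistant history and opens generation with '<|Assistant|><think>\n'. An illustrative splitter showing what the tag pair is for (not the parser's exact implementation):

# Illustrative only: splits a finished R1-style completion into
# reasoning_content and the final answer using the tags declared above.
def split_reasoning(text: str, start: str = "<think>", end: str = "</think>"):
    if end not in text:
        return None, text
    head, _, tail = text.partition(end)
    reasoning = head.split(start, 1)[-1]  # drop the opening tag if present
    return reasoning.strip(), tail.strip()

reasoning, answer = split_reasoning("<think>2+2 is 4</think>The answer is 4.")
assert reasoning == "2+2 is 4" and answer == "The answer is 4."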
@@ -7072,6 +7232,86 @@
       "<|im_end|>"
     ]
   },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "QwQ-32B",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "reasoning"
+    ],
+    "model_description": "QwQ is the reasoning model of the Qwen series. Compared with conventional instruction-tuned models, QwQ, which is capable of thinking and reasoning, can achieve significantly enhanced performance in downstream tasks, especially hard problems. QwQ-32B is the medium-sized reasoning model, which is capable of achieving competitive performance against state-of-the-art reasoning models, e.g., DeepSeek-R1, o1-mini.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/QwQ-32B",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "Qwen/QwQ-32B-AWQ",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "3bit",
+          "4bit",
+          "6bit",
+          "8bit",
+          "bf16"
+        ],
+        "model_id": "mlx-community/QwQ-32B-{quantization}",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "fp16",
+          "Q2_k",
+          "Q3_K_M",
+          "Q4_0",
+          "Q4_K_M",
+          "Q5_0",
+          "Q5_K_M",
+          "Q6_K",
+          "Q8_0"
+        ],
+        "model_id": "Qwen/QwQ-32B-GGUF",
+        "model_file_name_template": "qwq-32b-{quantization}.gguf",
+        "model_hub": "modelscope"
+      }
+    ],
+    "chat_template": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- messages[0]['content'] }}\n    {%- else %}\n        {{- '' }}\n    {%- endif %}\n    {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" and not message.tool_calls %}\n        {%- set content = message.content.split('</think>')[-1].lstrip('\\n') %}\n        {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {%- set content = message.content.split('</think>')[-1].lstrip('\\n') %}\n        {{- '<|im_start|>' + message.role }}\n        {%- if message.content %}\n            {{- '\\n' + content }}\n        {%- endif %}\n        {%- for tool_call in message.tool_calls %}\n            {%- if tool_call.function is defined %}\n                {%- set tool_call = tool_call.function %}\n            {%- endif %}\n            {{- '\\n<tool_call>\\n{\"name\": \"' }}\n            {{- tool_call.name }}\n            {{- '\", \"arguments\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- '}\\n</tool_call>' }}\n        {%- endfor %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- message.content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n<think>\\n' }}\n{%- endif %}\n",
+    "stop_token_ids": [
+      151643,
+      151644,
+      151645
+    ],
+    "stop": [
+      "<|endoftext|>",
+      "<|im_start|>",
+      "<|im_end|>"
+    ],
+    "reasoning_start_tag": "<think>",
+    "reasoning_end_tag": "</think>"
+  },
   {
     "version": 1,
     "context_length": 131072,
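Two of the QwQ-32B specs above are templated: the mlx model_id and the ggufv2 model_file_name_template both carry a {quantization} placeholder that is filled in with the chosen quantization when the model is resolved. Illustratively:

# Illustrative expansion of the templated fields in the QwQ-32B specs above.
quantization = "Q4_K_M"
gguf_file = "qwq-32b-{quantization}.gguf".format(quantization=quantization)
assert gguf_file == "qwq-32b-Q4_K_M.gguf"

mlx_repo = "mlx-community/QwQ-32B-{quantization}".format(quantization="4bit")
assert mlx_repo == "mlx-community/QwQ-32B-4bit"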
@@ -7866,5 +8106,84 @@
       "</s>",
       "<|im_end|>"
     ]
+  },
+  {
+    "version": 1,
+    "context_length": 1010000,
+    "model_name": "qwen2.5-instruct-1m",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Qwen2.5-1M is the long-context version of the Qwen2.5 series models, supporting a context length of up to 1M tokens.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen2.5-7B-Instruct-1M",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen2.5-14B-Instruct-1M",
+        "model_hub": "modelscope"
+      }
+    ],
+    "chat_template": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- messages[0]['content'] }}\n    {%- else %}\n        {{- 'You are a helpful assistant.' }}\n    {%- endif %}\n    {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n    {%- else %}\n        {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {{- '<|im_start|>' + message.role }}\n        {%- if message.content %}\n            {{- '\\n' + message.content }}\n        {%- endif %}\n        {%- for tool_call in message.tool_calls %}\n            {%- if tool_call.function is defined %}\n                {%- set tool_call = tool_call.function %}\n            {%- endif %}\n            {{- '\\n<tool_call>\\n{\"name\": \"' }}\n            {{- tool_call.name }}\n            {{- '\", \"arguments\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- '}\\n</tool_call>' }}\n        {%- endfor %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- message.content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+    "stop_token_ids": [
+      151645,
+      151643
+    ],
+    "stop": [
+      "<|im_end|>",
+      "<|endoftext|>"
+    ]
+  },
+  {
+    "version": 1,
+    "context_length": 8192,
+    "model_name": "moonlight-16b-a3b-instruct",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Kimi Muon is Scalable for LLM Training",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 3,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "moonshotai/Moonlight-16B-A3B-Instruct",
+        "model_hub": "modelscope"
+      }
+    ],
+    "chat_template":"{%- for message in messages -%}{%- if loop.first and messages[0]['role'] != 'system' -%}<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>{%- endif -%}{%- if message['role'] == 'system' -%}<|im_system|>{%- endif -%}{%- if message['role'] == 'user' -%}<|im_user|>{%- endif -%}{%- if message['role'] == 'assistant' -%}<|im_assistant|>{%- endif -%}{{ message['role'] }}<|im_middle|>{{message['content']}}<|im_end|>{%- endfor -%}{%- if add_generation_prompt -%}<|im_assistant|>assistant<|im_middle|>{%- endif -%}",
+    "stop_token_ids": [
+      163586
+    ],
+    "stop": [
+      "<|im_end|>"
+    ]
   }
 ]
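That is the last hunk of the ModelScope registry diff. A quick sanity check one could run over the edited file (illustrative only; the authoritative validation lives in the package's pydantic schema):

# Illustrative check over llm_family_modelscope.json after edits like the above.
import json

REQUIRED_SPEC_KEYS = {"model_format", "model_size_in_billions", "quantizations"}

with open("xinference/model/llm/llm_family_modelscope.json") as f:
    families = json.load(f)

for family in families:
    assert family["model_specs"], family["model_name"]
    for spec in family["model_specs"]:
        # every spec must at least declare format, size, and quantizations
        assert REQUIRED_SPEC_KEYS <= spec.keys(), (family["model_name"], spec)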
xinference/model/llm/mlx/core.py
CHANGED
@@ -45,6 +45,7 @@ class MLXModelConfig(TypedDict, total=False):
     revision: Optional[str]
     max_gpu_memory: str
     trust_remote_code: bool
+    reasoning_content: bool


 class MLXGenerateConfig(TypedDict, total=False):
@@ -95,6 +96,7 @@ class MLXModel(LLM):
         model_config = MLXModelConfig()
         model_config.setdefault("revision", self.model_spec.model_revision)
         model_config.setdefault("trust_remote_code", True)
+        model_config.setdefault("reasoning_content", False)
         return model_config

     def _sanitize_generate_config(
@@ -153,6 +155,9 @@ class MLXModel(LLM):
         )

     def load(self):
+        reasoning_content = self._model_config.pop("reasoning_content")
+        self.prepare_parse_reasoning_content(reasoning_content)
+
         kwargs = {}
         kwargs["revision"] = self._model_config.get(
             "revision", self.model_spec.model_revision
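With the two hunks above, reasoning_content becomes a recognized MLX model-config key that load() consumes before loading weights. A hedged sketch of switching it on at launch time (assuming extra launch_model kwargs reach MLXModelConfig, as the setdefault/pop pair above implies; the model name is only an example):

# Sketch, not part of the diff; model name and kwarg passthrough are assumptions.
from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")
model_uid = client.launch_model(
    model_name="deepseek-r1-distill-qwen",  # example reasoning-capable model
    model_engine="mlx",
    model_format="mlx",
    model_size_in_billions=7,
    quantization="4bit",
    reasoning_content=True,  # popped in load() and fed to prepare_parse_reasoning_content
)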
@@ -445,13 +450,15 @@ class MLXChatModel(MLXModel, ChatModelMixin):
         if stream:
             it = self.generate(full_prompt, generate_config)
             assert isinstance(it, Iterator)
-            return self._to_chat_completion_chunks(it)
+            return self._to_chat_completion_chunks(it, self.reasoning_parser)
         else:
             c = self.generate(full_prompt, generate_config)
             assert not isinstance(c, Iterator)
             if tools:
-                return self.
-
+                return self._post_process_completion(
+                    self.model_family, self.model_uid, c, self.reasoning_parser
+                )
+            return self._to_chat_completion(c, self.reasoning_parser)


 class MLXVisionModel(MLXModel, ChatModelMixin):
@@ -527,6 +534,7 @@ class MLXVisionModel(MLXModel, ChatModelMixin):
                 text=detokenizer.last_segment,
                 token=token,
                 logprobs=logprobs,
+                from_draft=False,
                 prompt_tokens=len(input_ids),
                 prompt_tps=prompt_tps,
                 generation_tokens=n + 1,
@@ -539,6 +547,7 @@
                 text=detokenizer.last_segment,
                 token=token,
                 logprobs=logprobs,
+                from_draft=False,
                 prompt_tokens=len(input_ids),
                 prompt_tps=prompt_tps,
                 generation_tokens=n + 1,
@@ -634,5 +643,7 @@
         c = self.generate(inputs, generate_config)
         assert not isinstance(c, Iterator)
         if tools:
-            return self.
+            return self._post_process_completion(
+                self.model_family, self.model_uid, c
+            )
         return self._to_chat_completion(c)
xinference/model/llm/reasoning_parsers/deepseek_r1_reasoning_parser.py
CHANGED

@@ -23,7 +23,7 @@ class DeepSeekR1ReasoningParser(ReasoningParser):
         self,
         previous_text: str,
         current_text: str,
-
+        delta_text: str,
     ) -> ChatCompletionChunkDelta:
         """Extract reasoning content from DeepSeek-R1 model output in a streaming fashion.

@@ -34,10 +34,9 @@ class DeepSeekR1ReasoningParser(ReasoningParser):
        Yields:
            str: Extracted reasoning content chunks.
        """
-
-
-
-        delta_text = delta["content"]
+        delta = ChatCompletionChunkDelta(
+            content=delta_text,
+        )

        # Check if <think> is present in previous or delta.
        # Keep compatibility with models that don't generate <think> tokens.