xinference 0.16.3__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (69)
  1. xinference/_compat.py +22 -2
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +148 -12
  4. xinference/client/restful/restful_client.py +47 -2
  5. xinference/constants.py +1 -0
  6. xinference/core/model.py +45 -15
  7. xinference/core/supervisor.py +8 -2
  8. xinference/core/utils.py +67 -2
  9. xinference/model/audio/__init__.py +12 -0
  10. xinference/model/audio/core.py +21 -4
  11. xinference/model/audio/fish_speech.py +70 -35
  12. xinference/model/audio/model_spec.json +81 -1
  13. xinference/model/audio/whisper_mlx.py +208 -0
  14. xinference/model/embedding/core.py +259 -4
  15. xinference/model/embedding/model_spec.json +1 -1
  16. xinference/model/embedding/model_spec_modelscope.json +1 -1
  17. xinference/model/image/stable_diffusion/core.py +5 -2
  18. xinference/model/llm/__init__.py +2 -0
  19. xinference/model/llm/llm_family.json +485 -6
  20. xinference/model/llm/llm_family_modelscope.json +519 -0
  21. xinference/model/llm/mlx/core.py +45 -3
  22. xinference/model/llm/sglang/core.py +1 -0
  23. xinference/model/llm/transformers/core.py +1 -0
  24. xinference/model/llm/transformers/glm_edge_v.py +230 -0
  25. xinference/model/llm/utils.py +19 -0
  26. xinference/model/llm/vllm/core.py +84 -2
  27. xinference/model/rerank/core.py +11 -4
  28. xinference/thirdparty/fish_speech/fish_speech/conversation.py +254 -0
  29. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +2 -1
  30. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +2 -1
  31. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +2 -2
  32. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ko_KR.json +123 -0
  33. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +2 -1
  34. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +76 -11
  35. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +9 -9
  36. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +1 -1
  37. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +32 -1
  38. xinference/thirdparty/fish_speech/fish_speech/utils/__init__.py +2 -1
  39. xinference/thirdparty/fish_speech/fish_speech/utils/utils.py +22 -0
  40. xinference/thirdparty/fish_speech/fish_speech/webui/launch_utils.py +1 -1
  41. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +1 -1
  42. xinference/thirdparty/fish_speech/tools/api.py +578 -75
  43. xinference/thirdparty/fish_speech/tools/e2e_webui.py +232 -0
  44. xinference/thirdparty/fish_speech/tools/fish_e2e.py +298 -0
  45. xinference/thirdparty/fish_speech/tools/llama/generate.py +393 -9
  46. xinference/thirdparty/fish_speech/tools/msgpack_api.py +90 -29
  47. xinference/thirdparty/fish_speech/tools/post_api.py +37 -15
  48. xinference/thirdparty/fish_speech/tools/schema.py +187 -0
  49. xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +7 -1
  50. xinference/thirdparty/fish_speech/tools/vqgan/inference.py +2 -3
  51. xinference/thirdparty/fish_speech/tools/webui.py +138 -75
  52. xinference/types.py +2 -1
  53. {xinference-0.16.3.dist-info → xinference-1.0.1.dist-info}/METADATA +30 -6
  54. {xinference-0.16.3.dist-info → xinference-1.0.1.dist-info}/RECORD +58 -63
  55. {xinference-0.16.3.dist-info → xinference-1.0.1.dist-info}/WHEEL +1 -1
  56. xinference/thirdparty/fish_speech/fish_speech/configs/__init__.py +0 -0
  57. xinference/thirdparty/fish_speech/fish_speech/configs/lora/__init__.py +0 -0
  58. xinference/thirdparty/fish_speech/fish_speech/datasets/__init__.py +0 -0
  59. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/__init__.py +0 -0
  60. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/__init__.py +0 -0
  61. xinference/thirdparty/fish_speech/fish_speech/models/__init__.py +0 -0
  62. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/__init__.py +0 -0
  63. xinference/thirdparty/fish_speech/fish_speech/webui/__init__.py +0 -0
  64. xinference/thirdparty/fish_speech/tools/commons.py +0 -35
  65. xinference/thirdparty/fish_speech/tools/llama/__init__.py +0 -0
  66. xinference/thirdparty/fish_speech/tools/vqgan/__init__.py +0 -0
  67. {xinference-0.16.3.dist-info → xinference-1.0.1.dist-info}/LICENSE +0 -0
  68. {xinference-0.16.3.dist-info → xinference-1.0.1.dist-info}/entry_points.txt +0 -0
  69. {xinference-0.16.3.dist-info → xinference-1.0.1.dist-info}/top_level.txt +0 -0
@@ -5907,6 +5907,18 @@
         ],
         "model_description": "Qwen2.5-Coder is the latest series of Code-Specific Qwen large language models (formerly known as CodeQwen).",
         "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": "0_5",
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "qwen/Qwen2.5-Coder-0.5B",
+                "model_revision": "master",
+                "model_hub": "modelscope"
+            },
             {
                 "model_format": "pytorch",
                 "model_size_in_billions": "1_5",
@@ -5919,6 +5931,18 @@
                 "model_revision": "master",
                 "model_hub": "modelscope"
             },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": "3",
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "qwen/Qwen2.5-Coder-3B",
+                "model_revision": "master",
+                "model_hub": "modelscope"
+            },
             {
                 "model_format": "pytorch",
                 "model_size_in_billions": 7,
@@ -5930,6 +5954,30 @@
                 "model_id": "qwen/Qwen2.5-Coder-7B",
                 "model_revision": "master",
                 "model_hub": "modelscope"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 14,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "qwen/Qwen2.5-Coder-14B",
+                "model_revision": "master",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 32,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "qwen/Qwen2.5-Coder-32B",
+                "model_revision": "master",
+                "model_hub": "modelscope"
             }
         ]
     },
@@ -5947,6 +5995,18 @@
         ],
         "model_description": "Qwen2.5-Coder is the latest series of Code-Specific Qwen large language models (formerly known as CodeQwen).",
         "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": "0_5",
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "qwen/Qwen2.5-Coder-0.5B-Instruct",
+                "model_revision": "master",
+                "model_hub": "modelscope"
+            },
             {
                 "model_format": "pytorch",
                 "model_size_in_billions": "1_5",
@@ -5958,6 +6018,17 @@
                 "model_id": "qwen/Qwen2.5-Coder-1.5B-Instruct",
                 "model_revision": "master",
                 "model_hub": "modelscope"
+            }, {
+                "model_format": "pytorch",
+                "model_size_in_billions": "3",
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "qwen/Qwen2.5-Coder-3B-Instruct",
+                "model_revision": "master",
+                "model_hub": "modelscope"
             },
             {
                 "model_format": "pytorch",
@@ -5971,6 +6042,63 @@
                 "model_revision": "master",
                 "model_hub": "modelscope"
             },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 14,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "qwen/Qwen2.5-Coder-14B-Instruct",
+                "model_revision": "master",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 32,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "qwen/Qwen2.5-Coder-32B-Instruct",
+                "model_revision": "master",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "gptq",
+                "model_size_in_billions": "0_5",
+                "quantizations": [
+                    "Int4",
+                    "Int8"
+                ],
+                "model_id": "qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-{quantization}",
+                "model_revision": "master",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "gptq",
+                "model_size_in_billions": "1_5",
+                "quantizations": [
+                    "Int4",
+                    "Int8"
+                ],
+                "model_id": "qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-{quantization}",
+                "model_revision": "master",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "gptq",
+                "model_size_in_billions": 3,
+                "quantizations": [
+                    "Int4",
+                    "Int8"
+                ],
+                "model_id": "qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-{quantization}",
+                "model_revision": "master",
+                "model_hub": "modelscope"
+            },
             {
                 "model_format": "gptq",
                 "model_size_in_billions": 7,
@@ -5982,6 +6110,89 @@
                 "model_revision": "master",
                 "model_hub": "modelscope"
             },
+            {
+                "model_format": "gptq",
+                "model_size_in_billions": 14,
+                "quantizations": [
+                    "Int4",
+                    "Int8"
+                ],
+                "model_id": "qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-{quantization}",
+                "model_revision": "master",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "gptq",
+                "model_size_in_billions": 32,
+                "quantizations": [
+                    "Int4",
+                    "Int8"
+                ],
+                "model_id": "qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-{quantization}",
+                "model_revision": "master",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "awq",
+                "model_size_in_billions": "0_5",
+                "quantizations": [
+                    "Int4"
+                ],
+                "model_id": "qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ",
+                "model_revision": "master",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "awq",
+                "model_size_in_billions": "1_5",
+                "quantizations": [
+                    "Int4"
+                ],
+                "model_id": "qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ",
+                "model_revision": "master",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "awq",
+                "model_size_in_billions": 3,
+                "quantizations": [
+                    "Int4"
+                ],
+                "model_id": "qwen/Qwen2.5-Coder-3B-Instruct-AWQ",
+                "model_revision": "master",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "awq",
+                "model_size_in_billions": 7,
+                "quantizations": [
+                    "Int4"
+                ],
+                "model_id": "qwen/Qwen2.5-Coder-7B-Instruct-AWQ",
+                "model_revision": "master",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "awq",
+                "model_size_in_billions": 14,
+                "quantizations": [
+                    "Int4"
+                ],
+                "model_id": "qwen/Qwen2.5-Coder-14B-Instruct-AWQ",
+                "model_revision": "master",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "awq",
+                "model_size_in_billions": 32,
+                "quantizations": [
+                    "Int4"
+                ],
+                "model_id": "qwen/Qwen2.5-Coder-32B-Instruct-AWQ",
+                "model_revision": "master",
+                "model_hub": "modelscope"
+            },
+
             {
                 "model_format": "ggufv2",
                 "model_size_in_billions": "1_5",
@@ -6056,5 +6267,313 @@
             "<|im_start|>",
             "<|im_end|>"
         ]
+    },
+    {
+        "version": 1,
+        "context_length": 32768,
+        "model_name": "QwQ-32B-Preview",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "chat"
+        ],
+        "model_description": "QwQ-32B-Preview is an experimental research model developed by the Qwen Team, focused on advancing AI reasoning capabilities.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 32,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "Qwen/QwQ-32B-Preview",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "mlx",
+                "model_size_in_billions": 32,
+                "quantizations": [
+                    "4-bit"
+                ],
+                "model_id": "okwinds/QwQ-32B-Preview-MLX-4bit",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "mlx",
+                "model_size_in_billions": 32,
+                "quantizations": [
+                    "8-bit"
+                ],
+                "model_id": "okwinds/QwQ-32B-Preview-MLX-8bit",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": 32,
+                "quantizations": [
+                    "Q3_K_L",
+                    "Q4_K_M",
+                    "Q6_K",
+                    "Q8_0"
+                ],
+                "model_id": "AI-ModelScope/QwQ-32B-Preview-GGUF",
+                "model_file_name_template": "QwQ-32B-Preview-{quantization}.gguf"
+            }
+        ],
+        "chat_template": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- messages[0]['content'] }}\n    {%- else %}\n        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n    {%- endif %}\n    {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n    {%- else %}\n        {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {{- '<|im_start|>' + message.role }}\n        {%- if message.content %}\n            {{- '\\n' + message.content }}\n        {%- endif %}\n        {%- for tool_call in message.tool_calls %}\n            {%- if tool_call.function is defined %}\n                {%- set tool_call = tool_call.function %}\n            {%- endif %}\n            {{- '\\n<tool_call>\\n{\"name\": \"' }}\n            {{- tool_call.name }}\n            {{- '\", \"arguments\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- '}\\n</tool_call>' }}\n        {%- endfor %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- message.content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+        "stop_token_ids": [
+            151643,
+            151644,
+            151645
+        ],
+        "stop": [
+            "<|endoftext|>",
+            "<|im_start|>",
+            "<|im_end|>"
+        ]
+    },
+    {
+        "version": 1,
+        "context_length": 8192,
+        "model_name": "glm-edge-chat",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "chat"
+        ],
+        "model_description": "The GLM-Edge series is our attempt to face the end-side real-life scenarios, which consists of two sizes of large-language dialogue models and multimodal comprehension models (GLM-Edge-1.5B-Chat, GLM-Edge-4B-Chat, GLM-Edge-V-2B, GLM-Edge-V-5B). Among them, the 1.5B / 2B model is mainly for platforms such as mobile phones and cars, and the 4B / 5B model is mainly for platforms such as PCs.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": "1_5",
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "ZhipuAI/glm-edge-1.5b-chat",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": "4",
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "ZhipuAI/glm-edge-4b-chat",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": "1_5",
+                "quantizations": [
+                    "Q4_0",
+                    "Q4_1",
+                    "Q4_K",
+                    "Q4_K_M",
+                    "Q4_K_S",
+                    "Q5_0",
+                    "Q5_1",
+                    "Q5_K",
+                    "Q5_K_M",
+                    "Q5_K_S",
+                    "Q6_K",
+                    "Q8_0"
+                ],
+                "model_file_name_template": "ggml-model-{quantization}.gguf",
+                "model_hub": "modelscope",
+                "model_id": "ZhipuAI/glm-edge-1.5b-chat-gguf"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": "1_5",
+                "quantizations": [
+                    "F16"
+                ],
+                "model_file_name_template": "glm-edge-1.5B-chat-{quantization}.gguf",
+                "model_hub": "modelscope",
+                "model_id": "ZhipuAI/glm-edge-1.5b-chat-gguf"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": "4",
+                "quantizations": [
+                    "Q4_0",
+                    "Q4_1",
+                    "Q4_K",
+                    "Q4_K_M",
+                    "Q4_K_S",
+                    "Q5_0",
+                    "Q5_1",
+                    "Q5_K",
+                    "Q5_K_M",
+                    "Q5_K_S",
+                    "Q6_K",
+                    "Q8_0"
+                ],
+                "model_file_name_template": "ggml-model-{quantization}.gguf",
+                "model_hub": "modelscope",
+                "model_id": "ZhipuAI/glm-edge-4b-chat-gguf"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": "4",
+                "quantizations": [
+                    "F16"
+                ],
+                "model_file_name_template": "glm-edge-4B-chat-{quantization}.gguf",
+                "model_hub": "modelscope",
+                "model_id": "ZhipuAI/glm-edge-4b-chat-gguf"
+            }
+        ],
+        "chat_template": "{% for item in messages %}{% if item['role'] == 'system' %}<|system|>\n{{ item['content'] }}{% elif item['role'] == 'user' %}<|user|>\n{{ item['content'] }}{% elif item['role'] == 'assistant' %}<|assistant|>\n{{ item['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>\n{% endif %}",
+        "stop_token_ids": [
+            59246,
+            59253,
+            59255
+        ],
+        "stop": [
+            "<|endoftext|>",
+            "<|user|>",
+            "<|observation|>"
+        ]
+    },
+    {
+        "version": 1,
+        "context_length": 8192,
+        "model_name": "glm-edge-v",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "chat",
+            "vision"
+        ],
+        "model_description": "The GLM-Edge series is our attempt to face the end-side real-life scenarios, which consists of two sizes of large-language dialogue models and multimodal comprehension models (GLM-Edge-1.5B-Chat, GLM-Edge-4B-Chat, GLM-Edge-V-2B, GLM-Edge-V-5B). Among them, the 1.5B / 2B model is mainly for platforms such as mobile phones and cars, and the 4B / 5B model is mainly for platforms such as PCs.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": "2",
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "ZhipuAI/glm-edge-v-2b",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": "5",
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "ZhipuAI/glm-edge-v-5b",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": "2",
+                "quantizations": [
+                    "Q4_0",
+                    "Q4_1",
+                    "Q4_K",
+                    "Q4_K_M",
+                    "Q4_K_S",
+                    "Q5_0",
+                    "Q5_1",
+                    "Q5_K",
+                    "Q5_K_M",
+                    "Q5_K_S",
+                    "Q6_K",
+                    "Q8_0"
+                ],
+                "model_file_name_template": "ggml-model-{quantization}.gguf",
+                "model_hub": "modelscope",
+                "model_id": "ZhipuAI/glm-edge-v-2b-gguf"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": "2",
+                "quantizations": [
+                    "F16"
+                ],
+                "model_file_name_template": "glm-edge-v-2B-{quantization}.gguf",
+                "model_hub": "modelscope",
+                "model_id": "ZhipuAI/glm-edge-v-2b-gguf"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": "2",
+                "quantizations": [
+                    "f16"
+                ],
+                "model_file_name_template": "mmproj-model-{quantization}.gguf",
+                "model_hub": "modelscope",
+                "model_id": "ZhipuAI/glm-edge-v-2b-gguf"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": "5",
+                "quantizations": [
+                    "Q4_0",
+                    "Q4_1",
+                    "Q4_K",
+                    "Q4_K_M",
+                    "Q4_K_S",
+                    "Q5_0",
+                    "Q5_1",
+                    "Q5_K",
+                    "Q5_K_M",
+                    "Q5_K_S",
+                    "Q6_K",
+                    "Q8_0"
+                ],
+                "model_file_name_template": "ggml-model-{quantization}.gguf",
+                "model_hub": "modelscope",
+                "model_id": "ZhipuAI/glm-edge-v-5b-gguf"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": "5",
+                "quantizations": [
+                    "F16"
+                ],
+                "model_file_name_template": "glm-edge-v-5B-{quantization}.gguf",
+                "model_hub": "modelscope",
+                "model_id": "ZhipuAI/glm-edge-v-5b-gguf"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": "5",
+                "quantizations": [
+                    "f16"
+                ],
+                "model_file_name_template": "mmproj-model-{quantization}.gguf",
+                "model_hub": "modelscope",
+                "model_id": "ZhipuAI/glm-edge-v-5b-gguf"
+            }
+        ],
+        "chat_template": "{% for item in messages %}{% if item['role'] != 'system' %}<|{{ item['role'] }}|>\n{% for content in item['content'] %}{% if content['type'] == 'image' %}{% for _ in range(578) %}<|begin_of_image|>{% endfor %}{% elif content['type'] == 'text' %}{{ content['text'] }}{% endif %}{% endfor %}\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>\n{% endif %}",
+        "stop_token_ids": [
+            59246,
+            59253,
+            59255
+        ],
+        "stop": [
+            "<|endoftext|>",
+            "<|user|>",
+            "<|observation|>"
+        ]
     }
 ]
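
Note on the specs above (not part of the diff): GPTQ and AWQ entries keep a {quantization} placeholder inside "model_id", and GGUF entries add a "model_file_name_template", so a single spec covers several quantized repos or files. The following is a minimal, illustrative Python sketch of how such a spec could be expanded into a concrete ModelScope repo id and file name; the resolve_spec helper is hypothetical and is not xinference's actual loader code.

# Illustrative only: expand a spec dict like the ones added above into a
# concrete repo id and (for GGUF) a file name. Field names mirror the JSON;
# the helper itself is an assumption, not part of xinference.
def resolve_spec(spec: dict, quantization: str) -> dict:
    resolved = {
        # "qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-{quantization}" -> "...-GPTQ-Int4"
        "repo_id": spec["model_id"].format(quantization=quantization),
        "revision": spec.get("model_revision", "master"),
        "hub": spec.get("model_hub", "huggingface"),
    }
    # GGUF specs additionally carry a per-quantization file name template.
    if "model_file_name_template" in spec:
        resolved["file_name"] = spec["model_file_name_template"].format(
            quantization=quantization
        )
    return resolved


gptq_spec = {
    "model_format": "gptq",
    "model_size_in_billions": 32,
    "quantizations": ["Int4", "Int8"],
    "model_id": "qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-{quantization}",
    "model_revision": "master",
    "model_hub": "modelscope",
}
print(resolve_spec(gptq_spec, "Int4"))
# {'repo_id': 'qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', 'revision': 'master', 'hub': 'modelscope'}
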
@@ -17,7 +17,8 @@ import platform
 import sys
 import time
 import uuid
-from typing import Dict, Iterator, List, Optional, TypedDict, Union
+from dataclasses import dataclass, field
+from typing import Any, Dict, Iterator, List, Optional, Tuple, TypedDict, Union
 
 from ....fields import max_tokens_field
 from ....types import (
@@ -53,6 +54,14 @@ class MLXGenerateConfig(TypedDict, total=False):
     stream: bool
     stream_options: Optional[Union[dict, None]]
     tools: Optional[List[Dict]]
+    lora_name: Optional[str]
+
+
+@dataclass
+class PromptCache:
+    cache: List[Any] = field(default_factory=list)
+    model_key: Tuple[str, Optional[str]] = ("", None)
+    tokens: List[int] = field(default_factory=list)
 
 
 class MLXModel(LLM):
@@ -69,6 +78,8 @@ class MLXModel(LLM):
         super().__init__(model_uid, model_family, model_spec, quantization, model_path)
         self._use_fast_tokenizer = True
         self._model_config: MLXModelConfig = self._sanitize_model_config(model_config)
+        self._max_kv_size = None
+        self._prompt_cache = None
         if peft_model is not None:
             raise ValueError("MLX engine has not supported lora yet")
 
@@ -127,6 +138,9 @@
             logger.debug(f"Setting cache limit to {cache_limit_gb} GB")
             mx.metal.set_cache_limit(cache_limit_gb * 1024 * 1024 * 1024)
 
+        self._max_kv_size = kwargs.get("max_kv_size", None)
+        self._prompt_cache = PromptCache()
+
         return load(
             self.model_path,
             tokenizer_config=tokenizer_config,
@@ -156,6 +170,27 @@
             return False
         return True
 
+    def _get_prompt_cache(self, prompt, lora_name: Optional[str] = None):
+        from mlx_lm.models.cache import make_prompt_cache
+
+        assert self._prompt_cache is not None
+        cache_len = len(self._prompt_cache.tokens)
+        model_key = (self.model_path, lora_name)
+        if (
+            self._prompt_cache.model_key != model_key
+            or cache_len >= len(prompt)
+            or self._prompt_cache.tokens != prompt[:cache_len]
+        ):
+            self._prompt_cache.model_key = model_key
+            self._prompt_cache.cache = make_prompt_cache(self._model, self._max_kv_size)
+            self._prompt_cache.tokens = []
+            logger.debug("Making new prompt cache for %s", self.model_uid)
+        else:
+            prompt = prompt[cache_len:]
+            logger.debug("Cache hit for %s", self.model_uid)
+        self._prompt_cache.tokens.extend(prompt)
+        return prompt
+
     def _generate_stream(self, prompt: str, kwargs: MLXGenerateConfig):
         import mlx.core as mx
         from mlx_lm.utils import generate_step
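
The new _get_prompt_cache method in xinference/model/llm/mlx/core.py reuses the previous request's KV cache when the new prompt starts with the already-cached tokens (and the model/LoRA pair is unchanged), so only the un-cached suffix is fed to generate_step; otherwise it rebuilds the cache via make_prompt_cache. Below is a dependency-free sketch of that prefix-reuse rule, with plain Python lists standing in for the MLX KV cache; the names here are illustrative, not xinference code.

# Standalone sketch of the prefix-reuse rule used by _get_prompt_cache above.
# Plain lists stand in for mlx_lm's KV cache objects.
from dataclasses import dataclass, field
from typing import List, Optional, Tuple


@dataclass
class _Cache:
    model_key: Tuple[str, Optional[str]] = ("", None)
    tokens: List[int] = field(default_factory=list)


def reuse_prefix(cache: _Cache, prompt: List[int], model_key) -> List[int]:
    cached = len(cache.tokens)
    if (
        cache.model_key != model_key        # different model or LoRA adapter
        or cached >= len(prompt)            # prompt not longer than the cache
        or cache.tokens != prompt[:cached]  # cached tokens are not a prefix
    ):
        cache.model_key = model_key
        cache.tokens = []                   # rebuild: feed the whole prompt
    else:
        prompt = prompt[cached:]            # cache hit: feed only the new suffix
    cache.tokens.extend(prompt)
    return prompt


c = _Cache()
print(reuse_prefix(c, [1, 2, 3, 4], ("m", None)))        # [1, 2, 3, 4] (cold cache)
print(reuse_prefix(c, [1, 2, 3, 4, 5, 6], ("m", None)))  # [5, 6] (prefix reused)

In the real code the generated tokens are also appended to the cache after decoding (see the hunk at line 270 below), so a follow-up turn that extends the same conversation hits the cache again.
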
@@ -167,6 +202,7 @@
         chunk_id = str(uuid.uuid4())
         stop_token_ids = kwargs.get("stop_token_ids", [])
         stream = kwargs.get("stream", False)
+        lora_name = kwargs.get("lora_name")
         stream_options = kwargs.pop("stream_options", None)
         include_usage = (
             stream_options["include_usage"]
@@ -174,12 +210,15 @@
             else False
         )
 
-        prompt_tokens = mx.array(tokenizer.encode(prompt))
+        prompt_token_ids = tokenizer.encode(prompt)
+        prompt_token_ids = self._get_prompt_cache(prompt_token_ids, lora_name)
+        prompt_tokens = mx.array(prompt_token_ids)
         input_echo_len = len(prompt_tokens)
 
         i = 0
         start = time.time()
         output = ""
+        tokens = []
         for (token, _), i in zip(
             generate_step(
                 prompt_tokens,
@@ -188,10 +227,11 @@
                 repetition_penalty=kwargs["repetition_penalty"],
                 repetition_context_size=kwargs["repetition_context_size"],
                 top_p=kwargs["top_p"],
-                logit_bias=kwargs["logit_bias"],
+                prompt_cache=self._prompt_cache.cache,  # type: ignore
             ),
             range(max_tokens),
         ):
+            tokens.append(token)
             if token == tokenizer.eos_token_id or token in stop_token_ids:  # type: ignore
                 break
 
@@ -230,6 +270,8 @@
             f"Average generation speed: {i / (time.time() - start):.2f} tokens/s."
         )
 
+        self._prompt_cache.tokens.extend(tokens)  # type: ignore
+
         if i == max_tokens - 1:
             finish_reason = "length"
         else:
@@ -89,6 +89,7 @@ SGLANG_SUPPORTED_CHAT_MODELS = [
     "deepseek-v2-chat-0628",
     "qwen2.5-instruct",
     "qwen2.5-coder-instruct",
+    "QwQ-32B-Preview",
 ]
 
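
With QwQ-32B-Preview registered both in the ModelScope model family above and in SGLANG_SUPPORTED_CHAT_MODELS (xinference/model/llm/sglang/core.py), it can be launched like any other built-in chat model. The following is a hedged sketch against a running xinference endpoint; the endpoint URL and the exact launch_model / chat keyword arguments (notably model_engine) are assumptions and should be checked against this release's client.

# Hedged sketch: launch and query the newly registered QwQ-32B-Preview model.
# Requires a running xinference server; keyword arguments are assumptions.
from xinference.client import Client

client = Client("http://127.0.0.1:9997")
model_uid = client.launch_model(
    model_name="QwQ-32B-Preview",
    model_engine="sglang",
    model_format="pytorch",
    model_size_in_billions=32,
    quantization="none",
)
model = client.get_model(model_uid)
print(model.chat(messages=[{"role": "user", "content": "How many r's are in 'strawberry'?"}]))
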