xinference 1.4.0__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xinference/_compat.py +1 -0
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +4 -0
- xinference/core/model.py +23 -3
- xinference/core/supervisor.py +6 -0
- xinference/core/worker.py +54 -11
- xinference/model/llm/__init__.py +4 -2
- xinference/model/llm/core.py +1 -0
- xinference/model/llm/llama_cpp/core.py +6 -1
- xinference/model/llm/llm_family.json +117 -1
- xinference/model/llm/llm_family_modelscope.json +125 -1
- xinference/model/llm/reasoning_parser.py +3 -3
- xinference/model/llm/sglang/core.py +111 -13
- xinference/model/llm/transformers/core.py +1 -0
- xinference/model/llm/transformers/deepseek_vl.py +1 -1
- xinference/model/llm/transformers/deepseek_vl2.py +287 -0
- xinference/model/llm/utils.py +26 -14
- xinference/model/llm/vllm/core.py +149 -8
- xinference/model/llm/vllm/distributed_executor.py +314 -0
- xinference/model/rerank/core.py +16 -11
- xinference/thirdparty/deepseek_vl2/__init__.py +31 -0
- xinference/thirdparty/deepseek_vl2/models/__init__.py +26 -0
- xinference/thirdparty/deepseek_vl2/models/configuration_deepseek.py +210 -0
- xinference/thirdparty/deepseek_vl2/models/conversation.py +310 -0
- xinference/thirdparty/deepseek_vl2/models/modeling_deepseek.py +1975 -0
- xinference/thirdparty/deepseek_vl2/models/modeling_deepseek_vl_v2.py +697 -0
- xinference/thirdparty/deepseek_vl2/models/processing_deepseek_vl_v2.py +675 -0
- xinference/thirdparty/deepseek_vl2/models/siglip_vit.py +661 -0
- xinference/thirdparty/deepseek_vl2/serve/__init__.py +0 -0
- xinference/thirdparty/deepseek_vl2/serve/app_modules/__init__.py +0 -0
- xinference/thirdparty/deepseek_vl2/serve/app_modules/gradio_utils.py +83 -0
- xinference/thirdparty/deepseek_vl2/serve/app_modules/overwrites.py +81 -0
- xinference/thirdparty/deepseek_vl2/serve/app_modules/presets.py +115 -0
- xinference/thirdparty/deepseek_vl2/serve/app_modules/utils.py +333 -0
- xinference/thirdparty/deepseek_vl2/serve/assets/Kelpy-Codos.js +100 -0
- xinference/thirdparty/deepseek_vl2/serve/assets/avatar.png +0 -0
- xinference/thirdparty/deepseek_vl2/serve/assets/custom.css +355 -0
- xinference/thirdparty/deepseek_vl2/serve/assets/custom.js +22 -0
- xinference/thirdparty/deepseek_vl2/serve/assets/favicon.ico +0 -0
- xinference/thirdparty/deepseek_vl2/serve/assets/simsun.ttc +0 -0
- xinference/thirdparty/deepseek_vl2/serve/inference.py +197 -0
- xinference/thirdparty/deepseek_vl2/utils/__init__.py +18 -0
- xinference/thirdparty/deepseek_vl2/utils/io.py +80 -0
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/{main.3cea968e.js → main.5ca4eea1.js} +3 -3
- xinference/web/ui/build/static/js/main.5ca4eea1.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0f0967acaec5df1d45b80010949c258d64297ebbb0f44b8bb3afcbd45c6f0ec4.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/68249645124f37d01eef83b1d897e751f895bea919b6fb466f907c1f87cebc84.json +1 -0
- {xinference-1.4.0.dist-info → xinference-1.4.1.dist-info}/METADATA +4 -4
- {xinference-1.4.0.dist-info → xinference-1.4.1.dist-info}/RECORD +56 -31
- xinference/web/ui/build/static/js/main.3cea968e.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/7f59e45e3f268ab8a4788b6fb024cf8dab088736dff22f5a3a39c122a83ab930.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/dcd60488509450bfff37bfff56de2c096d51de17dd00ec60d4db49c8b483ada1.json +0 -1
- /xinference/web/ui/build/static/js/{main.3cea968e.js.LICENSE.txt → main.5ca4eea1.js.LICENSE.txt} +0 -0
- {xinference-1.4.0.dist-info → xinference-1.4.1.dist-info}/LICENSE +0 -0
- {xinference-1.4.0.dist-info → xinference-1.4.1.dist-info}/WHEEL +0 -0
- {xinference-1.4.0.dist-info → xinference-1.4.1.dist-info}/entry_points.txt +0 -0
- {xinference-1.4.0.dist-info → xinference-1.4.1.dist-info}/top_level.txt +0 -0
xinference/_compat.py
CHANGED
@@ -102,6 +102,7 @@ class CreateChatCompletionOpenAI(BaseModel):
     frequency_penalty: Optional[float]
     logit_bias: Optional[Dict[str, int]]
     logprobs: Optional[bool]
+    max_completion_tokens: Optional[int]
     max_tokens: Optional[int]
     n: Optional[int]
     parallel_tool_calls: Optional[bool]
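In practice the new field only has to exist on the pydantic request model for validation to accept it. A minimal stand-in sketch, assuming pydantic v2; this toy model is not the real CreateChatCompletionOpenAI, which declares many more fields:

# Toy stand-in for the request schema; only the token-limit fields are shown.
from typing import Optional

from pydantic import BaseModel


class CreateChatCompletionOpenAI(BaseModel):
    max_completion_tokens: Optional[int] = None  # field added in this release
    max_tokens: Optional[int] = None


# A body using OpenAI's newer parameter name now validates instead of erroring.
print(CreateChatCompletionOpenAI(max_completion_tokens=16).max_completion_tokens)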
xinference/_version.py
CHANGED
@@ -8,11 +8,11 @@ import json
 
 version_json = '''
 {
-    "date": "2025-
+    "date": "2025-04-03T21:26:30+0800",
     "dirty": false,
     "error": null,
-    "full-revisionid": "
-    "version": "1.4.0"
+    "full-revisionid": "23260be3b917e7a2e8381927721ed3de815c0a99",
+    "version": "1.4.1"
 }
 ''' # END VERSION_JSON
 
xinference/api/restful_api.py
CHANGED
@@ -1952,6 +1952,7 @@ class RESTfulAPI(CancelMixin):
             "logit_bias",
             "logit_bias_type",
             "user",
+            "max_completion_tokens",
         }
 
         raw_kwargs = {k: v for k, v in raw_body.items() if k not in exclude}
@@ -1964,6 +1965,9 @@ class RESTfulAPI(CancelMixin):
         if body.max_tokens is None:
             kwargs["max_tokens"] = max_tokens_field.default
 
+        if body.max_completion_tokens is not None:
+            kwargs["max_tokens"] = body.max_completion_tokens
+
         if body.logit_bias is not None:
             raise HTTPException(status_code=501, detail="Not implemented")
 
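Together with the _compat.py change, the server now understands OpenAI's newer max_completion_tokens name and maps it onto the internal max_tokens kwarg. A hedged client-side sketch; the base_url and model uid are placeholders, and a running Xinference server is assumed:

# Requires a running Xinference server with an OpenAI-compatible endpoint.
from openai import OpenAI

client = OpenAI(base_url="http://127.0.0.1:9997/v1", api_key="not-needed")
resp = client.chat.completions.create(
    model="my-chat-model",  # hypothetical launched model uid
    messages=[{"role": "user", "content": "Say hi in five words."}],
    # Newer OpenAI SDKs send max_completion_tokens; the hunk above copies it
    # into kwargs["max_tokens"] on the server side.
    max_completion_tokens=16,
)
print(resp.choices[0].message.content)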
xinference/core/model.py
CHANGED
@@ -185,7 +185,7 @@ class ModelActor(xo.StatelessActor, CancelMixin):
         )
 
         if hasattr(self._model, "stop") and callable(self._model.stop):
-            self._model.stop()
+            await asyncio.to_thread(self._model.stop)
 
         if isinstance(self._model, LLMVLLMModel):
             if self._transfer_ref is not None:
@@ -284,6 +284,8 @@ class ModelActor(xo.StatelessActor, CancelMixin):
     async def __post_create__(self):
         self._loop = asyncio.get_running_loop()
 
+        logger.debug("Starting ModelActor at %s, uid: %s", self.address, self.uid)
+
         self._handle_pending_requests_task = asyncio.create_task(
             self._handle_pending_requests()
         )
@@ -463,7 +465,9 @@ class ModelActor(xo.StatelessActor, CancelMixin):
         while True:
             i += 1
             try:
-                self._model.load()
+                if hasattr(self._model, "set_loop"):
+                    self._model.set_loop(asyncio.get_running_loop())
+                await asyncio.to_thread(self._model.load)
                 if hasattr(self._model, "driver_info"):
                     self._driver_info = self._model.driver_info
                 break
@@ -490,7 +494,23 @@ class ModelActor(xo.StatelessActor, CancelMixin):
 
     async def wait_for_load(self):
         if hasattr(self._model, "wait_for_load"):
-            self._model.wait_for_load()
+            await asyncio.to_thread(self._model.wait_for_load)
+
+    def need_create_pools(self):
+        return getattr(self._model, "need_create_pools", False)
+
+    def set_pool_addresses(self, pool_addresses: List[str]):
+        if hasattr(self._model, "set_pool_addresses"):
+            self._model.set_pool_addresses(pool_addresses)
+
+    def get_pool_addresses(self) -> Optional[List[str]]:
+        if hasattr(self._model, "get_pool_addresses"):
+            return self._model.get_pool_addresses()
+        return None
+
+    def set_worker_addresses(self, shard: int, worker_addresses: List[str]):
+        if hasattr(self._model, "set_worker_addresses"):
+            self._model.set_worker_addresses(shard, worker_addresses)
 
     def model_uid(self):
         return (
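The recurring pattern in these hunks is moving blocking model calls (load, stop, wait_for_load) off the actor's event loop with asyncio.to_thread. A self-contained sketch of that pattern; the Model class here is a stand-in, not the real model interface:

import asyncio
import time


class Model:
    def load(self) -> str:
        time.sleep(2)  # stands in for slow, blocking model loading
        return "loaded"


async def main() -> None:
    model = Model()
    # A bare model.load() would block the event loop for the full two
    # seconds; to_thread runs it on a worker thread so the actor can keep
    # answering other messages meanwhile.
    print(await asyncio.to_thread(model.load))


asyncio.run(main())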
xinference/core/supervisor.py
CHANGED
@@ -1097,6 +1097,7 @@ class SupervisorActor(xo.StatelessActor):
                 xavier_config=xavier_config,
                 **kwargs,
             )
+            await worker_ref.wait_for_load(_replica_model_uid)
             self._replica_model_uid_to_worker[_replica_model_uid] = worker_ref
             return subpool_address
 
@@ -1242,6 +1243,11 @@ class SupervisorActor(xo.StatelessActor):
                 available_workers.append(worker_ip)
 
         async def _launch_model():
+            # Validation of n_worker, intercept if it is greater than the available workers.
+            if n_worker > len(available_workers):
+                raise ValueError(
+                    "n_worker cannot be larger than the number of available workers."
+                )
             try:
                 for _idx, rep_model_uid in enumerate(
                     iter_replica_model_uid(model_uid, replica)
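The first hunk changes launch ordering: the supervisor now awaits wait_for_load before recording the replica, so a model only becomes routable once it has actually finished loading. A toy sketch of that guarantee; wait_for_load here is a stand-in coroutine, not the real actor method:

import asyncio

registry: dict[str, str] = {}


async def wait_for_load(uid: str) -> None:
    await asyncio.sleep(0.1)  # stands in for the model finishing loading


async def launch(uid: str, worker: str) -> None:
    await wait_for_load(uid)  # block until the model is actually usable
    registry[uid] = worker    # only now does routing see the replica


asyncio.run(launch("my-model-0", "worker-a"))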
xinference/core/worker.py
CHANGED
@@ -874,7 +874,7 @@ class WorkerActor(xo.StatelessActor):
        subpool_address, devices = await self._create_subpool(
            model_uid, model_type, n_gpu=n_gpu, gpu_idx=gpu_idx
        )
-
+        all_subpool_addresses = [subpool_address]
        try:
            xavier_config: Optional[Dict] = kwargs.pop("xavier_config", None)
            if xavier_config is not None:
@@ -885,7 +885,7 @@
            # add a few kwargs
            model_kwargs.update(
                dict(
-                    address=
+                    address=subpool_address,
                    n_worker=n_worker,
                    shard=shard,
                    driver_info=driver_info,
@@ -923,11 +923,28 @@
                shard=shard,
                driver_info=driver_info,
            )
+            if await model_ref.need_create_pools() and (
+                len(devices) > 1 or n_worker > 1  # type: ignore
+            ):
+                coros = []
+                env_name = get_available_device_env_name() or "CUDA_VISIBLE_DEVICES"
+                env_value = ",".join(devices)
+                for device in devices:
+                    coros.append(
+                        self._main_pool.append_sub_pool(
+                            env={env_name: env_value},
+                            start_method=self._get_start_method(),
+                        )
+                    )
+                pool_addresses = await asyncio.gather(*coros)
+                all_subpool_addresses.extend(pool_addresses)
+                await model_ref.set_pool_addresses(pool_addresses)
            await model_ref.load()
        except:
            logger.error(f"Failed to load model {model_uid}", exc_info=True)
            self.release_devices(model_uid=model_uid)
-
+            for addr in all_subpool_addresses:
+                await self._main_pool.remove_sub_pool(addr)
            raise
        self._model_uid_to_model[model_uid] = model_ref
        self._model_uid_to_model_spec[model_uid] = model_description
@@ -994,15 +1011,36 @@
        if model_ref is None:
            logger.debug("Model not found, uid: %s", model_uid)
 
+        pool_addresses = None
+        if model_ref is not None:
+            try:
+                # pool addresses if model.need_create_pools()
+                pool_addresses = await model_ref.get_pool_addresses()
+            except Exception as e:
+                # process may disappear, we just ignore it.
+                logger.debug("Fail to get pool addresses, error: %s", e)
+
        try:
-
+            logger.debug("Start to destroy model actor: %s", model_ref)
+            coro = xo.destroy_actor(model_ref)
+            await asyncio.wait_for(coro, timeout=5)
        except Exception as e:
            logger.debug(
                "Destroy model actor failed, model uid: %s, error: %s", model_uid, e
            )
        try:
+            to_remove_addresses = []
            subpool_address = self._model_uid_to_addr[model_uid]
-
+            to_remove_addresses.append(subpool_address)
+            if pool_addresses:
+                to_remove_addresses.extend(pool_addresses)
+            logger.debug("Remove sub pools: %s", to_remove_addresses)
+            coros = []
+            for to_remove_addr in to_remove_addresses:
+                coros.append(
+                    self._main_pool.remove_sub_pool(to_remove_addr, force=True)
+                )
+            await asyncio.gather(*coros)
        except Exception as e:
            logger.debug(
                "Remove sub pool failed, model uid: %s, error: %s", model_uid, e
@@ -1204,18 +1242,23 @@
            model_ref = self._model_uid_to_model[rep_model_uid]
            await model_ref.start_transfer_for_vllm(rank_addresses)
 
-    @log_async(logger=logger, level=logging.INFO)
-    async def launch_rank0_model(
-        self, rep_model_uid: str, xavier_config: Dict[str, Any]
-    ) -> Tuple[str, int]:
-        from ..model.llm.vllm.xavier.collective_manager import Rank0ModelActor
-
+    @staticmethod
+    def _get_start_method():
        if os.name != "nt" and platform.system() != "Darwin":
            # Linux
            start_method = "forkserver"
        else:
            # Windows and macOS
            start_method = "spawn"
+        return start_method
+
+    @log_async(logger=logger, level=logging.INFO)
+    async def launch_rank0_model(
+        self, rep_model_uid: str, xavier_config: Dict[str, Any]
+    ) -> Tuple[str, int]:
+        from ..model.llm.vllm.xavier.collective_manager import Rank0ModelActor
+
+        start_method = self._get_start_method()
        subpool_address = await self._main_pool.append_sub_pool(
            start_method=start_method
        )
xinference/model/llm/__init__.py
CHANGED
@@ -132,7 +132,7 @@ def _install():
     from .llama_cpp.core import LlamaCppChatModel, LlamaCppModel, XllamaCppModel
     from .lmdeploy.core import LMDeployChatModel, LMDeployModel
     from .mlx.core import MLXChatModel, MLXModel, MLXVisionModel
-    from .sglang.core import SGLANGChatModel, SGLANGModel
+    from .sglang.core import SGLANGChatModel, SGLANGModel, SGLANGVisionModel
     from .transformers.chatglm import ChatglmPytorchChatModel
     from .transformers.cogagent import CogAgentChatModel
     from .transformers.cogvlm2 import CogVLM2Model
@@ -143,6 +143,7 @@ def _install():
         DeepSeekV2PytorchModel,
     )
     from .transformers.deepseek_vl import DeepSeekVLChatModel
+    from .transformers.deepseek_vl2 import DeepSeekVL2ChatModel
     from .transformers.gemma3 import Gemma3ChatModel, Gemma3TextChatModel
     from .transformers.glm4v import Glm4VModel
     from .transformers.glm_edge_v import GlmEdgeVModel
@@ -173,7 +174,7 @@ def _install():
             XllamaCppModel,
         ]
     )
-    SGLANG_CLASSES.extend([SGLANGModel, SGLANGChatModel])
+    SGLANG_CLASSES.extend([SGLANGModel, SGLANGChatModel, SGLANGVisionModel])
     VLLM_CLASSES.extend([VLLMModel, VLLMChatModel, VLLMVisionModel])
     MLX_CLASSES.extend([MLXModel, MLXChatModel, MLXVisionModel])
     LMDEPLOY_CLASSES.extend([LMDeployModel, LMDeployChatModel])
@@ -187,6 +188,7 @@ def _install():
            Qwen2AudioChatModel,
            YiVLChatModel,
            DeepSeekVLChatModel,
+            DeepSeekVL2ChatModel,
            InternVLChatModel,
            PytorchModel,
            CogVLM2Model,
xinference/model/llm/llama_cpp/core.py
CHANGED
@@ -302,7 +302,12 @@ class XllamaCppModel(LLM, ChatModelMixin):
             while (r := q.get()) is not _Done:
                 if type(r) is _Error:
                     raise Exception("Got error in chat stream: %s", r.msg)
-
+                # Get valid keys (O(1) lookup)
+                chunk_keys = ChatCompletionChunk.__annotations__
+                # The chunk may contain additional keys (e.g., system_fingerprint),
+                # which might not conform to OpenAI/DeepSeek formats.
+                # Filter out keys that are not part of ChatCompletionChunk.
+                yield {key: r[key] for key in chunk_keys if key in r}
 
         return self._to_chat_completion_chunks(
             _to_iterator(), self.reasoning_parser
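The filter leans on the fact that a TypedDict's __annotations__ is a dict of its declared keys, so membership tests are O(1). A standalone sketch; the raw chunk below is invented, not real llama.cpp output:

from typing import TypedDict


class ChatCompletionChunk(TypedDict):
    id: str
    model: str
    choices: list


raw = {"id": "c1", "model": "demo", "choices": [], "system_fingerprint": "fp"}
chunk_keys = ChatCompletionChunk.__annotations__  # declared keys as a dict
clean = {key: raw[key] for key in chunk_keys if key in raw}
print(clean)  # {'id': 'c1', 'model': 'demo', 'choices': []}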
xinference/model/llm/llm_family.json
CHANGED
@@ -7561,7 +7561,7 @@
       "model_id":"Qwen/Qwen2-VL-7B-Instruct",
       "model_revision":"6010982c1010c3b222fa98afc81575f124aa9bd6"
     },
-
+    {
       "model_format":"gptq",
       "model_size_in_billions":7,
       "quantizations":[
@@ -7672,6 +7672,14 @@
       ],
       "model_id":"Qwen/Qwen2.5-VL-7B-Instruct"
     },
+    {
+      "model_format":"pytorch",
+      "model_size_in_billions":32,
+      "quantizations":[
+        "none"
+      ],
+      "model_id":"Qwen/Qwen2.5-VL-32B-Instruct"
+    },
     {
       "model_format":"pytorch",
       "model_size_in_billions":72,
@@ -7696,6 +7704,14 @@
       ],
       "model_id":"Qwen/Qwen2.5-VL-7B-Instruct-AWQ"
     },
+    {
+      "model_format":"awq",
+      "model_size_in_billions":32,
+      "quantizations":[
+        "Int4"
+      ],
+      "model_id":"Qwen/Qwen2.5-VL-32B-Instruct-AWQ"
+    },
     {
       "model_format":"awq",
       "model_size_in_billions":72,
@@ -10758,5 +10774,105 @@
     "stop": [
       "<|im_end|>"
     ]
+  },
+  {
+    "version": 1,
+    "context_length": 131072,
+    "model_name": "fin-r1",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Fin-R1 is a large language model specifically designed for the field of financial reasoning",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "SUFE-AIFLM-Lab/Fin-R1"
+      },
+      {
+        "model_format":"gptq",
+        "model_size_in_billions":7,
+        "quantizations":[
+          "Int4",
+          "Int8"
+        ],
+        "model_id":"JunHowie/Fin-R1-GPTQ-{quantization}"
+      },
+      {
+        "model_format":"fp8",
+        "model_size_in_billions":7,
+        "quantizations":[
+          "FP8"
+        ],
+        "model_id":"JunHowie/Fin-R1-FP8-Dynamic"
+      }
+    ],
+    "chat_template": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- messages[0]['content'] }}\n    {%- else %}\n        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n    {%- endif %}\n    {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n    {%- else %}\n        {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {{- '<|im_start|>' + message.role }}\n        {%- if message.content %}\n            {{- '\\n' + message.content }}\n        {%- endif %}\n        {%- for tool_call in message.tool_calls %}\n            {%- if tool_call.function is defined %}\n                {%- set tool_call = tool_call.function %}\n            {%- endif %}\n            {{- '\\n<tool_call>\\n{\"name\": \"' }}\n            {{- tool_call.name }}\n            {{- '\", \"arguments\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- '}\\n</tool_call>' }}\n        {%- endfor %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- message.content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+    "stop_token_ids": [
+      151643,
+      151644,
+      151645
+    ],
+    "stop": [
+      "<|endoftext|>",
+      "<|im_start|>",
+      "<|im_end|>"
+    ]
+  },
+  {
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "deepseek-vl2",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "vision"
+    ],
+    "model_description": "DeepSeek-VL2, an advanced series of large Mixture-of-Experts (MoE) Vision-Language Models that significantly improves upon its predecessor, DeepSeek-VL. DeepSeek-VL2 demonstrates superior capabilities across various tasks, including but not limited to visual question answering, optical character recognition, document/table/chart understanding, and visual grounding.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 27,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "deepseek-ai/deepseek-vl2"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 16,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "deepseek-ai/deepseek-vl2-small"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 3,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "deepseek-ai/deepseek-vl2-tiny"
+      }
+    ],
+    "chat_template": "",
+    "stop_token_ids": [
+      1
+    ],
+    "stop": [
+      "<|end▁of▁sentence|>"
+    ]
   }
 ]
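Once registered, the new entries can be launched by name. A hedged sketch with the Python client; the endpoint is a placeholder and a running server with downloadable weights is assumed, while model_name and model_size_in_billions come from the spec above:

from xinference.client import Client

client = Client("http://127.0.0.1:9997")  # hypothetical endpoint
uid = client.launch_model(
    model_name="deepseek-vl2",
    model_format="pytorch",
    model_size_in_billions=3,  # the deepseek-vl2-tiny spec
)
model = client.get_model(uid)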
xinference/model/llm/llm_family_modelscope.json
CHANGED
@@ -5399,6 +5399,15 @@
       "model_hub": "modelscope",
       "model_id":"Qwen/Qwen2.5-VL-7B-Instruct"
     },
+    {
+      "model_format":"pytorch",
+      "model_size_in_billions":32,
+      "quantizations":[
+        "none"
+      ],
+      "model_hub": "modelscope",
+      "model_id":"Qwen/Qwen2.5-VL-32B-Instruct"
+    },
     {
       "model_format":"pytorch",
       "model_size_in_billions":72,
@@ -5423,9 +5432,18 @@
       "quantizations":[
         "Int4"
       ],
-      "model_hub": "
+      "model_hub": "modelscope",
       "model_id":"Qwen/Qwen2.5-VL-7B-Instruct-AWQ"
     },
+    {
+      "model_format":"awq",
+      "model_size_in_billions":32,
+      "quantizations":[
+        "Int4"
+      ],
+      "model_hub": "modelscope",
+      "model_id":"Qwen/Qwen2.5-VL-32B-Instruct-AWQ"
+    },
     {
       "model_format":"pytorch",
       "model_size_in_billions":72,
@@ -8420,5 +8438,111 @@
     "stop": [
       "<|im_end|>"
     ]
+  },
+  {
+    "version": 1,
+    "context_length": 131072,
+    "model_name": "fin-r1",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Fin-R1 is a large language model specifically designed for the field of financial reasoning",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "AI-ModelScope/Fin-R1",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "JunHowie/Fin-R1-GPTQ-{quantization}",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "fp8",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "FP8"
+        ],
+        "model_id": "JunHowie/Fin-R1-FP8-Dynamic",
+        "model_hub": "modelscope"
+      }
+    ],
+    "chat_template": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- messages[0]['content'] }}\n    {%- else %}\n        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n    {%- endif %}\n    {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n    {%- else %}\n        {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {{- '<|im_start|>' + message.role }}\n        {%- if message.content %}\n            {{- '\\n' + message.content }}\n        {%- endif %}\n        {%- for tool_call in message.tool_calls %}\n            {%- if tool_call.function is defined %}\n                {%- set tool_call = tool_call.function %}\n            {%- endif %}\n            {{- '\\n<tool_call>\\n{\"name\": \"' }}\n            {{- tool_call.name }}\n            {{- '\", \"arguments\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- '}\\n</tool_call>' }}\n        {%- endfor %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- message.content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+    "stop_token_ids": [
+      151643,
+      151644,
+      151645
+    ],
+    "stop": [
+      "<|endoftext|>",
+      "<|im_start|>",
+      "<|im_end|>"
+    ]
+  },
+  {
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "deepseek-vl2",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "vision"
+    ],
+    "model_description": "DeepSeek-VL2, an advanced series of large Mixture-of-Experts (MoE) Vision-Language Models that significantly improves upon its predecessor, DeepSeek-VL. DeepSeek-VL2 demonstrates superior capabilities across various tasks, including but not limited to visual question answering, optical character recognition, document/table/chart understanding, and visual grounding.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 27,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "deepseek-ai/deepseek-vl2",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 16,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "deepseek-ai/deepseek-vl2-small",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 3,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "deepseek-ai/deepseek-vl2-tiny",
+        "model_hub": "modelscope"
+      }
+    ],
+    "chat_template": "",
+    "stop_token_ids": [
+      1
+    ],
+    "stop": [
+      "<|end▁of▁sentence|>"
+    ]
   }
 ]
xinference/model/llm/reasoning_parser.py
CHANGED
@@ -43,7 +43,7 @@ class ReasoningParser:
            reasoning_content = delta_text[:end_idx]
            content = delta_text[end_idx + len(self.reasoning_end_tag) :]
            delta["reasoning_content"] = reasoning_content
-            if content
+            if content:
                delta["content"] = content
            else:
                delta["content"] = None
@@ -71,7 +71,7 @@ class ReasoningParser:
            ]
            content = delta_text[end_idx + len(self.reasoning_end_tag) :]
            delta["reasoning_content"] = reasoning_content
-            if content
+            if content:
                delta["content"] = content
            else:
                delta["content"] = None
@@ -93,7 +93,7 @@ class ReasoningParser:
            reasoning_content = delta_text[:end_idx]
            content = delta_text[end_idx + len(self.reasoning_end_tag) :]
            delta["reasoning_content"] = reasoning_content
-            if content
+            if content:
                delta["content"] = content
            else:
                delta["content"] = None
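The fix is a plain truthiness check: when nothing follows the closing reasoning tag, content should surface as None rather than an empty string. A standalone sketch of the split, with an illustrative tag:

reasoning_end_tag = "</think>"
delta_text = "hidden chain of thought</think>"

end_idx = delta_text.find(reasoning_end_tag)
delta = {"reasoning_content": delta_text[:end_idx]}
content = delta_text[end_idx + len(reasoning_end_tag):]
if content:  # empty string -> None, matching the fixed branch
    delta["content"] = content
else:
    delta["content"] = None
print(delta)  # {'reasoning_content': 'hidden chain of thought', 'content': None}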