xinference 1.7.1__py3-none-any.whl → 1.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/client/restful/async_restful_client.py +8 -13
- xinference/client/restful/restful_client.py +6 -2
- xinference/core/chat_interface.py +6 -4
- xinference/core/media_interface.py +5 -0
- xinference/core/model.py +1 -5
- xinference/core/supervisor.py +117 -68
- xinference/core/worker.py +49 -37
- xinference/deploy/test/test_cmdline.py +2 -6
- xinference/model/audio/__init__.py +26 -23
- xinference/model/audio/chattts.py +3 -2
- xinference/model/audio/core.py +49 -98
- xinference/model/audio/cosyvoice.py +3 -2
- xinference/model/audio/custom.py +28 -73
- xinference/model/audio/f5tts.py +3 -2
- xinference/model/audio/f5tts_mlx.py +3 -2
- xinference/model/audio/fish_speech.py +3 -2
- xinference/model/audio/funasr.py +17 -4
- xinference/model/audio/kokoro.py +3 -2
- xinference/model/audio/megatts.py +3 -2
- xinference/model/audio/melotts.py +3 -2
- xinference/model/audio/model_spec.json +572 -171
- xinference/model/audio/utils.py +0 -6
- xinference/model/audio/whisper.py +3 -2
- xinference/model/audio/whisper_mlx.py +3 -2
- xinference/model/cache_manager.py +141 -0
- xinference/model/core.py +6 -49
- xinference/model/custom.py +174 -0
- xinference/model/embedding/__init__.py +67 -56
- xinference/model/embedding/cache_manager.py +35 -0
- xinference/model/embedding/core.py +104 -84
- xinference/model/embedding/custom.py +55 -78
- xinference/model/embedding/embed_family.py +80 -31
- xinference/model/embedding/flag/core.py +21 -5
- xinference/model/embedding/llama_cpp/__init__.py +0 -0
- xinference/model/embedding/llama_cpp/core.py +234 -0
- xinference/model/embedding/model_spec.json +968 -103
- xinference/model/embedding/sentence_transformers/core.py +30 -20
- xinference/model/embedding/vllm/core.py +11 -5
- xinference/model/flexible/__init__.py +8 -2
- xinference/model/flexible/core.py +26 -119
- xinference/model/flexible/custom.py +69 -0
- xinference/model/flexible/launchers/image_process_launcher.py +1 -0
- xinference/model/flexible/launchers/modelscope_launcher.py +5 -1
- xinference/model/flexible/launchers/transformers_launcher.py +15 -3
- xinference/model/flexible/launchers/yolo_launcher.py +5 -1
- xinference/model/image/__init__.py +20 -20
- xinference/model/image/cache_manager.py +62 -0
- xinference/model/image/core.py +70 -182
- xinference/model/image/custom.py +28 -72
- xinference/model/image/model_spec.json +402 -119
- xinference/model/image/ocr/got_ocr2.py +3 -2
- xinference/model/image/stable_diffusion/core.py +22 -7
- xinference/model/image/stable_diffusion/mlx.py +6 -6
- xinference/model/image/utils.py +2 -2
- xinference/model/llm/__init__.py +71 -94
- xinference/model/llm/cache_manager.py +292 -0
- xinference/model/llm/core.py +37 -111
- xinference/model/llm/custom.py +88 -0
- xinference/model/llm/llama_cpp/core.py +5 -7
- xinference/model/llm/llm_family.json +16260 -8151
- xinference/model/llm/llm_family.py +138 -839
- xinference/model/llm/lmdeploy/core.py +5 -7
- xinference/model/llm/memory.py +3 -4
- xinference/model/llm/mlx/core.py +6 -8
- xinference/model/llm/reasoning_parser.py +3 -1
- xinference/model/llm/sglang/core.py +32 -14
- xinference/model/llm/transformers/chatglm.py +3 -7
- xinference/model/llm/transformers/core.py +49 -27
- xinference/model/llm/transformers/deepseek_v2.py +2 -2
- xinference/model/llm/transformers/gemma3.py +2 -2
- xinference/model/llm/transformers/multimodal/cogagent.py +2 -2
- xinference/model/llm/transformers/multimodal/deepseek_vl2.py +2 -2
- xinference/model/llm/transformers/multimodal/gemma3.py +2 -2
- xinference/model/llm/transformers/multimodal/glm4_1v.py +167 -0
- xinference/model/llm/transformers/multimodal/glm4v.py +2 -2
- xinference/model/llm/transformers/multimodal/intern_vl.py +2 -2
- xinference/model/llm/transformers/multimodal/minicpmv26.py +3 -3
- xinference/model/llm/transformers/multimodal/ovis2.py +2 -2
- xinference/model/llm/transformers/multimodal/qwen-omni.py +2 -2
- xinference/model/llm/transformers/multimodal/qwen2_audio.py +2 -2
- xinference/model/llm/transformers/multimodal/qwen2_vl.py +2 -2
- xinference/model/llm/transformers/opt.py +3 -7
- xinference/model/llm/utils.py +34 -49
- xinference/model/llm/vllm/core.py +77 -27
- xinference/model/llm/vllm/xavier/engine.py +5 -3
- xinference/model/llm/vllm/xavier/scheduler.py +10 -6
- xinference/model/llm/vllm/xavier/transfer.py +1 -1
- xinference/model/rerank/__init__.py +26 -25
- xinference/model/rerank/core.py +47 -87
- xinference/model/rerank/custom.py +25 -71
- xinference/model/rerank/model_spec.json +158 -33
- xinference/model/rerank/utils.py +2 -2
- xinference/model/utils.py +115 -54
- xinference/model/video/__init__.py +13 -17
- xinference/model/video/core.py +44 -102
- xinference/model/video/diffusers.py +4 -3
- xinference/model/video/model_spec.json +90 -21
- xinference/types.py +5 -3
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.7d24df53.js +3 -0
- xinference/web/ui/build/static/js/main.7d24df53.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2704ff66a5f73ca78b341eb3edec60154369df9d87fbc8c6dd60121abc5e1b0a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/607dfef23d33e6b594518c0c6434567639f24f356b877c80c60575184ec50ed0.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9be3d56173aacc3efd0b497bcb13c4f6365de30069176ee9403b40e717542326.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9f9dd6c32c78a222d07da5987ae902effe16bcf20aac00774acdccc4de3c9ff2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b2ab5ee972c60d15eb9abf5845705f8ab7e1d125d324d9a9b1bcae5d6fd7ffb2.json +1 -0
- xinference/web/ui/src/locales/en.json +0 -1
- xinference/web/ui/src/locales/ja.json +0 -1
- xinference/web/ui/src/locales/ko.json +0 -1
- xinference/web/ui/src/locales/zh.json +0 -1
- {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/METADATA +9 -11
- {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/RECORD +119 -119
- xinference/model/audio/model_spec_modelscope.json +0 -231
- xinference/model/embedding/model_spec_modelscope.json +0 -293
- xinference/model/embedding/utils.py +0 -18
- xinference/model/image/model_spec_modelscope.json +0 -375
- xinference/model/llm/llama_cpp/memory.py +0 -457
- xinference/model/llm/llm_family_csghub.json +0 -56
- xinference/model/llm/llm_family_modelscope.json +0 -8700
- xinference/model/llm/llm_family_openmind_hub.json +0 -1019
- xinference/model/rerank/model_spec_modelscope.json +0 -85
- xinference/model/video/model_spec_modelscope.json +0 -184
- xinference/web/ui/build/static/js/main.9b12b7f9.js +0 -3
- xinference/web/ui/build/static/js/main.9b12b7f9.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/1460361af6975e63576708039f1cb732faf9c672d97c494d4055fc6331460be0.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/4efd8dda58fda83ed9546bf2f587df67f8d98e639117bee2d9326a9a1d9bebb2.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/55b9fb40b57fa926e8f05f31c2f96467e76e5ad62f033dca97c03f9e8c4eb4fe.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5b2dafe5aa9e1105e0244a2b6751807342fa86aa0144b4e84d947a1686102715.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/611fa2c6c53b66039991d06dfb0473b5ab37fc63b4564e0f6e1718523768a045.json +0 -1
- /xinference/web/ui/build/static/js/{main.9b12b7f9.js.LICENSE.txt → main.7d24df53.js.LICENSE.txt} +0 -0
- {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/WHEEL +0 -0
- {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/transformers/multimodal/glm4_1v.py
ADDED

@@ -0,0 +1,167 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+from concurrent.futures import ThreadPoolExecutor
+from threading import Thread
+from typing import Any, Dict, Iterator, List, Tuple
+
+import torch
+
+from .....model.utils import select_device
+from ...llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
+from ...utils import _decode_image
+from ..core import register_non_default_model
+from .core import PytorchMultiModalModel
+
+logger = logging.getLogger(__name__)
+
+
+@register_transformer
+@register_non_default_model("glm-4.1v-thinking")
+class Glm4_1VModel(PytorchMultiModalModel):
+    @classmethod
+    def match_json(
+        cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
+    ) -> bool:
+        family = model_family.model_family or model_family.model_name
+        if "glm-4.1v" in family.lower():
+            return True
+        return False
+
+    def decide_device(self):
+        device = self._pytorch_model_config.get("device", "auto")
+        self._device = select_device(device)
+
+    def load_processor(self):
+        from transformers import AutoProcessor
+
+        self._processor = AutoProcessor.from_pretrained(self.model_path, use_fast=True)
+        self._tokenizer = self._processor.tokenizer
+
+    def load_multimodal_model(self):
+        from transformers import Glm4vForConditionalGeneration
+
+        kwargs = {"device_map": "auto"}
+        kwargs = self.apply_bnb_quantization(kwargs)
+
+        model = Glm4vForConditionalGeneration.from_pretrained(
+            self.model_path,
+            torch_dtype=torch.bfloat16,
+            **kwargs,
+        )
+        self._model = model.eval()
+        self._device = self._model.device
+
+    @staticmethod
+    def _get_processed_msgs(messages: List[Dict]) -> List[Dict]:
+        res = []
+        for message in messages:
+            role = message["role"]
+            content = message["content"]
+            if isinstance(content, str):
+                res.append({"role": role, "content": content})
+            else:
+                texts = []
+                image_urls = []
+                for c in content:
+                    c_type = c.get("type")
+                    if c_type == "text":
+                        texts.append(c["text"])
+                    else:
+                        assert (
+                            c_type == "image_url"
+                        ), "Please follow the image input of the OpenAI API."
+                        image_urls.append(c["image_url"]["url"])
+                if len(image_urls) > 1:
+                    raise RuntimeError("Only one image per message is supported")
+                image_futures = []
+                with ThreadPoolExecutor() as executor:
+                    for image_url in image_urls:
+                        fut = executor.submit(_decode_image, image_url)
+                        image_futures.append(fut)
+                images = [fut.result() for fut in image_futures]
+                assert len(images) <= 1
+                text = " ".join(texts)
+                if images:
+                    content = [
+                        {"type": "image", "image": images[0]},
+                        {"type": "text", "text": text},
+                    ]
+                    res.append({"role": role, "content": content})
+                else:
+                    res.append(
+                        {"role": role, "content": {"type": "text", "text": text}}
+                    )
+        return res
+
+    def build_inputs_from_messages(
+        self,
+        messages: List[Dict],
+        generate_config: Dict,
+    ):
+        msgs = self._get_processed_msgs(messages)
+        inputs = self._processor.apply_chat_template(
+            msgs,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_tensors="pt",
+            return_dict=True,
+        )  # chat mode
+        inputs = inputs.to(self._model.device)
+        return inputs
+
+    def get_stop_strs(self) -> List[str]:
+        return ["<|endoftext|>"]
+
+    def get_builtin_stop_token_ids(self) -> Tuple:
+        from transformers import AutoConfig
+
+        return tuple(AutoConfig.from_pretrained(self.model_path).eos_token_id)
+
+    def build_generate_kwargs(
+        self,
+        generate_config: Dict,
+    ) -> Dict[str, Any]:
+        return dict(
+            do_sample=True,
+            top_p=generate_config.get("top_p", 1e-5),
+            repetition_penalty=generate_config.get("repetition_penalty", 1.1),
+            top_k=generate_config.get("top_k", 2),
+            max_new_tokens=generate_config.get("max_tokens", 512),
+        )
+
+    def build_streaming_iter(
+        self,
+        messages: List[Dict],
+        generate_config: Dict,
+    ) -> Tuple[Iterator, int]:
+        from transformers import TextIteratorStreamer
+
+        generate_kwargs = self.build_generate_kwargs(generate_config)
+        inputs = self.build_inputs_from_messages(messages, generate_config)
+        streamer = TextIteratorStreamer(
+            tokenizer=self._tokenizer,
+            timeout=60,
+            skip_prompt=True,
+            skip_special_tokens=False,
+        )
+        kwargs = {
+            **inputs,
+            **generate_kwargs,
+            "streamer": streamer,
+        }
+        logger.debug("Generate with kwargs: %s", generate_kwargs)
+        t = Thread(target=self._model.generate, kwargs=kwargs)
+        t.start()
+        return streamer, len(inputs.input_ids[0])
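The new Glm4_1VModel registers glm-4.1v-thinking as a transformers multimodal model and expects OpenAI-style image_url message content. A rough client-side sketch for trying it against a locally running Xinference server; the endpoint, engine choice, image URL and generate_config values are illustrative assumptions, not part of this diff:

# Hypothetical usage sketch: launch the newly registered model and send one
# image plus a text prompt in the OpenAI-style format the model expects.
from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")  # assumed local endpoint
model_uid = client.launch_model(
    model_name="glm-4.1v-thinking",
    model_type="LLM",
    model_engine="transformers",
)
model = client.get_model(model_uid)
response = model.chat(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
                {"type": "text", "text": "Describe this image."},
            ],
        }
    ],
    generate_config={"max_tokens": 512},
)
print(response["choices"][0]["message"]["content"])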
xinference/model/llm/transformers/multimodal/glm4v.py
CHANGED

@@ -22,7 +22,7 @@ import torch
 from .....core.model import register_batching_multimodal_models
 from .....core.scheduler import InferenceRequest
 from .....model.utils import select_device
-from ...llm_family import
+from ...llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
 from ...utils import _decode_image
 from ..core import register_non_default_model
 from ..utils import get_max_src_len
@@ -37,7 +37,7 @@ logger = logging.getLogger(__name__)
 class Glm4VModel(PytorchMultiModalModel):
     @classmethod
     def match_json(
-        cls, model_family: "
+        cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         family = model_family.model_family or model_family.model_name
         if "glm-4v" in family.lower():

xinference/model/llm/transformers/multimodal/intern_vl.py
CHANGED

@@ -19,7 +19,7 @@ from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
 
 import torch
 
-from ...llm_family import
+from ...llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
 from ...utils import _decode_image, parse_messages
 from ..core import register_non_default_model
 from .core import PytorchMultiModalModel
@@ -35,7 +35,7 @@ class InternVLChatModel(PytorchMultiModalModel):
 
     @classmethod
     def match_json(
-        cls, model_family: "
+        cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         family = model_family.model_family or model_family.model_name
         if "internvl3" in family.lower():

xinference/model/llm/transformers/multimodal/minicpmv26.py
CHANGED

@@ -22,7 +22,7 @@ from .....core.model import register_batching_multimodal_models
 from .....core.scheduler import InferenceRequest
 from .....model.utils import select_device
 from .....types import PytorchModelConfig
-from ...llm_family import
+from ...llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
 from ...utils import _decode_image, parse_messages
 from ..core import register_non_default_model
 from .core import PytorchMultiModalModel
@@ -33,10 +33,10 @@ logger = logging.getLogger(__name__)
 @register_batching_multimodal_models("MiniCPM-V-2.6")
 @register_transformer
 @register_non_default_model("MiniCPM-V-2.6")
-class
+class MiniCPMV26Model(PytorchMultiModalModel):
     @classmethod
     def match_json(
-        cls, model_family: "
+        cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         family = model_family.model_family or model_family.model_name
         if "MiniCPM-V-2.6".lower() in family.lower():

xinference/model/llm/transformers/multimodal/ovis2.py
CHANGED

@@ -18,7 +18,7 @@ from typing import Any, Dict, Iterator, List, Tuple
 import torch
 from PIL import Image
 
-from ...llm_family import
+from ...llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
 from ..core import register_non_default_model
 from .core import PytorchMultiModalModel
 
@@ -35,7 +35,7 @@ class Ovis2ChatModel(PytorchMultiModalModel):
 
     @classmethod
     def match_json(
-        cls, model_family: "
+        cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
             return False

xinference/model/llm/transformers/multimodal/qwen-omni.py
CHANGED

@@ -27,7 +27,7 @@ from .....types import (
     ChatCompletionChoice,
     CompletionUsage,
 )
-from ...llm_family import
+from ...llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
 from ..core import PytorchGenerateConfig, register_non_default_model
 from .core import PytorchMultiModalModel
 
@@ -44,7 +44,7 @@ class Qwen2_5OmniChatModel(PytorchMultiModalModel):
 
     @classmethod
     def match_json(
-        cls, model_family: "
+        cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
             return False

xinference/model/llm/transformers/multimodal/qwen2_audio.py
CHANGED

@@ -20,7 +20,7 @@ from urllib.request import urlopen
 import numpy as np
 
 from .....model.utils import select_device
-from ...llm_family import
+from ...llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
 from ..core import register_non_default_model
 from .core import PytorchMultiModalModel
 
@@ -32,7 +32,7 @@ logger = logging.getLogger(__name__)
 class Qwen2AudioChatModel(PytorchMultiModalModel):
     @classmethod
     def match_json(
-        cls, model_family: "
+        cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         llm_family = model_family.model_family or model_family.model_name
         if "qwen2-audio".lower() in llm_family.lower():

xinference/model/llm/transformers/multimodal/qwen2_vl.py
CHANGED

@@ -20,7 +20,7 @@ from .....core.scheduler import InferenceRequest
 from .....device_utils import is_npu_available
 from .....model.utils import select_device
 from .....types import PytorchModelConfig
-from ...llm_family import
+from ...llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
 from ..core import register_non_default_model
 from .core import PytorchMultiModalModel
 
@@ -46,7 +46,7 @@ class Qwen2VLChatModel(PytorchMultiModalModel):
 
     @classmethod
     def match_json(
-        cls, model_family: "
+        cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
             return False

xinference/model/llm/transformers/opt.py
CHANGED

@@ -16,7 +16,7 @@ from typing import List, Optional
 
 from ....core.scheduler import InferenceRequest
 from ....types import LoRA
-from ..llm_family import
+from ..llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
 from .core import PytorchModel, PytorchModelConfig, register_non_default_model
 
 
@@ -26,9 +26,7 @@ class OptPytorchModel(PytorchModel):
     def __init__(
         self,
         model_uid: str,
-        model_family: "
-        model_spec: "LLMSpecV1",
-        quantization: str,
+        model_family: "LLMFamilyV2",
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
         peft_model: Optional[List[LoRA]] = None,
@@ -36,8 +34,6 @@ class OptPytorchModel(PytorchModel):
         super().__init__(
             model_uid,
             model_family,
-            model_spec,
-            quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
             peft_model=peft_model,
@@ -45,7 +41,7 @@ class OptPytorchModel(PytorchModel):
 
     @classmethod
     def match_json(
-        cls, llm_family: "
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if llm_spec.model_format != "pytorch":
             return False
xinference/model/llm/utils.py
CHANGED

@@ -16,7 +16,6 @@ import base64
 import functools
 import json
 import logging
-import os
 import re
 import time
 import typing
@@ -50,13 +49,7 @@ from ...types import (
     CompletionChunk,
     CompletionUsage,
 )
-from .
-    LlamaCppLLMSpecV1,
-    LLMFamilyV1,
-    LLMSpecV1,
-    _get_cache_dir,
-    get_cache_status,
-)
+from .core import chat_context_var
 from .reasoning_parser import ReasoningParser
 
 logger = logging.getLogger(__name__)
@@ -319,9 +312,7 @@ class ChatModelMixin:
        for i, choice in enumerate(choices):  # type: ignore
            delta = ChatCompletionChunkDelta()
            if "text" in choice and choice["finish_reason"] is None:
-                if
-                    delta["content"] = choice["text"]
-                else:
+                if reasoning_parser and reasoning_parser.check_content_parser():
                    assert previous_texts is not None
                    current_text = previous_texts[-1] + choice["text"]
                    delta = reasoning_parser.extract_reasoning_content_streaming(
@@ -330,6 +321,8 @@ class ChatModelMixin:
                        delta_text=choice["text"],
                    )
                    previous_texts[-1] = current_text
+                else:
+                    delta["content"] = choice["text"]
            elif "text" in choice and choice["finish_reason"] is not None:
                delta["content"] = choice["text"]
                if reasoning_parser and reasoning_parser.check_content_parser():
@@ -463,12 +456,19 @@ class ChatModelMixin:
        cls,
        chunks: AsyncGenerator[CompletionChunk, None],
        reasoning_parser: Optional[ReasoningParser] = None,
+        ctx: Optional[Dict[str, Any]] = None,
    ) -> AsyncGenerator[ChatCompletionChunk, None]:
+        def set_context():
+            if ctx:
+                chat_context_var.set(ctx)
+
        previous_texts = [""]
        # Process chunks
        if reasoning_parser:
+            set_context()
            chunks = reasoning_parser.prepare_reasoning_content_streaming(chunks)
        async for chunk in chunks:
+            set_context()
            choices = chunk.get("choices")
            if not choices:
                # usage
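The streaming converter now takes an optional ctx dict and re-applies it via chat_context_var both before the reasoning parser wraps the stream and on every chunk, so code that reads the context variable while chunks are processed sees the caller's per-request context. A small standalone sketch of the underlying contextvars behaviour this relies on; it is illustrative only, not xinference code:

# Standalone illustration: a ContextVar set by the iterating coroutine is
# visible inside the async generator when it resumes on the next chunk.
import asyncio
import contextvars
from typing import AsyncGenerator, Dict

chat_context_var: contextvars.ContextVar[Dict] = contextvars.ContextVar("chat_context")

async def produce() -> AsyncGenerator[Dict, None]:
    for i in range(3):
        # Each resumption runs in the consumer's context, so the value set
        # below is readable here.
        yield {"chunk": i, "ctx": chat_context_var.get({})}

async def consume() -> None:
    ctx = {"model_uid": "demo-model"}
    stream = produce()
    chat_context_var.set(ctx)      # set before consuming, like set_context()
    async for item in stream:
        chat_context_var.set(ctx)  # re-assert per chunk, as the diff does
        print(item)                # every chunk sees ctx

asyncio.run(consume())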
@@ -560,23 +560,33 @@ class ChatModelMixin:
        def split_into_blocks(text: str) -> list[str]:
            # Match blocks starting with <think> or <tool_call> and ending with </think> or </tool_call>
            pattern = r"(<(think|tool_call)>.*?</\2>)"
-
-
+            parts = []
+            last_end = 0
+            # Find all label blocks and record their positions
+            for m in re.finditer(pattern, text, re.DOTALL):
+                # Text before adding tags
+                if m.start() > last_end:
+                    parts.append(text[last_end : m.start()])
+                # Add label block
+                parts.append(m.group(0))
+                last_end = m.end()
+            # Text after adding the last tag
+            if last_end < len(text):
+                parts.append(text[last_end:])
+            return parts
 
        contents = split_into_blocks(text)
        results: List[Tuple] = []
        for content in contents:
-
-            if content:
+            if content.strip():
                pos1 = content.find(QWEN_TOOL_CALL_SYMBOLS[0])
                if pos1 != -1:
                    content = content[pos1 + len(QWEN_TOOL_CALL_SYMBOLS[0]) :]
                pos2 = content.find(QWEN_TOOL_CALL_SYMBOLS[1])
                if pos2 != -1:
                    content = content[:pos2]
-                content = content.strip()
                try:
-                    res = json.loads(content)
+                    res = json.loads(content, strict=False)
                    results.append((None, res["name"], res["arguments"]))
                except Exception as e:
                    logger.error(
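For reference, a standalone snippet (not the library code) showing how the new split_into_blocks logic carves a model response into plain text, <think> blocks and <tool_call> blocks while keeping the text around them; the sample text is made up:

# Illustration of the block-splitting behaviour introduced above.
import re

pattern = r"(<(think|tool_call)>.*?</\2>)"
text = (
    'Let me check.<think>reasoning</think>'
    '<tool_call>{"name": "get_weather", "arguments": {"city": "Paris"}}</tool_call> Done.'
)

parts, last_end = [], 0
for m in re.finditer(pattern, text, re.DOTALL):
    if m.start() > last_end:
        parts.append(text[last_end:m.start()])  # text before the tagged block
    parts.append(m.group(0))                    # the tagged block itself
    last_end = m.end()
if last_end < len(text):
    parts.append(text[last_end:])               # trailing text

print(parts)
# ['Let me check.', '<think>reasoning</think>',
#  '<tool_call>{"name": "get_weather", "arguments": {"city": "Paris"}}</tool_call>', ' Done.']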
@@ -724,7 +734,7 @@ class ChatModelMixin:
                failed_contents.append(content)
        finish_reason = "tool_calls" if tool_calls else "stop"
 
-        content = "
+        content = "".join(failed_contents) if failed_contents else None
 
        # fix: qwen tool_call content field return null
        family = model_family.model_family or model_family.model_name
@@ -802,7 +812,7 @@ class ChatModelMixin:
                failed_contents.append(content)
        finish_reason = "tool_calls" if tool_calls else "stop"
 
-        content = "
+        content = "".join(failed_contents) if failed_contents else None
 
        # fix: qwen tool_call content field return null
        family = model_family.model_family or model_family.model_name
@@ -880,38 +890,13 @@ class ChatModelMixin:
        return transformed_messages
 
 
-def get_file_location(
-    llm_family: LLMFamilyV1, spec: LLMSpecV1, quantization: str
-) -> Tuple[str, bool]:
-    cache_dir = _get_cache_dir(
-        llm_family, spec, quantization, create_if_not_exist=False
-    )
-    cache_status = get_cache_status(llm_family, spec, quantization)
-    if isinstance(cache_status, list):
-        is_cached = None
-        for q, cs in zip(spec.quantizations, cache_status):
-            if q == quantization:
-                is_cached = cs
-                break
-    else:
-        is_cached = cache_status
-    assert isinstance(is_cached, bool)
-
-    if spec.model_format in ["pytorch", "gptq", "awq", "fp8", "mlx"]:
-        return cache_dir, is_cached
-    elif spec.model_format in ["ggufv2"]:
-        assert isinstance(spec, LlamaCppLLMSpecV1)
-        filename = spec.model_file_name_template.format(quantization=quantization)
-        model_path = os.path.join(cache_dir, filename)
-        return model_path, is_cached
-    else:
-        raise ValueError(f"Not supported model format {spec.model_format}")
-
-
 def get_model_version(
-
+    model_name: str,
+    model_format: str,
+    model_size_in_billions: Union[str, int],
+    quantization: str,
 ) -> str:
-    return f"{
+    return f"{model_name}--{model_size_in_billions}B--{model_format}--{quantization}"
 
 
 def _decode_image(_url):