xinference 0.14.1.post1__py3-none-any.whl → 0.14.3__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Potentially problematic release: this version of xinference might be problematic; more details are available from the registry.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +15 -34
- xinference/client/restful/restful_client.py +2 -2
- xinference/core/chat_interface.py +45 -10
- xinference/core/image_interface.py +9 -0
- xinference/core/model.py +8 -5
- xinference/core/scheduler.py +1 -2
- xinference/core/worker.py +49 -42
- xinference/deploy/cmdline.py +2 -2
- xinference/deploy/test/test_cmdline.py +7 -7
- xinference/model/audio/chattts.py +24 -9
- xinference/model/audio/core.py +8 -2
- xinference/model/audio/fish_speech.py +228 -0
- xinference/model/audio/model_spec.json +8 -0
- xinference/model/embedding/core.py +23 -1
- xinference/model/image/model_spec.json +2 -1
- xinference/model/image/model_spec_modelscope.json +2 -1
- xinference/model/image/stable_diffusion/core.py +49 -1
- xinference/model/llm/__init__.py +26 -27
- xinference/model/llm/{ggml/llamacpp.py → llama_cpp/core.py} +2 -35
- xinference/model/llm/llm_family.json +606 -1266
- xinference/model/llm/llm_family.py +16 -139
- xinference/model/llm/llm_family_modelscope.json +276 -313
- xinference/model/llm/lmdeploy/__init__.py +0 -0
- xinference/model/llm/lmdeploy/core.py +557 -0
- xinference/model/llm/memory.py +9 -9
- xinference/model/llm/sglang/core.py +2 -2
- xinference/model/llm/{pytorch → transformers}/chatglm.py +6 -13
- xinference/model/llm/{pytorch → transformers}/cogvlm2.py +4 -45
- xinference/model/llm/transformers/cogvlm2_video.py +524 -0
- xinference/model/llm/{pytorch → transformers}/core.py +3 -10
- xinference/model/llm/{pytorch → transformers}/glm4v.py +2 -23
- xinference/model/llm/transformers/intern_vl.py +540 -0
- xinference/model/llm/{pytorch → transformers}/internlm2.py +4 -8
- xinference/model/llm/{pytorch → transformers}/minicpmv25.py +2 -23
- xinference/model/llm/{pytorch → transformers}/minicpmv26.py +66 -41
- xinference/model/llm/{pytorch → transformers}/utils.py +1 -2
- xinference/model/llm/{pytorch → transformers}/yi_vl.py +2 -24
- xinference/model/llm/utils.py +85 -70
- xinference/model/llm/vllm/core.py +110 -11
- xinference/model/utils.py +1 -95
- xinference/thirdparty/fish_speech/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/callbacks/__init__.py +3 -0
- xinference/thirdparty/fish_speech/fish_speech/callbacks/grad_norm.py +113 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/lora/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/conversation.py +2 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/concat_repeat.py +53 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/protos/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text_data_pb2.py +33 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text_data_stream.py +36 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/semantic.py +496 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/vqgan.py +147 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/__init__.py +3 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/core.py +40 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +122 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +122 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +123 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/pt_BR.json +133 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +122 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/scan.py +122 -0
- xinference/thirdparty/fish_speech/fish_speech/models/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/lit_module.py +202 -0
- xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +779 -0
- xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/lora.py +92 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/__init__.py +3 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/lit_module.py +442 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/discriminator.py +44 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +625 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +139 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/reference.py +115 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/wavenet.py +225 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/utils.py +94 -0
- xinference/thirdparty/fish_speech/fish_speech/scheduler.py +40 -0
- xinference/thirdparty/fish_speech/fish_speech/text/__init__.py +4 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_class.py +172 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_constant.py +30 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_util.py +342 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/cardinal.py +32 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/date.py +75 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/digit.py +32 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/fraction.py +35 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/money.py +43 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/percentage.py +33 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/telephone.py +51 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/text.py +177 -0
- xinference/thirdparty/fish_speech/fish_speech/text/clean.py +69 -0
- xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +130 -0
- xinference/thirdparty/fish_speech/fish_speech/train.py +139 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/__init__.py +23 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/braceexpand.py +217 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/context.py +13 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/file.py +16 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/instantiators.py +50 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/logger.py +55 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/logging_utils.py +48 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/rich_utils.py +100 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/spectrogram.py +122 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/utils.py +114 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/launch_utils.py +120 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +1237 -0
- xinference/thirdparty/fish_speech/tools/__init__.py +0 -0
- xinference/thirdparty/fish_speech/tools/api.py +495 -0
- xinference/thirdparty/fish_speech/tools/auto_rerank.py +159 -0
- xinference/thirdparty/fish_speech/tools/download_models.py +55 -0
- xinference/thirdparty/fish_speech/tools/extract_model.py +21 -0
- xinference/thirdparty/fish_speech/tools/file.py +108 -0
- xinference/thirdparty/fish_speech/tools/gen_ref.py +36 -0
- xinference/thirdparty/fish_speech/tools/llama/__init__.py +0 -0
- xinference/thirdparty/fish_speech/tools/llama/build_dataset.py +169 -0
- xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +171 -0
- xinference/thirdparty/fish_speech/tools/llama/generate.py +698 -0
- xinference/thirdparty/fish_speech/tools/llama/merge_lora.py +95 -0
- xinference/thirdparty/fish_speech/tools/llama/quantize.py +497 -0
- xinference/thirdparty/fish_speech/tools/llama/rebuild_tokenizer.py +57 -0
- xinference/thirdparty/fish_speech/tools/merge_asr_files.py +55 -0
- xinference/thirdparty/fish_speech/tools/post_api.py +164 -0
- xinference/thirdparty/fish_speech/tools/sensevoice/__init__.py +0 -0
- xinference/thirdparty/fish_speech/tools/sensevoice/auto_model.py +573 -0
- xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py +332 -0
- xinference/thirdparty/fish_speech/tools/sensevoice/vad_utils.py +61 -0
- xinference/thirdparty/fish_speech/tools/smart_pad.py +47 -0
- xinference/thirdparty/fish_speech/tools/vqgan/__init__.py +0 -0
- xinference/thirdparty/fish_speech/tools/vqgan/create_train_split.py +83 -0
- xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +227 -0
- xinference/thirdparty/fish_speech/tools/vqgan/inference.py +120 -0
- xinference/thirdparty/fish_speech/tools/webui.py +619 -0
- xinference/thirdparty/fish_speech/tools/whisper_asr.py +176 -0
- xinference/thirdparty/internvl/__init__.py +0 -0
- xinference/thirdparty/internvl/conversation.py +393 -0
- xinference/thirdparty/omnilmm/model/utils.py +16 -1
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.661c7b0a.js +3 -0
- xinference/web/ui/build/static/js/{main.17ca0398.js.map → main.661c7b0a.js.map} +1 -1
- xinference/web/ui/node_modules/.cache/babel-loader/070d8c6b3b0f3485c6d3885f0b6bbfdf9643e088a468acbd5d596f2396071c16.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/213b5913e164773c2b0567455377765715f5f07225fbac77ad8e1e9dc9648a47.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5391543180fead1eeef5364300301498d58a7d91d62de3841a32768b67f4552f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5c26a23b5eacf5b752a08531577ae3840bb247745ef9a39583dc2d05ba93a82a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/714c37ce0ec5b5c591033f02be2f3f491fdd70da3ef568ee4a4f94689a3d5ca2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/978b57d1a04a701bc3fcfebc511f5f274eed6ed7eade67f6fb76c27d5fd9ecc8.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a797831de0dc74897f4b50b3426555d748f328b4c2cc391de709eadaf6a5f3e3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e91938976f229ce986b2907e51e1f00540b584ced0a315d498c172d13220739d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +1 -0
- {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/METADATA +22 -13
- {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/RECORD +170 -79
- xinference/locale/utils.py +0 -39
- xinference/locale/zh_CN.json +0 -26
- xinference/model/llm/ggml/tools/__init__.py +0 -15
- xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py +0 -498
- xinference/model/llm/ggml/tools/gguf.py +0 -884
- xinference/model/llm/pytorch/__init__.py +0 -13
- xinference/model/llm/pytorch/baichuan.py +0 -81
- xinference/model/llm/pytorch/falcon.py +0 -138
- xinference/model/llm/pytorch/intern_vl.py +0 -352
- xinference/model/llm/pytorch/vicuna.py +0 -69
- xinference/web/ui/build/static/js/main.17ca0398.js +0 -3
- xinference/web/ui/node_modules/.cache/babel-loader/1444c41a4d04494f1cbc2d8c1537df107b451cb569cb2c1fbf5159f3a4841a5f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/2f40209b32e7e46a2eab6b8c8a355eb42c3caa8bc3228dd929f32fd2b3940294.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/44774c783428f952d8e2e4ad0998a9c5bc16a57cd9c68b7c5ff18aaa5a41d65c.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5262556baf9207738bf6a8ba141ec6599d0a636345c245d61fdf88d3171998cb.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/6450605fac003812485f6251b9f0caafbf2e5bfc3bbe2f000050d9e2fdb8dcd3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/71684495d995c7e266eecc6a0ad8ea0284cc785f80abddf863789c57a6134969.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/80acd1edf31542ab1dcccfad02cb4b38f3325cff847a781fcce97500cfd6f878.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8a9742ddd8ba8546ef42dc14caca443f2b4524fabed7bf269e0eff3b7b64ee7d.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d06a96a3c9c32e42689094aa3aaad41c8125894e956b8f84a70fadce6e3f65b3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d93730e2b5d7e8c957b4d0965d2ed1dac9045a649adbd47c220d11f255d4b1e0.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e656dc00b4d8b387f0a81ba8fc558767df1601c66369e2eb86a5ef27cf080572.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f28b83886159d83b84f099b05d607a822dca4dd7f2d8aa6d56fe08bab0b5b086.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f3e02274cb1964e99b1fe69cbb6db233d3d8d7dd05d50ebcdb8e66d50b224b7b.json +0 -1
- /xinference/{locale → model/llm/llama_cpp}/__init__.py +0 -0
- /xinference/model/llm/{ggml → transformers}/__init__.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/compression.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/deepseek_vl.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/llama_2.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/omnilmm.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/qwen_vl.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/tensorizer_utils.py +0 -0
- /xinference/web/ui/build/static/js/{main.17ca0398.js.LICENSE.txt → main.661c7b0a.js.LICENSE.txt} +0 -0
- {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/LICENSE +0 -0
- {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/WHEEL +0 -0
- {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/entry_points.txt +0 -0
- {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/top_level.txt +0 -0
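Note on the renames in the list above: the internal LLM backend packages move from `xinference/model/llm/pytorch/` to `xinference/model/llm/transformers/`, and the old `ggml/llamacpp.py` module becomes `llama_cpp/core.py`, so any downstream code that reaches into these private modules needs the new import paths. A minimal compatibility sketch; it assumes `PytorchChatModel` still lives in `core.py` after the rename, which the chatglm diff below implies but this page does not state outright:

# Hedged import shim for code that imports xinference's private transformer
# backend directly; adjust the class name to whatever you actually use.
try:
    # layout in the 0.14.3 wheel
    from xinference.model.llm.transformers.core import PytorchChatModel
except ImportError:
    # layout in the 0.14.1.post1 wheel
    from xinference.model.llm.pytorch.core import PytorchChatModel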
xinference/model/llm/lmdeploy/core.py
ADDED
@@ -0,0 +1,557 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+import time
+import uuid
+from typing import AsyncGenerator, Dict, Iterator, List, Optional, TypedDict, Union
+
+import torch
+
+from ....types import (
+    ChatCompletion,
+    ChatCompletionChunk,
+    ChatCompletionChunkChoice,
+    ChatCompletionMessage,
+    Completion,
+    CompletionChoice,
+    CompletionUsage,
+    LoRA,
+)
+from ..core import LLM
+from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..utils import ChatModelMixin
+
+logger = logging.getLogger(__name__)
+
+try:
+    import lmdeploy  # noqa: F401
+
+    LMDEPLOY_INSTALLED = True
+except ImportError:
+    LMDEPLOY_INSTALLED = False
+
+LMDEPLOY_SUPPORTED_CHAT_MODELS = ["internvl2"]
+LMDEPLOY_MODEL_CHAT_TEMPLATE_NAME = {
+    "internvl2": "internvl-internlm2",
+}
+
+
+class LMDeployModelConfig(TypedDict, total=False):
+    model_format: Optional[str]
+    tp: Optional[int]
+    session_len: Optional[int]
+    max_batch_size: Optional[int]
+    cache_max_entry_count: Optional[float]
+    cache_block_seq_len: Optional[int]
+    enable_prefix_caching: Optional[bool]
+    quant_policy: Optional[int]
+    rope_scaling_factor: Optional[float]
+    use_logn_attn: Optional[bool]
+    download_dir: Optional[str]
+    revision: Optional[str]
+    max_prefill_token_num: Optional[int]
+    num_tokens_per_iter: Optional[int]
+    max_prefill_iters: Optional[int]
+
+
+class LMDeployGenerateConfig(TypedDict, total=False):
+    n: Optional[int]
+    max_new_tokens: Optional[int]
+    top_p: Optional[float]
+    top_k: Optional[int]
+    temperature: Optional[float]
+    repetition_penalty: Optional[float]
+    ignore_eos: Optional[bool]
+    random_seed: Optional[int]
+    stop_words: Optional[List[str]]
+    bad_words: Optional[List[str]]
+    min_new_tokens: Optional[int]
+    skip_special_tokens: Optional[bool]
+    logprobs: Optional[int]
+
+
+class LMDeployModel(LLM):
+    def __init__(
+        self,
+        model_uid: str,
+        model_family: "LLMFamilyV1",
+        model_spec: "LLMSpecV1",
+        quantization: str,
+        model_path: str,
+        model_config: Optional[LMDeployModelConfig] = None,
+        peft_model: Optional[List[LoRA]] = None,
+    ):
+        super().__init__(model_uid, model_family, model_spec, quantization, model_path)
+        self._model_config: LMDeployModelConfig = self._sanitize_model_config(
+            model_config
+        )
+        if peft_model is not None:
+            raise ValueError("LMDEPLOY engine has not supported lora yet.")
+
+    def _sanitize_model_config(
+        self, model_config: Optional[LMDeployModelConfig]
+    ) -> LMDeployModelConfig:
+        if model_config is None:
+            model_config = LMDeployModelConfig()
+        model_config.setdefault("session_len", 8192)
+        if self.model_spec.model_format == "awq":
+            model_config.setdefault("model_format", "awq")
+        return model_config
+
+    def load(self):
+        try:
+            import lmdeploy  # noqa: F401, F811
+        except ImportError:
+            error_message = "Failed to import module 'lmdeploy'"
+            installation_guide = [
+                "Please make sure 'lmdeploy' is installed. ",
+                "You can install it by `pip install lmdeploy`\n",
+            ]
+
+            raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+        raise ValueError("LMDEPLOY engine has not supported generate yet.")
+
+    @classmethod
+    def match(
+        cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
+    ) -> bool:
+        return False
+
+    def generate(
+        self,
+        prompt: str,
+        generate_config: Optional[Dict] = None,
+    ) -> Union[Completion, Iterator[ChatCompletionChunk]]:
+        raise NotImplementedError("LMDeploy generate ablility does not support now.")
+
+
+class LMDeployChatModel(LMDeployModel, ChatModelMixin):
+    def load(self):
+        try:
+            from lmdeploy import (
+                ChatTemplateConfig,
+                TurbomindEngineConfig,
+                VisionConfig,
+                pipeline,
+            )
+        except ImportError:
+            error_message = "Failed to import module 'lmdeploy'"
+            installation_guide = [
+                "Please make sure 'lmdeploy' is installed. ",
+                "You can install it by `pip install lmdeploy`\n",
+            ]
+
+            raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+        chat_temp_name = ""
+        family = self.model_family.model_family or self.model_family.model_name
+        for key in LMDEPLOY_MODEL_CHAT_TEMPLATE_NAME.keys():
+            if family in key:
+                chat_temp_name = LMDEPLOY_MODEL_CHAT_TEMPLATE_NAME[key]
+                break
+        if chat_temp_name == "":
+            raise ValueError(f"Can not find correct chat template.")
+
+        chat_template_config = ChatTemplateConfig(chat_temp_name)
+        chat_template_config.meta_instruction = (
+            self.model_family.prompt_style.system_prompt
+        )
+        count = torch.cuda.device_count()
+        if count > 1:
+            self._model_config.setdefault("tp", torch.cuda.device_count())
+
+        self._model = pipeline(
+            self.model_path,
+            chat_template_config=chat_template_config,
+            backend_config=TurbomindEngineConfig(**self._model_config),
+            vision_config=VisionConfig(thread_safe=True),
+        )
+
+    @classmethod
+    def match(
+        cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
+    ) -> bool:
+        if llm_spec.model_format == "awq":
+            # Currently, only 4-bit weight quantization is supported for AWQ, but got 8 bits.
+            if "4" not in quantization:
+                return False
+        if llm_family.model_name not in LMDEPLOY_SUPPORTED_CHAT_MODELS:
+            return False
+        return LMDEPLOY_INSTALLED
+
+    async def async_chat(
+        self,
+        prompt: Union[str, List[Dict]],
+        system_prompt: Optional[str] = None,
+        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        generate_config: Optional[Dict] = None,
+    ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]:
+        stream = (
+            generate_config.get("stream", False)
+            if isinstance(generate_config, dict)
+            else False
+        )
+        stream_options = (
+            generate_config.get("stream_options", None)
+            if isinstance(generate_config, dict)
+            else False
+        )
+        include_usage = (
+            stream_options["include_usage"]
+            if isinstance(stream_options, dict)
+            else False
+        )
+
+        chat_history = chat_history or []
+
+        if stream:
+            chunk = self._chat_stream(prompt, chat_history, include_usage)
+            return self._async_to_chat_completion_chunks(chunk)
+        else:
+            chunk = await self._chat(prompt, chat_history)
+            return self._to_chat_completion(chunk)
+
+    async def _chat_stream(self, prompt, chat_history, include_usage):
+        from lmdeploy.messages import Response
+
+        prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
+        completion_id = str(uuid.uuid1())
+        async for output in self._generate(
+            prompt,
+            chat_history,
+            session_id=-1,
+            stream_response=True,
+        ):
+            new_text = output.text if isinstance(output, Response) else output.response
+
+            completion_choice = ChatCompletionChunkChoice(
+                text=new_text,
+                index=0,
+                logprobs=None,
+                finish_reason=output.finish_reason,
+            )
+            chunk = ChatCompletionChunk(
+                id=completion_id,
+                object="chat.completion",
+                created=int(time.time()),
+                model=self.model_uid,
+                choices=[completion_choice],
+            )
+            prompt_tokens = output.input_token_len
+            completion_tokens = output.generate_token_len
+            total_tokens = prompt_tokens + completion_tokens
+            completion_usage = CompletionUsage(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                total_tokens=total_tokens,
+            )
+            chunk["usage"] = completion_usage
+            print(chunk)
+            yield chunk
+        if include_usage:
+            chunk = ChatCompletionChunk(
+                id=completion_id,
+                object="chat.completion",
+                created=int(time.time()),
+                model=self.model_uid,
+                choices=[],
+            )
+            chunk["usage"] = CompletionUsage(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                total_tokens=total_tokens,
+            )
+            yield chunk
+
+    async def _chat(self, prompt, chat_history):
+        from lmdeploy.messages import Response
+
+        response, finish_reason = "", ""
+        prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
+        async for output in self._generate(
+            prompt,
+            chat_history,
+            session_id=-1,
+            stream_response=False,
+        ):
+            response += output.text if isinstance(output, Response) else output.response
+            prompt_tokens = output.input_token_len
+            completion_tokens = output.generate_token_len
+            total_tokens = output.input_token_len + output.generate_token_len
+            finish_reason = output.finish_reason
+
+        chunk = ChatCompletion(
+            id=str(uuid.uuid1()),
+            object="chat.completion",
+            created=int(time.time()),
+            model=self.model_uid,
+            choices=[
+                CompletionChoice(
+                    index=0, text=response, finish_reason=finish_reason, logprobs=None
+                )
+            ],
+            usage=CompletionUsage(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                total_tokens=total_tokens,
+            ),
+        )
+        return chunk
+
+    # copy from lmdeploy
+    # Reference: lmdeploy.serve.async_engine.py
+    async def _generate(
+        self,
+        prompt,
+        chat_history,
+        session_id: int,
+        generate_config: Optional[Dict] = None,
+        tools: Optional[List[object]] = None,
+        stream_response: bool = True,
+        sequence_start: bool = True,
+        sequence_end: bool = True,  # no interactive mode by default
+        step: int = 0,
+        do_preprocess: bool = False,
+        adapter_name: Optional[str] = None,
+        **kwargs,
+    ):
+        import random
+
+        from lmdeploy.messages import EngineGenerationConfig, GenerationConfig
+        from lmdeploy.serve.async_engine import GenOut
+        from lmdeploy.tokenizer import DetokenizeState
+
+        session_id = -1
+
+        if str(session_id) not in self._model.id2step:
+            self._model.id2step[str(session_id)] = 0
+        if generate_config is None:
+            generate_config = GenerationConfig()
+        if type(generate_config) is GenerationConfig:
+            generate_config = EngineGenerationConfig.From(
+                generate_config, self._model.tokenizer
+            )
+        if generate_config.stop_words is None:  # type: ignore
+            generate_config.stop_words = self._model.stop_words  # type: ignore
+        if generate_config.random_seed is None and sequence_start:  # type: ignore
+            generate_config.random_seed = random.getrandbits(64)  # type: ignore
+        if generate_config.n > 1:  # type: ignore
+            logger.warning(
+                f"n({generate_config.n}) > 1 hasn't been supported yet. "  # type: ignore
+                f"Fallback to 1"
+            )
+            generate_config.n = 1  # type: ignore
+
+        prompt_input = await self._get_prompt_input(prompt, chat_history)
+        prompt = prompt_input["prompt"]
+        input_ids = prompt_input["input_ids"]
+        finish_reason = None
+        logger.info(
+            f"prompt={prompt!r}, "
+            f"gen_config={generate_config}, "
+            f"prompt_token_id={input_ids}, "
+            f"adapter_name={adapter_name}."
+        )
+        logger.info(
+            f"session_id={session_id}, "  # type: ignore
+            f"history_tokens={self._model.id2step[str(session_id)]}, "
+            f"input_tokens={len(input_ids)}, "
+            f"max_new_tokens={generate_config.max_new_tokens}, "
+            f"seq_start={sequence_start}, seq_end={sequence_end}, "
+            f"step={step}, prep={do_preprocess}"
+        )
+
+        if generate_config.max_new_tokens is None:  # type: ignore
+            # for interactive endpoint, will try maximum possible token num
+            generate_config.max_new_tokens = max(  # type: ignore
+                128,
+                self._model.session_len
+                - self._model.id2step[str(session_id)]
+                - len(input_ids),
+            )
+        elif (
+            self._model.id2step[str(session_id)]
+            + len(input_ids)
+            + generate_config.max_new_tokens  # type: ignore
+            > self._model.session_len
+        ):
+            generate_config.max_new_tokens = max(  # type: ignore
+                self._model.session_len
+                - self._model.id2step[str(session_id)]
+                - len(input_ids),
+                128,
+            )
+            logger.error(f"Truncate max_new_tokens to {generate_config.max_new_tokens}")  # type: ignore
+
+        if (
+            self._model.id2step[str(session_id)]
+            + len(input_ids)
+            + generate_config.max_new_tokens  # type: ignore
+            > self._model.session_len
+        ):
+            logger.error(f"run out of tokens. session_id={session_id}.")
+            yield GenOut(
+                "", self._model.id2step[str(session_id)], len(input_ids), 0, "length"
+            )
+            if sequence_end is True and sequence_start is False:
+                await self._model.end_session(session_id)
+        else:
+            generator = await self._model.get_generator(False, session_id)
+            async with self._model.safe_run(session_id):
+                state = DetokenizeState(len(input_ids))
+                start_ids_offset = state.ids_offset
+                response = ""
+                async for outputs in generator.async_stream_infer(
+                    session_id=session_id,
+                    **prompt_input,
+                    gen_config=generate_config,
+                    adapter_name=adapter_name,
+                    stream_output=stream_response,
+                    sequence_start=sequence_start,
+                    sequence_end=sequence_end,
+                    step=self._model.id2step[str(session_id)],
+                ):
+                    # decode res
+                    res, tokens = (
+                        input_ids + outputs.token_ids,
+                        outputs.num_token,
+                    )  # noqa
+                    if len(res) <= state.ids_offset:
+                        continue
+
+                    ids_offset = state.ids_offset
+                    response, state = self._model.tokenizer.detokenize_incrementally(
+                        res,
+                        state,
+                        skip_special_tokens=generate_config.skip_special_tokens,  # type: ignore
+                    )
+
+                    res = res[ids_offset:]
+                    logprobs = None
+                    if outputs.logprobs:
+                        log_offset = ids_offset - start_ids_offset
+                        logprobs = outputs.logprobs[log_offset:]
+
+                    # response, history token len,
+                    # input token len, gen token len
+                    yield GenOut(
+                        response,
+                        self._model.id2step[str(session_id)],
+                        len(input_ids),
+                        tokens,
+                        finish_reason,
+                        res,
+                        logprobs,
+                    )
+
+                finish_reason = (
+                    "length" if tokens >= generate_config.max_new_tokens else "stop"  # type: ignore
+                )
+                # utf-8 char at the end means it's a potential unfinished
+                # byte sequence
+                if not response.endswith("�"):
+                    response = ""  # avaid returning the last response twice
+                yield GenOut(
+                    response,
+                    self._model.id2step[str(session_id)],
+                    len(input_ids),
+                    tokens,
+                    finish_reason,
+                )
+            # update step
+            self._model.id2step[str(session_id)] += len(input_ids) + tokens
+            if sequence_end:
+                self._model.id2step[str(session_id)] = 0
+            # manually end pytorch session
+            # TODO modify pytorch or turbomind api
+            if self._model.backend == "pytorch" and sequence_end:
+                await self._model.end_session(session_id)
+
+    # copy from lmdeploy
+    # Reference: lmdeploy.serve.vl_async_engine.py
+    async def _get_prompt_input(
+        self,
+        prompt: Union[str, List[Dict]],
+        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        sequence_start: bool = True,
+        tools: Optional[List[object]] = None,
+        **kwargs,
+    ):
+        """get input_ids, embeddings and offsets."""
+        IMAGE_TOKEN = "<IMAGE_TOKEN>"
+        IMAGE_DUMMY_TOKEN_INDEX = 0
+        import numpy as np
+
+        assert self.model_family.prompt_style is not None
+        prompt_style = self.model_family.prompt_style.copy()
+        chat_history = chat_history or []
+
+        decorated, _ = self.get_prompt(prompt, chat_history, prompt_style)  # type: ignore
+        chat_history.append(ChatCompletionMessage(role="user", content=prompt))  # type: ignore
+        prompt = chat_history  # type: ignore
+
+        decorated = decorated.replace("<image>", "<img><IMAGE_TOKEN></img>")
+
+        segs = decorated.split(IMAGE_TOKEN)
+
+        results = {}
+        input_ids = []  # type: ignore
+        if len(segs) > 1:
+            images = await self._model.vl_prompt_template.async_collect_pil_images(
+                prompt
+            )
+
+            features = await self._model.vl_encoder.async_infer(images)
+
+            from lmdeploy.vl.templates import MiniCPMVTempateWrapper
+
+            if isinstance(self._model.vl_prompt_template, MiniCPMVTempateWrapper):
+                (
+                    decorated,
+                    features,
+                ) = self._model.vl_prompt_template.update_image_token(  # noqa: E501
+                    decorated, features
+                )
+                segs = decorated.split(IMAGE_TOKEN)
+
+            features = [x.cpu().numpy() for x in features]
+            input_ids = []
+            begins = []
+            ends = []
+            if len(segs) != len(features) + 1:
+                logger.error(
+                    f"the number of {IMAGE_TOKEN} is not equal "
+                    f"to input images, {len(segs) - 1} vs {len(features)}"
+                )
+                features = features[: len(segs) - 1]
+            for i, seg in enumerate(segs):
+                if i > 0 and i <= len(features):
+                    image_dim = features[i - 1].shape[0]
+                    begins.append(len(input_ids))
+                    ends.append(begins[-1] + image_dim)
+                    input_ids.extend([IMAGE_DUMMY_TOKEN_INDEX] * image_dim)
+                seg_ids = self._model.tokenizer.encode(
+                    seg, add_bos=((i == 0) and sequence_start)
+                )
+                input_ids.extend(seg_ids)
+            ranges = np.stack([begins, ends], axis=1).tolist()
+            results["input_embeddings"] = features
+            results["input_embedding_ranges"] = ranges
+        else:
+            input_ids = self._model.tokenizer.encode(decorated, add_bos=sequence_start)
+
+        results["input_ids"] = input_ids
+        results["prompt"] = decorated
+
+        return results
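For readers skimming the new lmdeploy backend above: `LMDeployModelConfig` is a plain `TypedDict`, and `_sanitize_model_config` fills only two defaults (`session_len=8192`, plus `model_format="awq"` for AWQ checkpoints) before the whole dict is splatted into `TurbomindEngineConfig`. A standalone sketch of that behaviour, with the TypedDict redeclared locally for illustration rather than imported from xinference:

from typing import Optional, TypedDict


class LMDeployModelConfig(TypedDict, total=False):
    # subset of the fields declared in the new core.py above
    model_format: Optional[str]
    tp: Optional[int]
    session_len: Optional[int]
    cache_max_entry_count: Optional[float]


def sanitize_model_config(
    config: Optional[LMDeployModelConfig], checkpoint_format: str
) -> LMDeployModelConfig:
    # mirrors _sanitize_model_config: default the session length, tag AWQ checkpoints
    config = config if config is not None else LMDeployModelConfig()
    config.setdefault("session_len", 8192)
    if checkpoint_format == "awq":
        config.setdefault("model_format", "awq")
    return config


print(sanitize_model_config(None, "awq"))
# {'session_len': 8192, 'model_format': 'awq'}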
xinference/model/llm/memory.py
CHANGED
@@ -61,7 +61,7 @@ class ModelMemInfo:
 
 QUANT_NORMALIZE = {"int4": "4-bit", "int8": "8-bit", "4-bit": "4-bit", "8-bit": "8-bit"}
 
-
+GGUF_MULTI_FACTOR_DICT = {
     "q4_0": 18,
     "q4_1": 20,
     "q5_0": 22,
@@ -70,14 +70,14 @@ GGML_MULTI_FACTOR_DICT {
     "q8_1": 40,
 }
 
-
+GGUF_MULTI_FACTOR_DICT_64 = {
     "q6_K": 54.0,
     "q3": 26.0,
     "q4": 38.0,
     "q5": 46.0,
 }
 
-
+GGUF_MULTI_FACTOR_DICT_COMBINE = {
     "q3_K_L": [38.0, 26.0],
     "q3_K_M": [46.0, 26.0],
     "q4_K_S": [46.0, 38.0],
@@ -136,9 +136,9 @@ def estimate_llm_gpu_memory_details(
     else:
         kv_dtype_size = 4
     overhead = 650.0
-    if model_format == "
+    if model_format == "ggufv2":
         assert quantization is not None and quantization != "none"
-        model_size_in_mb =
+        model_size_in_mb = _compute_model_size_gguf(info, quantization)
         inference_mem = float(
             context_length * kv_dtype_size * info.hidden_dim * info.num_layers
         )
@@ -291,7 +291,7 @@ def _compute_inference_only_activation_memory(
     return ret
 
 
-def
+def _compute_model_size_gguf(info: ModelLayersInfo, quantization: str) -> float:
     assert quantization is not None
     vocab_size = info.vocab_size
     num_layers = info.num_layers
@@ -310,13 +310,13 @@ def _compute_model_size_ggml(info: ModelLayersInfo, quantization: str) -> float:
     )
 
     total = 0.0
-    v1 =
+    v1 = GGUF_MULTI_FACTOR_DICT.get(quantization)
     if v1 is not None:
         total = (v1 * total_params) / (32 * 1024 * 1024)
-    v2 =
+    v2 = GGUF_MULTI_FACTOR_DICT_64.get(quantization)
     if v2 is not None:
         total = (v2 * total_params) / (64 * 1024 * 1024)
-    v3 =
+    v3 = GGUF_MULTI_FACTOR_DICT_COMBINE.get(quantization)
     if v3 is not None:
         factors = v3
         if quantization == "q2_K":
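The renamed GGUF factor tables above feed `_compute_model_size_gguf`: for a single-factor quantization such as `q4_0`, the estimated weight size is `factor * total_params / (32 * 1024 * 1024)` MiB. A quick sanity check with a hypothetical 7B-parameter model (the parameter count is an illustrative assumption, not something taken from this diff):

# Rough GGUF weight-size estimate using the q4_0 factor (18) from the table above.
GGUF_MULTI_FACTOR_DICT = {"q4_0": 18, "q4_1": 20, "q5_0": 22}

total_params = 7_000_000_000  # hypothetical 7B model
size_in_mb = GGUF_MULTI_FACTOR_DICT["q4_0"] * total_params / (32 * 1024 * 1024)
print(f"{size_in_mb:.0f} MiB (~{size_in_mb / 1024:.1f} GiB)")  # ~3755 MiB, ~3.7 GiB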
xinference/model/llm/sglang/core.py
CHANGED
@@ -189,7 +189,7 @@ class SGLANGModel(LLM):
             return False
         if not cls._is_linux():
             return False
-        if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8"]:
             return False
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):
@@ -378,7 +378,7 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin):
     def match(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8"]:
             return False
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):
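The two sglang hunks above do one thing: add "fp8" to the model formats the SGLang engine will accept, so FP8 checkpoints can be matched and served through it. A hedged launch sketch via the RESTful client; the endpoint, model name, and size are placeholders, and the keyword arguments follow the public client API as documented upstream, so verify them against your installed version:

from xinference.client import Client

client = Client("http://127.0.0.1:9997")  # assumes a running xinference supervisor
model_uid = client.launch_model(
    model_name="qwen2-instruct",  # placeholder; any family that ships fp8 weights
    model_engine="sglang",
    model_format="fp8",
    model_size_in_billions=7,
)
print(model_uid)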
xinference/model/llm/{pytorch → transformers}/chatglm.py
CHANGED
@@ -344,7 +344,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
         return kwargs, tools
 
     @torch.inference_mode()
-    def
+    def _stream_chat(
         self,
         tokenizer,
         query: str,
@@ -399,7 +399,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
                 yield new_response, new_history
 
     @torch.inference_mode()
-    def
+    def _non_stream_chat(
         self,
         tokenizer,
         query: str,
@@ -475,10 +475,6 @@ class ChatglmPytorchChatModel(PytorchChatModel):
         if stream and (
             not tools or self.model_family.model_name in GLM4_TOOL_CALL_FAMILY
        ):
-            if self.model_family.model_name in GLM4_TOOL_CALL_FAMILY:
-                stream_chat = self.stream_chat
-            else:
-                stream_chat = self._model.stream_chat
 
             def _stream_generator():
                 last_chunk_text_length = 0
@@ -487,7 +483,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
                 inputs = self._tokenizer([prompt], return_tensors="pt")
                 inputs = inputs.to(self._model.device)
                 prompt_tokens = len(inputs["input_ids"][0])
-                for chunk_text, _ in
+                for chunk_text, _ in self._stream_chat(
                     self._tokenizer, prompt, chat_history, **kwargs
                 ):
                     if tools and isinstance(chunk_text, dict):
@@ -548,12 +544,9 @@ class ChatglmPytorchChatModel(PytorchChatModel):
 
             return self._to_chat_completion_chunks(_stream_generator())
         else:
-
-
-
-            chat = self._model.chat
-
-            response = chat(self._tokenizer, prompt, chat_history, **kwargs)
+            response = self._non_stream_chat(
+                self._tokenizer, prompt, chat_history, **kwargs
+            )
             if tools:
                 return self._tool_calls_completion(
                     self.model_family, self.model_uid, response, tools