xinference 1.5.0.post2__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (137)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +107 -11
  3. xinference/client/restful/restful_client.py +51 -11
  4. xinference/constants.py +5 -1
  5. xinference/core/media_interface.py +758 -0
  6. xinference/core/model.py +49 -9
  7. xinference/core/supervisor.py +1 -1
  8. xinference/core/utils.py +1 -1
  9. xinference/core/worker.py +33 -39
  10. xinference/deploy/cmdline.py +17 -0
  11. xinference/deploy/utils.py +0 -3
  12. xinference/model/audio/__init__.py +16 -27
  13. xinference/model/audio/core.py +2 -1
  14. xinference/model/audio/cosyvoice.py +4 -2
  15. xinference/model/audio/model_spec.json +63 -46
  16. xinference/model/audio/model_spec_modelscope.json +31 -14
  17. xinference/model/embedding/__init__.py +16 -24
  18. xinference/model/image/__init__.py +15 -25
  19. xinference/model/llm/__init__.py +40 -115
  20. xinference/model/llm/core.py +29 -6
  21. xinference/model/llm/llama_cpp/core.py +30 -347
  22. xinference/model/llm/llm_family.json +1674 -2203
  23. xinference/model/llm/llm_family.py +71 -7
  24. xinference/model/llm/llm_family_csghub.json +0 -32
  25. xinference/model/llm/llm_family_modelscope.json +1838 -2016
  26. xinference/model/llm/llm_family_openmind_hub.json +19 -325
  27. xinference/model/llm/lmdeploy/core.py +7 -2
  28. xinference/model/llm/mlx/core.py +23 -7
  29. xinference/model/llm/reasoning_parser.py +281 -5
  30. xinference/model/llm/sglang/core.py +39 -11
  31. xinference/model/llm/transformers/chatglm.py +9 -2
  32. xinference/model/llm/transformers/cogagent.py +10 -12
  33. xinference/model/llm/transformers/cogvlm2.py +6 -3
  34. xinference/model/llm/transformers/cogvlm2_video.py +3 -6
  35. xinference/model/llm/transformers/core.py +58 -60
  36. xinference/model/llm/transformers/deepseek_v2.py +4 -2
  37. xinference/model/llm/transformers/deepseek_vl.py +10 -4
  38. xinference/model/llm/transformers/deepseek_vl2.py +9 -4
  39. xinference/model/llm/transformers/gemma3.py +4 -5
  40. xinference/model/llm/transformers/glm4v.py +3 -21
  41. xinference/model/llm/transformers/glm_edge_v.py +3 -20
  42. xinference/model/llm/transformers/intern_vl.py +3 -6
  43. xinference/model/llm/transformers/internlm2.py +1 -1
  44. xinference/model/llm/transformers/minicpmv25.py +4 -2
  45. xinference/model/llm/transformers/minicpmv26.py +5 -3
  46. xinference/model/llm/transformers/omnilmm.py +1 -1
  47. xinference/model/llm/transformers/opt.py +1 -1
  48. xinference/model/llm/transformers/ovis2.py +302 -0
  49. xinference/model/llm/transformers/qwen-omni.py +8 -1
  50. xinference/model/llm/transformers/qwen2_audio.py +3 -1
  51. xinference/model/llm/transformers/qwen2_vl.py +5 -1
  52. xinference/model/llm/transformers/qwen_vl.py +5 -2
  53. xinference/model/llm/utils.py +96 -45
  54. xinference/model/llm/vllm/core.py +108 -24
  55. xinference/model/llm/vllm/distributed_executor.py +8 -7
  56. xinference/model/llm/vllm/xavier/allocator.py +1 -1
  57. xinference/model/llm/vllm/xavier/block_manager.py +1 -1
  58. xinference/model/llm/vllm/xavier/block_tracker.py +3 -3
  59. xinference/model/llm/vllm/xavier/executor.py +1 -1
  60. xinference/model/llm/vllm/xavier/test/test_xavier.py +2 -11
  61. xinference/model/rerank/__init__.py +13 -24
  62. xinference/model/video/__init__.py +15 -25
  63. xinference/model/video/core.py +3 -3
  64. xinference/model/video/diffusers.py +157 -13
  65. xinference/model/video/model_spec.json +100 -0
  66. xinference/model/video/model_spec_modelscope.json +104 -0
  67. xinference/thirdparty/cosyvoice/bin/average_model.py +5 -4
  68. xinference/thirdparty/cosyvoice/bin/export_jit.py +50 -20
  69. xinference/thirdparty/cosyvoice/bin/export_onnx.py +136 -51
  70. xinference/thirdparty/cosyvoice/bin/inference.py +15 -5
  71. xinference/thirdparty/cosyvoice/bin/train.py +7 -2
  72. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +72 -52
  73. xinference/thirdparty/cosyvoice/cli/frontend.py +58 -58
  74. xinference/thirdparty/cosyvoice/cli/model.py +140 -155
  75. xinference/thirdparty/cosyvoice/dataset/processor.py +9 -5
  76. xinference/thirdparty/cosyvoice/flow/decoder.py +656 -54
  77. xinference/thirdparty/cosyvoice/flow/flow.py +69 -11
  78. xinference/thirdparty/cosyvoice/flow/flow_matching.py +167 -63
  79. xinference/thirdparty/cosyvoice/flow/length_regulator.py +1 -0
  80. xinference/thirdparty/cosyvoice/hifigan/discriminator.py +91 -1
  81. xinference/thirdparty/cosyvoice/hifigan/f0_predictor.py +4 -1
  82. xinference/thirdparty/cosyvoice/hifigan/generator.py +4 -1
  83. xinference/thirdparty/cosyvoice/hifigan/hifigan.py +2 -2
  84. xinference/thirdparty/cosyvoice/llm/llm.py +198 -18
  85. xinference/thirdparty/cosyvoice/transformer/embedding.py +12 -4
  86. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +124 -21
  87. xinference/thirdparty/cosyvoice/utils/class_utils.py +13 -0
  88. xinference/thirdparty/cosyvoice/utils/common.py +1 -1
  89. xinference/thirdparty/cosyvoice/utils/file_utils.py +40 -2
  90. xinference/thirdparty/cosyvoice/utils/frontend_utils.py +7 -0
  91. xinference/thirdparty/cosyvoice/utils/mask.py +4 -0
  92. xinference/thirdparty/cosyvoice/utils/train_utils.py +5 -1
  93. xinference/thirdparty/matcha/hifigan/xutils.py +3 -3
  94. xinference/types.py +2 -71
  95. xinference/web/ui/build/asset-manifest.json +6 -6
  96. xinference/web/ui/build/index.html +1 -1
  97. xinference/web/ui/build/static/css/{main.0f6523be.css → main.337afe76.css} +2 -2
  98. xinference/web/ui/build/static/css/main.337afe76.css.map +1 -0
  99. xinference/web/ui/build/static/js/main.ae579a97.js +3 -0
  100. xinference/web/ui/build/static/js/main.ae579a97.js.map +1 -0
  101. xinference/web/ui/node_modules/.cache/babel-loader/0196a4b09e3264614e54360d5f832c46b31d964ec58296765ebff191ace6adbf.json +1 -0
  102. xinference/web/ui/node_modules/.cache/babel-loader/12e02ee790dbf57ead09a241a93bb5f893393aa36628ca741d44390e836a103f.json +1 -0
  103. xinference/web/ui/node_modules/.cache/babel-loader/18fa271456b31cded36c05c4c71c6b2b1cf4e4128c1e32f0e45d8b9f21764397.json +1 -0
  104. xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +1 -0
  105. xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +1 -0
  106. xinference/web/ui/node_modules/.cache/babel-loader/5c08e2cd07809ed3e41486b16652253404cbb63a3ff8d0366ee50f57e2413cea.json +1 -0
  107. xinference/web/ui/node_modules/.cache/babel-loader/6798e126f3bc5f95a4c16a9c2ad52ffe77970c62406d83e20604dfda7ffd2247.json +1 -0
  108. xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +1 -0
  109. xinference/web/ui/node_modules/.cache/babel-loader/b617f7d21a95045fc57b26a9373551740f1978a826134cbf705c3a1bf8714a93.json +1 -0
  110. xinference/web/ui/node_modules/.cache/babel-loader/c1506cb142151366074975f30fa1ff9cd6e5e978b62a4b074dfc16fe08d70d75.json +1 -0
  111. xinference/web/ui/node_modules/.cache/babel-loader/c5c7c2cd1b863ce41adff2c4737bba06eef3a1acf28288cb83d992060f6b8923.json +1 -0
  112. xinference/web/ui/src/locales/en.json +7 -4
  113. xinference/web/ui/src/locales/zh.json +7 -4
  114. {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/METADATA +56 -36
  115. {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/RECORD +120 -121
  116. {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/WHEEL +1 -1
  117. xinference/core/image_interface.py +0 -377
  118. xinference/model/llm/transformers/compression.py +0 -258
  119. xinference/model/llm/transformers/yi_vl.py +0 -239
  120. xinference/thirdparty/cosyvoice/bin/export_trt.sh +0 -9
  121. xinference/web/ui/build/static/css/main.0f6523be.css.map +0 -1
  122. xinference/web/ui/build/static/js/main.4b67a723.js +0 -3
  123. xinference/web/ui/build/static/js/main.4b67a723.js.map +0 -1
  124. xinference/web/ui/node_modules/.cache/babel-loader/0f0adb2283a8f469d097a7a0ebb754624fa52414c83b83696c41f2e6a737ceda.json +0 -1
  125. xinference/web/ui/node_modules/.cache/babel-loader/51709f5d3e53bcf19e613662ef9b91fb9174942c5518987a248348dd4e1e0e02.json +0 -1
  126. xinference/web/ui/node_modules/.cache/babel-loader/8157db83995c671eb57abc316c337f867d1dc63fb83520bb4ff351fee57dcce2.json +0 -1
  127. xinference/web/ui/node_modules/.cache/babel-loader/8f9af2979e45d4648f0cfae108363e58ee421c29a9d4e7329b6f06d9adfd4133.json +0 -1
  128. xinference/web/ui/node_modules/.cache/babel-loader/9c8b1a86e7c65b2b2599a205e30920652d6c2105f926508ef5bcf29a3ef4ce76.json +0 -1
  129. xinference/web/ui/node_modules/.cache/babel-loader/b8551e9775a01b28ae674125c688febe763732ea969ae344512e64ea01bf632e.json +0 -1
  130. xinference/web/ui/node_modules/.cache/babel-loader/e4ba658c6b3b0490910acdae0c535a892257efb61539a24adf8038fc653bd22f.json +0 -1
  131. xinference/web/ui/node_modules/.cache/babel-loader/efe7cd132c27a8f9fd5352a394c491fd5fb0da0348cf9fcbd923164a32365eab.json +0 -1
  132. xinference/web/ui/node_modules/.cache/babel-loader/f04f666b77b44d7be3e16034d6b0074de2ba9c254f1fae15222b3148608fa8b3.json +0 -1
  133. xinference/web/ui/node_modules/.cache/babel-loader/f199e8173f6409a5802ed44acb95f218388131136504b2e9132129e150c92f9a.json +0 -1
  134. /xinference/web/ui/build/static/js/{main.4b67a723.js.LICENSE.txt → main.ae579a97.js.LICENSE.txt} +0 -0
  135. {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/entry_points.txt +0 -0
  136. {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/licenses/LICENSE +0 -0
  137. {xinference-1.5.0.post2.dist-info → xinference-1.6.0.dist-info}/top_level.txt +0 -0
@@ -11,7 +11,7 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
-
+ import importlib.util
  import logging
  import platform
  import sys
@@ -160,7 +160,10 @@ class MLXModel(LLM):

  def load(self):
  reasoning_content = self._model_config.pop("reasoning_content")
- self.prepare_parse_reasoning_content(reasoning_content)
+ enable_thinking = self._model_config.pop("enable_thinking", True)
+ self.prepare_parse_reasoning_content(
+ reasoning_content, enable_thinking=enable_thinking
+ )

  kwargs = {}
  kwargs["revision"] = self._model_config.get(
@@ -172,7 +175,11 @@ class MLXModel(LLM):
  self._model, self._tokenizer = self._load_model(**kwargs)

  @classmethod
- def match(
+ def check_lib(cls) -> bool:
+ return importlib.util.find_spec("mlx_lm") is not None
+
+ @classmethod
+ def match_json(
  cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
  ) -> bool:
  if llm_spec.model_format not in ["mlx"]:
@@ -423,7 +430,7 @@ class MLXChatModel(MLXModel, ChatModelMixin):
  return generate_config

  @classmethod
- def match(
+ def match_json(
  cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
  ) -> bool:
  if llm_spec.model_format not in ["mlx"]:
@@ -445,7 +452,9 @@ class MLXChatModel(MLXModel, ChatModelMixin):
  ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
  model_family = self.model_family.model_family or self.model_family.model_name
  tools = generate_config.pop("tools", []) if generate_config else None
- full_context_kwargs = {}
+ full_context_kwargs = (
+ self._get_chat_template_kwargs_from_generate_config(generate_config, self.reasoning_parser) or {}  # type: ignore
+ )
  if tools:
  if (
  model_family in QWEN_TOOL_CALL_FAMILY
@@ -476,7 +485,11 @@ class MLXChatModel(MLXModel, ChatModelMixin):

  class MLXVisionModel(MLXModel, ChatModelMixin):
  @classmethod
- def match(
+ def check_lib(cls) -> bool:
+ return importlib.util.find_spec("mlx_vlm") is not None
+
+ @classmethod
+ def match_json(
  cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
  ) -> bool:
  if llm_spec.model_format not in ["mlx"]:
@@ -623,7 +636,10 @@ class MLXVisionModel(MLXModel, ChatModelMixin):
  if "internvl2" not in model_family.lower():
  from qwen_vl_utils import process_vision_info

- full_context_kwargs = {}
+ full_context_kwargs = (
+ self._get_chat_template_kwargs_from_generate_config(generate_config, self.reasoning_parser)  # type: ignore
+ or {}
+ )
  if tools and model_family in QWEN_TOOL_CALL_FAMILY:
  full_context_kwargs["tools"] = tools
  assert self.model_family.chat_template is not None
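These MLX backend hunks (the MLXModel/MLXChatModel/MLXVisionModel classes) wire a new chat_template_kwargs entry in generate_config through to the chat template instead of always passing an empty dict. A minimal sketch of what that might look like from the client side, assuming a hybrid qwen3-style model is already launched under the UID "qwen3" and that the chat_template_kwargs key is honored end to end as this diff suggests (the UID and prompt are illustrative only):

    from xinference.client import Client

    client = Client("http://127.0.0.1:9997")
    model = client.get_model("qwen3")  # UID of an already-launched hybrid "thinking" model

    # Ask the chat template to skip the <think> block for this request.
    completion = model.chat(
        messages=[{"role": "user", "content": "Summarize MLX in one sentence."}],
        generate_config={"chat_template_kwargs": {"enable_thinking": False}},
    )
    print(completion["choices"][0]["message"]["content"])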
@@ -1,20 +1,33 @@
  import re
- from typing import Optional, Tuple, Union
+ from typing import Any, AsyncGenerator, Dict, Iterator, List, Optional, Tuple, Union

- from ...types import ChatCompletionChunkDelta, CompletionChoice
+ from ...types import (
+ ChatCompletionChunk,
+ ChatCompletionChunkDelta,
+ CompletionChoice,
+ CompletionChunk,
+ )


  class ReasoningParser:
  """Reasoning parser for reasoning model."""

  def __init__(
- self, reasoning_start_tag: str = "<think>", reasoning_end_tag: str = "</think>"
+ self,
+ reasoning_content: bool = False,
+ reasoning_start_tag: str = "",
+ reasoning_end_tag: str = "",
+ enable_thinking: bool = True,
  ):
+ self.reasoning_content = reasoning_content
  self.reasoning_start_tag = reasoning_start_tag
  self.reasoning_end_tag = reasoning_end_tag
  self.reasoning_regex = re.compile(
  rf"{self.reasoning_start_tag}(.*?){self.reasoning_end_tag}", re.DOTALL
  )
+ # enable_thinking can be set to False only for hybrid model
+ # e.g. qwen3, which can support both thinking and non-thinking
+ self.enable_thinking = enable_thinking

  def extract_reasoning_content_streaming(
  self,
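The constructor now owns all reasoning-related switches instead of hard-coding <think>/</think> defaults. A small, hedged sketch of constructing it for an R1/QwQ-style model; the tag strings are the caller's choice here, since the defaults are now empty:

    from xinference.model.llm.reasoning_parser import ReasoningParser

    parser = ReasoningParser(
        reasoning_content=True,         # expose reasoning as "reasoning_content"
        reasoning_start_tag="<think>",  # caller-supplied; no longer the default
        reasoning_end_tag="</think>",
        enable_thinking=True,           # set False only for hybrid models such as qwen3
    )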
@@ -62,9 +75,9 @@ class ReasoningParser:
  delta["content"] = None
  return delta
  elif self.reasoning_start_tag in delta_text:
+ start_idx = delta_text.find(self.reasoning_start_tag)
  if self.reasoning_end_tag in delta_text:
  # <think> in delta, </think> in delta, extract reasoning content
- start_idx = delta_text.find(self.reasoning_start_tag)
  end_idx = delta_text.find(self.reasoning_end_tag)
  reasoning_content = delta_text[
  start_idx + len(self.reasoning_start_tag) : end_idx
@@ -79,7 +92,10 @@ class ReasoningParser:
  else:
  # <think> in delta, no </think> in delta,
  # reasoning content continues
- delta["reasoning_content"] = delta_text
+ reasoning_content = delta_text[
+ start_idx + len(self.reasoning_start_tag) :
+ ]
+ delta["reasoning_content"] = reasoning_content
  delta["content"] = None
  return delta
  else:
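These two hunks fix the streaming case where a delta contains the opening tag but not yet the closing tag: previously the whole delta text, tag included, was stored as reasoning_content; now only the text after the tag is kept. A tiny illustration (the delta string is invented for the example):

    delta_text = "<think>Let me check the units"   # opening tag seen, closing tag not yet
    start_idx = delta_text.find("<think>")

    # Old behaviour: reasoning_content was the whole delta, tag included.
    # New behaviour: slice past the tag before storing it.
    reasoning_content = delta_text[start_idx + len("<think>"):]
    assert reasoning_content == "Let me check the units"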
@@ -142,3 +158,263 @@ class ReasoningParser:
  if len(final_output) == 0:
  return reasoning_content, ""
  return reasoning_content, final_output
+
+ def check_content_parser(self) -> bool:
+ """Check if the parser should extract reasoning content.
+
+ Returns:
+ bool: True if reasoning content should be extracted, False otherwise
+ """
+ return self.reasoning_content
+
+ def _create_chat_completion_chunk(
+ self, chunk: Union[Dict[str, Any], CompletionChunk], content: str
+ ) -> ChatCompletionChunk:
+ """Helper method to create a ChatCompletionChunk with specified content.
+
+ Args:
+ chunk: The original chunk to copy metadata from
+ content: The content to include in the chunk
+
+ Returns:
+ ChatCompletionChunk: A new chat completion chunk
+ """
+ return ChatCompletionChunk(
+ id="chat" + chunk["id"],
+ model=chunk["model"],
+ created=chunk["created"],
+ object="chat.completion.chunk",
+ choices=[
+ {
+ "index": 0,
+ "delta": {
+ "content": content,
+ },
+ "finish_reason": None,
+ }
+ ],
+ )
+
+ def _create_completion_chunk(
+ self, chunk: Union[Dict[str, Any], CompletionChunk], text: str
+ ) -> CompletionChunk:
+ """Helper method to create a CompletionChunk with specified text.
+
+ Args:
+ chunk: The original chunk to copy metadata from
+ text: The text to include in the chunk
+
+ Returns:
+ CompletionChunk: A new completion chunk
+ """
+ return CompletionChunk(
+ id=chunk["id"],
+ model=chunk["model"],
+ created=chunk["created"],
+ object="text_completion",
+ choices=[
+ {
+ "index": 0,
+ "text": text,
+ "logprobs": None,
+ "finish_reason": None,
+ }
+ ],
+ )
+
+ async def prepare_reasoning_content_streaming(
+ self, chunks: AsyncGenerator[CompletionChunk, None]
+ ):
+ """Process the chunks from model output, check if the first chunk contains reasoning_start_tag,
+ if not, add a chunk with the tag at the beginning.
+
+ Args:
+ chunks (AsyncGenerator[CompletionChunk, None]): Chunks from model output
+
+ Yields:
+ AsyncGenerator[CompletionChunk, None]: Processed chunks
+ """
+
+ # If reasoning_start_tag is not set, or disable thinking for hybrid model like qwen3,
+ # yield chunks as is
+ if not self.reasoning_start_tag or not self.enable_thinking:
+ async for chunk in chunks:
+ yield chunk
+ return
+
+ # If chunks is empty, return
+ if not chunks:
+ return
+
+ # Flag to identify the first chunk
+ is_first_chunk = True
+
+ async for chunk in chunks:
+ if is_first_chunk:
+ # Reset the flag after processing the first chunk
+ is_first_chunk = False
+ choices = chunk.get("choices")
+ if not choices or not choices[0]:
+ continue
+ if (
+ chunk.get("object") == "chat.completion.chunk"
+ and "delta" in choices[0]
+ ):
+ # For chat completion chunks with delta format
+ delta = choices[0].get("delta")
+ if delta is None:
+ continue
+ assert isinstance(delta, dict)
+ text = delta.get("content")
+ if text is None:
+ continue
+ # If the first chunk doesn't contain the reasoning_start_tag
+ if self.reasoning_start_tag not in text:
+ # Create and yield chunks with reasoning_start_tag and newline
+ yield self._create_chat_completion_chunk(
+ chunk, f"{self.reasoning_start_tag}\n"
+ )
+ else:
+ # For standard completion chunks
+ text = choices[0].get("text")
+ if text is None:
+ continue
+ # If the first chunk doesn't contain the reasoning_start_tag
+ if self.reasoning_start_tag not in text:
+ # Create and yield chunks with reasoning_start_tag and newline
+ yield self._create_completion_chunk(
+ chunk, f"{self.reasoning_start_tag}\n"
+ )
+ # Yield the original first chunk
+ yield chunk
+ else:
+ # For non-first chunks, yield directly
+ yield chunk
+
+ def prepare_reasoning_content_sync(self, chunks: Iterator[CompletionChunk]):
+ """Process the chunks from model output, check if the first chunk contains reasoning_start_tag,
+ if not, add a chunk with the tag at the beginning. This is a synchronous version of
+ prepare_reasoning_content_streaming.
+
+ Args:
+ chunks (Iterator[CompletionChunk]): Chunks from model output
+
+ Returns:
+ Iterator[CompletionChunk]: Processed chunks
+ """
+ # If reasoning_start_tag is not set, or disable thinking for hybrid model like qwen3,
+ # yield chunks as is
+ if not self.reasoning_start_tag or not self.enable_thinking:
+ for chunk in chunks:
+ yield chunk
+ return
+
+ # Flag to identify the first chunk
+ is_first_chunk = True
+
+ for chunk in chunks:
+ if is_first_chunk:
+ # Reset the flag after processing the first chunk
+ is_first_chunk = False
+ choices = chunk.get("choices")
+ if not choices or not choices[0]:
+ continue
+ if (
+ chunk.get("object") == "chat.completion.chunk"
+ and "delta" in choices[0]
+ ):
+ # For chat completion chunks with delta format
+ delta = choices[0].get("delta")
+ if delta is None:
+ continue
+ assert isinstance(delta, dict)
+ text = delta.get("content")
+ if text is None:
+ continue
+ # If the first chunk doesn't contain the reasoning_start_tag
+ if self.reasoning_start_tag not in text:
+ # Create and yield chunks with reasoning_start_tag and newline
+ yield self._create_chat_completion_chunk(
+ chunk, f"{self.reasoning_start_tag}\n"
+ )
+ else:
+ # For standard completion chunks
+ text = choices[0].get("text")
+ if text is None:
+ continue
+ # If the first chunk doesn't contain the reasoning_start_tag
+ if self.reasoning_start_tag not in text:
+ # Create and yield chunks with reasoning_start_tag and newline
+ yield self._create_completion_chunk(
+ chunk, f"{self.reasoning_start_tag}\n"
+ )
+ # Yield the original first chunk
+ yield chunk
+ else:
+ # For non-first chunks, yield directly
+ yield chunk
+
+ def prepare_reasoning_content(self, completion):
+ """Ensures that the model output string starts with the reasoning_start_tag.
+
+ If the model_output is not a string (e.g., CompletionChoice), it extracts
+ the text content. If the reasoning_start_tag is not found in the text,
+ it prepends the tag to the text.
+
+ Args:
+ completion: The completion object containing model output,
+ which can be either a chat completion or a standard completion.
+ """
+ if not self.reasoning_start_tag or not self.enable_thinking:
+ return completion
+
+ if completion.get("object") == "chat.completion" and completion.get("choices"):
+ text = completion["choices"][0]["message"]["content"]
+ if self.reasoning_start_tag not in text:
+ text = f"{self.reasoning_start_tag}\n{text}"
+ completion["choices"][0]["message"]["content"] = text
+ return completion
+
+ text = completion["choices"][0]["text"]
+ if self.reasoning_start_tag not in text:
+ text = f"{self.reasoning_start_tag}\n{text}"
+ completion["choices"][0]["text"] = text
+ return completion
+
+ def prepare_first_reasoning_content_chunk(
+ self,
+ chunk: CompletionChunk,
+ ) -> List[ChatCompletionChunk]:
+ """Prepares the first chunk of a completion by adding reasoning_start_tag if needed.
+
+ This function checks if the first chunk contains the reasoning_start_tag. If not,
+ it creates two new chunks containing the reasoning_start_tag and a newline character
+ that will be inserted before the original chunk.
+
+ Args:
+ chunk (CompletionChunk): The first chunk of a completion to check and possibly modify
+
+ Returns:
+ List[ChatCompletionChunk]: A list of new chunks to insert before the original chunk,
+ or an empty list if no modification is needed
+ """
+ chunks: List[ChatCompletionChunk] = []
+ if not self.reasoning_start_tag or not self.enable_thinking:
+ return chunks
+
+ choices = chunk.get("choices")
+ if not choices or not choices[0]:
+ return chunks
+ text = choices[0].get("text")
+ if not text:
+ return chunks
+
+ if self.reasoning_start_tag not in text:
+ # Create chunks with reasoning_start_tag and newline
+ chunks.append(
+ self._create_chat_completion_chunk(
+ chunk, f"{self.reasoning_start_tag}\n"
+ )
+ )
+
+ return chunks
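These new prepare_* helpers normalize output from chat templates that consume the opening tag themselves, so that the model's reply appears to start mid-reasoning. A hedged sketch of the non-streaming path, reusing the constructor shown earlier and a hand-made completion payload (the payload is a minimal stand-in for an engine response, not real output):

    parser = ReasoningParser(
        reasoning_content=True,
        reasoning_start_tag="<think>",
        reasoning_end_tag="</think>",
    )

    # Stand-in for a response whose opening tag was swallowed by the chat template.
    completion = {
        "object": "chat.completion",
        "choices": [{"message": {"content": "Check the units first.</think>\nThe answer is 42."}}],
    }
    fixed = parser.prepare_reasoning_content(completion)
    # The missing opening tag is prepended, so later extraction of reasoning_content
    # can rely on a well-formed <think>...</think> pair.
    assert fixed["choices"][0]["message"]["content"].startswith("<think>")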
@@ -11,7 +11,7 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
-
+ import importlib.util
  import json
  import logging
  import sys
@@ -101,12 +101,17 @@ SGLANG_SUPPORTED_CHAT_MODELS = [
  "deepseek-v2-chat-0628",
  "qwen2.5-instruct",
  "qwen2.5-coder-instruct",
+ "XiYanSQL-QwenCoder-2504",
  "QwQ-32B-Preview",
  "QwQ-32B",
  "deepseek-r1-distill-qwen",
  "deepseek-r1-distill-llama",
  "deepseek-v3",
  "deepseek-r1",
+ "DianJin-R1",
+ "qwen3",
+ "HuatuoGPT-o1-Qwen2.5",
+ "HuatuoGPT-o1-LLaMA-3.1",
  ]
  SGLANG_SUPPORTED_VISION_MODEL_LIST = [
  "qwen2.5-vl-instruct",
@@ -154,7 +159,10 @@ class SGLANGModel(LLM):

  self._model_config = self._sanitize_model_config(self._model_config)
  reasoning_content = self._model_config.pop("reasoning_content")
- self.prepare_parse_reasoning_content(reasoning_content)
+ enable_thinking = self._model_config.pop("enable_thinking", False)
+ self.prepare_parse_reasoning_content(
+ reasoning_content, enable_thinking=enable_thinking
+ )

  # Fix: GH#2169
  if sgl.__version__ >= "0.2.14":
@@ -297,7 +305,11 @@ class SGLANGModel(LLM):
  return generate_config

  @classmethod
- def match(
+ def check_lib(cls) -> bool:
+ return importlib.util.find_spec("sglang") is not None
+
+ @classmethod
+ def match_json(
  cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
  ) -> bool:
  if not cls._has_cuda_device():
@@ -435,6 +447,7 @@ class SGLANGModel(LLM):
  async def async_generate(
  self,
  prompt: str,
+ *,
  image_data: Optional[Union[List[str], str]] = None,
  generate_config: Optional[SGLANGGenerateConfig] = None,
  request_id: Optional[str] = None,
@@ -524,7 +537,7 @@ class SGLANGModel(LLM):

  class SGLANGChatModel(SGLANGModel, ChatModelMixin):
  @classmethod
- def match(
+ def match_json(
  cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
  ) -> bool:
  if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8"]:
@@ -551,6 +564,7 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin):
  if self.model_family.stop:
  if (not generate_config.get("stop")) and self.model_family.stop:
  generate_config["stop"] = self.model_family.stop.copy()
+ generate_config.pop("chat_template_kwargs", None)
  return generate_config

  async def async_chat(
@@ -560,23 +574,31 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin):
  request_id: Optional[str] = None,
  ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]:
  assert self.model_family.chat_template is not None
- full_prompt = self.get_full_context(messages, self.model_family.chat_template)
+ full_context_kwargs = (
+ self._get_chat_template_kwargs_from_generate_config(
+ generate_config, self.reasoning_parser
+ )
+ or {}
+ )
+ full_prompt = self.get_full_context(
+ messages, self.model_family.chat_template, **full_context_kwargs
+ )

  generate_config = self._sanitize_chat_config(generate_config)
  stream = generate_config.get("stream", None)
  if stream:
- agen = await self.async_generate(full_prompt, generate_config) # type: ignore
+ agen = await self.async_generate(full_prompt, generate_config=generate_config) # type: ignore
  assert isinstance(agen, AsyncGenerator)
  return self._async_to_chat_completion_chunks(agen, self.reasoning_parser)
  else:
- c = await self.async_generate(full_prompt, generate_config) # type: ignore
+ c = await self.async_generate(full_prompt, generate_config=generate_config) # type: ignore
  assert not isinstance(c, AsyncGenerator)
  return self._to_chat_completion(c, self.reasoning_parser)


  class SGLANGVisionModel(SGLANGModel, ChatModelMixin):
  @classmethod
- def match(
+ def match_json(
  cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
  ) -> bool:
  if not cls._has_cuda_device():
@@ -627,7 +649,13 @@ class SGLANGVisionModel(SGLANGModel, ChatModelMixin):
  self.model_family.chat_template if self.model_family.chat_template else ""
  )

- prompt = self.get_full_context(messages, chat_template)
+ full_context_kwargs = (
+ self._get_chat_template_kwargs_from_generate_config(
+ generate_config, self.reasoning_parser
+ )
+ or {}
+ )
+ prompt = self.get_full_context(messages, chat_template, **full_context_kwargs)
  images, video_inputs = process_vision_info(messages)
  if video_inputs:
  raise ValueError("Not support video input now.")
@@ -650,10 +678,10 @@ class SGLANGVisionModel(SGLANGModel, ChatModelMixin):
  generate_config = self._sanitize_chat_config(generate_config)
  stream = generate_config.get("stream", None)
  if stream:
- agen = await self.async_generate(prompt, base64_images, generate_config) # type: ignore
+ agen = await self.async_generate(prompt, image_data=base64_images, generate_config=generate_config) # type: ignore
  assert isinstance(agen, AsyncGenerator)
  return self._async_to_chat_completion_chunks(agen, self.reasoning_parser)
  else:
- c = await self.async_generate(prompt, base64_images, generate_config) # type: ignore
+ c = await self.async_generate(prompt, image_data=base64_images, generate_config=generate_config) # type: ignore
  assert not isinstance(c, AsyncGenerator)
  return self._to_chat_completion(c, self.reasoning_parser)
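Across the backends touched in this release, the old match() classmethod is split into check_lib() (is the runtime library importable?) plus match_json() (does the backend fit the model spec from the JSON family files?). The following is a schematic, hedged illustration of the resulting dispatch; it is not the actual selector code in xinference/model/llm/__init__.py, and the backend/spec objects are placeholders:

    import importlib.util

    class FakeBackend:
        @classmethod
        def check_lib(cls) -> bool:
            # e.g. "sglang", "mlx_lm", "mlx_vlm" in the hunks above
            return importlib.util.find_spec("sglang") is not None

        @classmethod
        def match_json(cls, llm_family, llm_spec, quantization) -> bool:
            return llm_spec.model_format in ("pytorch", "gptq", "awq", "fp8")

    def select_backend(backends, llm_family, llm_spec, quantization):
        # Try each backend: it must both have its library installed and match the spec.
        for cls in backends:
            if cls.check_lib() and cls.match_json(llm_family, llm_spec, quantization):
                return cls
        return None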
@@ -84,7 +84,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
  return model, tokenizer

  @classmethod
- def match(
+ def match_json(
  cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
  ) -> bool:
  if llm_spec.model_format != "pytorch":
@@ -462,6 +462,12 @@ class ChatglmPytorchChatModel(PytorchChatModel):
  tools = list(tools) if tools is not None else None
  tool_choice = r.generate_config.get("tool_choice", "none")

+ full_context_kwargs = (
+ self._get_chat_template_kwargs_from_generate_config(
+ r.generate_config, self.reasoning_parser
+ )
+ or {}
+ )
  r.prompt = self._process_messages(
  r.prompt, tools=tools, tool_choice=tool_choice
  )
@@ -469,6 +475,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
  r.prompt,
  self.model_family.chat_template, # type: ignore
  tokenizer=self._tokenizer,
+ **full_context_kwargs,
  )
  if tools:
  r.tools = tools
@@ -501,7 +508,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):

  if "<bos_stream>" in req.completion:
  bos_pos = req.completion.index("<bos_stream>")
- results.append(
+ results.extend(
  self._get_first_chat_completion_chunk(req.completion[bos_pos + 1])
  )

@@ -46,8 +46,8 @@ class CogAgentChatModel(PytorchChatModel):
  self._device = None
  self._tokenizer = None
  self._model = None
- self._platform: Literal["Mac", "WIN", "Mobile"] | None = "Mac"
- self._format: Literal[
+ self._platform: Literal["Mac", "WIN", "Mobile"] | None = "Mac" # type: ignore
+ self._format: Literal[ # type: ignore
  "(Answer in Action-Operation-Sensitive format.)",
  "(Answer in Status-Plan-Action-Operation format.)",
  "(Answer in Status-Action-Operation-Sensitive format.)",
@@ -56,7 +56,7 @@ class CogAgentChatModel(PytorchChatModel):
  ] | None = "(Answer in Action-Operation-Sensitive format.)"

  @classmethod
- def match(
+ def match_json(
  cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
  ) -> bool:
  family = model_family.model_family or model_family.model_name
@@ -64,8 +64,8 @@ class CogAgentChatModel(PytorchChatModel):
  return True
  return False

- def load(self, **kwargs):
- from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+ def load(self):
+ from transformers import AutoModelForCausalLM, AutoTokenizer
  device = self._pytorch_model_config.get("device", "auto")
  self._device = select_device(device)
@@ -73,19 +73,14 @@ class CogAgentChatModel(PytorchChatModel):
  self._tokenizer = AutoTokenizer.from_pretrained(
  self.model_path, trust_remote_code=True
  )
- if self.quantization == "4-bit":
- quantization_config = BitsAndBytesConfig(load_in_4bit=True)
- elif self.quantization == "8-bit":
- quantization_config = BitsAndBytesConfig(load_in_8bit=True)
- else:
- quantization_config = None
+ kwargs = self.apply_bnb_quantization()

  self._model = AutoModelForCausalLM.from_pretrained(
  self.model_path,
  torch_dtype=torch.bfloat16,
  trust_remote_code=True,
  device_map=self._device,
- quantization_config=quantization_config,
+ **kwargs,
  ).eval()

  def _message_content_to_cogagent(self, content):
@@ -211,6 +206,9 @@ class CogAgentChatModel(PytorchChatModel):
  "return_tensors": "pt",
  "return_dict": True,
  }
+ full_context_kwargs.update(
+ self._get_chat_template_kwargs_from_generate_config(generate_config, self.reasoning_parser) or {} # type: ignore
+ )
  assert self.model_family.chat_template is not None
  inputs = self.get_full_context(
  [{"role": "user", "image": image, "content": query}],
@@ -64,7 +64,7 @@ class CogVLM2Model(PytorchChatModel):
  self._model = None

  @classmethod
- def match(
+ def match_json(
  cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
  ) -> bool:
  family = model_family.model_family or model_family.model_name
@@ -72,7 +72,7 @@ class CogVLM2Model(PytorchChatModel):
  return True
  return False

- def load(self, **kwargs):
+ def load(self):
  from transformers import AutoModelForCausalLM, AutoTokenizer
  from transformers.generation import GenerationConfig

@@ -88,6 +88,8 @@ class CogVLM2Model(PytorchChatModel):
  self._model, self._tokenizer = self._load_tensorizer()
  return

+ kwargs = self.apply_bnb_quantization()
+
  self._tokenizer = AutoTokenizer.from_pretrained(
  self.model_path,
  trust_remote_code=True,
@@ -99,6 +101,7 @@ class CogVLM2Model(PytorchChatModel):
  trust_remote_code=True,
  low_cpu_mem_usage=True,
  device_map="auto",
+ **kwargs
  ).eval()

  # Specify hyperparameters for generation
@@ -313,7 +316,7 @@ class CogVLM2Model(PytorchChatModel):
  def get_dtype(self):
  return self._torch_type

- def _get_full_prompt(self, messages: List[Dict], tools):
+ def _get_full_prompt(self, messages: List[Dict], tools, generate_config: dict): # type: ignore
  prompt, system_prompt, chat_history = parse_messages(messages)
  system_prompt = system_prompt or ""
  query, image, history = self.get_query_and_history(
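The CogAgent and CogVLM2 load() hunks above replace the inline BitsAndBytesConfig branching with a shared apply_bnb_quantization() helper whose result is splatted into from_pretrained(). Its implementation is not shown on this page; judging only from the removed code and the call sites, it plausibly behaves like the following hedged sketch (the real helper is a method on the transformers backend base class and may differ in detail):

    from transformers import BitsAndBytesConfig

    def apply_bnb_quantization(quantization: str) -> dict:
        # Returns kwargs suitable for **-expansion into AutoModel*.from_pretrained().
        if quantization == "4-bit":
            return {"quantization_config": BitsAndBytesConfig(load_in_4bit=True)}
        if quantization == "8-bit":
            return {"quantization_config": BitsAndBytesConfig(load_in_8bit=True)}
        return {}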