xinference 0.9.4__py3-none-any.whl → 0.10.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/oauth2/auth_service.py +47 -18
- xinference/api/oauth2/types.py +1 -0
- xinference/api/restful_api.py +34 -7
- xinference/client/oscar/actor_client.py +4 -3
- xinference/client/restful/restful_client.py +20 -4
- xinference/conftest.py +13 -2
- xinference/core/supervisor.py +48 -1
- xinference/core/worker.py +139 -20
- xinference/deploy/cmdline.py +119 -20
- xinference/model/embedding/core.py +1 -2
- xinference/model/llm/__init__.py +4 -6
- xinference/model/llm/ggml/llamacpp.py +2 -10
- xinference/model/llm/llm_family.json +877 -13
- xinference/model/llm/llm_family.py +15 -0
- xinference/model/llm/llm_family_modelscope.json +571 -0
- xinference/model/llm/pytorch/chatglm.py +2 -0
- xinference/model/llm/pytorch/core.py +22 -26
- xinference/model/llm/pytorch/deepseek_vl.py +232 -0
- xinference/model/llm/pytorch/internlm2.py +2 -0
- xinference/model/llm/pytorch/omnilmm.py +153 -0
- xinference/model/llm/pytorch/qwen_vl.py +2 -0
- xinference/model/llm/pytorch/yi_vl.py +4 -2
- xinference/model/llm/utils.py +53 -5
- xinference/model/llm/vllm/core.py +54 -6
- xinference/model/rerank/core.py +3 -0
- xinference/thirdparty/deepseek_vl/__init__.py +31 -0
- xinference/thirdparty/deepseek_vl/models/__init__.py +28 -0
- xinference/thirdparty/deepseek_vl/models/clip_encoder.py +242 -0
- xinference/thirdparty/deepseek_vl/models/image_processing_vlm.py +208 -0
- xinference/thirdparty/deepseek_vl/models/modeling_vlm.py +170 -0
- xinference/thirdparty/deepseek_vl/models/processing_vlm.py +390 -0
- xinference/thirdparty/deepseek_vl/models/projector.py +100 -0
- xinference/thirdparty/deepseek_vl/models/sam.py +593 -0
- xinference/thirdparty/deepseek_vl/models/siglip_vit.py +681 -0
- xinference/thirdparty/deepseek_vl/utils/__init__.py +18 -0
- xinference/thirdparty/deepseek_vl/utils/conversation.py +348 -0
- xinference/thirdparty/deepseek_vl/utils/io.py +78 -0
- xinference/thirdparty/omnilmm/__init__.py +0 -0
- xinference/thirdparty/omnilmm/chat.py +216 -0
- xinference/thirdparty/omnilmm/constants.py +4 -0
- xinference/thirdparty/omnilmm/conversation.py +332 -0
- xinference/thirdparty/omnilmm/model/__init__.py +1 -0
- xinference/thirdparty/omnilmm/model/omnilmm.py +594 -0
- xinference/thirdparty/omnilmm/model/resampler.py +166 -0
- xinference/thirdparty/omnilmm/model/utils.py +563 -0
- xinference/thirdparty/omnilmm/train/__init__.py +13 -0
- xinference/thirdparty/omnilmm/train/train_utils.py +150 -0
- xinference/thirdparty/omnilmm/utils.py +134 -0
- xinference/types.py +15 -19
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.76ef2b17.js +3 -0
- xinference/web/ui/build/static/js/main.76ef2b17.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/15e2cf8cd8d0989719b6349428ff576f9009ff4c2dcc52378be0bd938e82495e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/35d0e4a317e5582cbb79d901302e9d706520ac53f8a734c2fd8bfde6eb5a4f02.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3c2f277c93c5f1638e08db38df0d0fb4e58d1c5571aea03241a5c04ff4094704.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3fa1f69162f9c6dc0f6a6e21b64d49d6b8e6fa8dfa59a82cf829931c5f97d99f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/44774c783428f952d8e2e4ad0998a9c5bc16a57cd9c68b7c5ff18aaa5a41d65c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5393569d846332075b93b55656716a34f50e0a8c970be789502d7e6c49755fd7.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/59ce49eae0f486af4c5034d4d2f9ca77c3ec3a32ecc560085caf5ef482b5f4c9.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/62e257ed9016471035fa1a7da57c9e2a4250974ed566b4d1295873d747c68eb2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/63a4c48f0326d071c7772c46598215c006ae41fd3d4ff3577fe717de66ad6e89.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b9cbcb6d77ba21b22c6950b6fb5b305d23c19cf747f99f7d48b6b046f8f7b1b0.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d06a96a3c9c32e42689094aa3aaad41c8125894e956b8f84a70fadce6e3f65b3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d076fd56cf3b15ed2433e3744b98c6b4e4410a19903d1db4de5bba0e1a1b3347.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/daad8131d91134f6d7aef895a0c9c32e1cb928277cb5aa66c01028126d215be0.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/de0299226173b0662b573f49e3992220f6611947073bd66ac079728a8bc8837d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e606671420d2937102c3c34b4b04056c11736408c1d3347b8cf42dfe61fb394b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e6eccc9aa641e7da833492e27846dc965f9750281420977dc84654ca6ed221e4.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e9b52d171223bb59fb918316297a051cdfd42dd453e8260fd918e90bc0a4ebdf.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f16aec63602a77bd561d0e67fa00b76469ac54b8033754bba114ec5eb3257964.json +1 -0
- {xinference-0.9.4.dist-info → xinference-0.10.1.dist-info}/METADATA +25 -12
- {xinference-0.9.4.dist-info → xinference-0.10.1.dist-info}/RECORD +79 -58
- xinference/model/llm/ggml/ctransformers.py +0 -281
- xinference/model/llm/ggml/ctransformers_util.py +0 -161
- xinference/web/ui/build/static/js/main.66b1c4fb.js +0 -3
- xinference/web/ui/build/static/js/main.66b1c4fb.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0bd70b1ecf307e2681318e864f4692305b6350c8683863007f4caf2f9ac33b6e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0db651c046ef908f45cde73af0dbea0a797d3e35bb57f4a0863b481502103a64.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/18e5d5422e2464abf4a3e6d38164570e2e426e0a921e9a2628bbae81b18da353.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3d93bd9a74a1ab0cec85af40f9baa5f6a8e7384b9e18c409b95a81a7b45bb7e2.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3e055de705e397e1d413d7f429589b1a98dd78ef378b97f0cdb462c5f2487d5e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/4fd24800544873512b540544ae54601240a5bfefd9105ff647855c64f8ad828f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/52aa27272b4b9968f62666262b47661cb1992336a2aff3b13994cc36877b3ec3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/60c4b98d8ea7479fb0c94cfd19c8128f17bd7e27a1e73e6dd9adf6e9d88d18eb.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/7e094845f611802b024b57439cbf911038169d06cdf6c34a72a7277f35aa71a4.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/95c8cc049fadd23085d8623e1d43d70b614a4e52217676f186a417dca894aa09.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/98b7ef307f436affe13d75a4f265b27e828ccc2b10ffae6513abe2681bc11971.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/a8070ce4b780b4a044218536e158a9e7192a6c80ff593fdc126fee43f46296b5.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/b400cfc9db57fa6c70cd2bad055b73c5079fde0ed37974009d898083f6af8cd8.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/bd04667474fd9cac2983b03725c218908a6cc0ee9128a5953cd00d26d4877f60.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/c2124cfe036b26befcbd386d1d17743b1a58d0b7a041a17bb67f9924400d63c3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/c230a727b8f68f0e62616a75e14a3d33026dc4164f2e325a9a8072d733850edb.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d44a6eb6106e09082b691a315c9f6ce17fcfe25beb7547810e0d271ce3301cd2.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e1d9b2ae4e1248658704bc6bfc5d6160dcd1a9e771ea4ae8c1fed0aaddeedd29.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/fd4a8ae5d192331af1bedd1d2d70efcc569708ee6cc4cb479b225d059482aa81.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/fe5db70859503a54cbe71f9637e5a314cda88b1f0eecb733b6e6f837697db1ef.json +0 -1
- /xinference/web/ui/build/static/js/{main.66b1c4fb.js.LICENSE.txt → main.76ef2b17.js.LICENSE.txt} +0 -0
- {xinference-0.9.4.dist-info → xinference-0.10.1.dist-info}/LICENSE +0 -0
- {xinference-0.9.4.dist-info → xinference-0.10.1.dist-info}/WHEEL +0 -0
- {xinference-0.9.4.dist-info → xinference-0.10.1.dist-info}/entry_points.txt +0 -0
- {xinference-0.9.4.dist-info → xinference-0.10.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/pytorch/deepseek_vl.py
ADDED

@@ -0,0 +1,232 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import base64
+import logging
+import os.path
+import tempfile
+import time
+import uuid
+from concurrent.futures import ThreadPoolExecutor
+from io import BytesIO
+from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
+
+import requests
+import torch
+
+from ....model.utils import select_device
+from ....types import (
+    ChatCompletion,
+    ChatCompletionChoice,
+    ChatCompletionChunk,
+    ChatCompletionMessage,
+    CompletionUsage,
+)
+from ..llm_family import LLMFamilyV1, LLMSpecV1
+from .core import PytorchChatModel, PytorchGenerateConfig
+
+logger = logging.getLogger(__name__)
+
+
+class DeepSeekVLChatModel(PytorchChatModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._tokenizer = None
+        self._model = None
+        self._vl_chat_processor = None
+        self._type = None
+
+    @classmethod
+    def match(
+        cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
+    ) -> bool:
+        if "deepseek" in model_family.model_name:
+            return True
+        return False
+
+    def load(self):
+        from transformers import AutoModelForCausalLM
+
+        from ....thirdparty.deepseek_vl.models import (
+            MultiModalityCausalLM,
+            VLChatProcessor,
+        )
+
+        self._device = self._pytorch_model_config.get("device", "auto")
+        self._device = select_device(self._device)
+        self._type = torch.float16 if self._device == "mps" else torch.bfloat16
+
+        # specify the path to the model
+        self._vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(
+            self.model_path
+        )
+        self._tokenizer = self._vl_chat_processor.tokenizer
+
+        vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
+            self.model_path, trust_remote_code=True, device_map=self._device
+        )
+        self._model = vl_gpt.to(self._type).eval()
+
+    @staticmethod
+    def _message_content_to_deepseek(content) -> Tuple[str, List[str]]:
+        def _ensure_url(_url):
+            if _url.startswith("data:"):
+                logging.info("Parse url by base64 decoder.")
+                # https://platform.openai.com/docs/guides/vision/uploading-base-64-encoded-images
+                # e.g. f"data:image/jpeg;base64,{base64_image}"
+                _type, data = _url.split(";")
+                _, ext = _type.split("/")
+                data = data[len("base64,") :]
+                data = base64.b64decode(data.encode("utf-8"))
+
+                with tempfile.NamedTemporaryFile(suffix=f".{ext}", delete=False) as f:
+                    f.write(data)
+                logging.info("Dump base64 data to %s", f.name)
+                return f.name
+            else:
+                if len(_url) > 2048:
+                    raise Exception(f"Image url is too long, {len(_url)} > 2048.")
+
+                return _url
+
+        def _download(_images):
+            local_images = []
+
+            # To make requests.get works
+            headers = {
+                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
+            }
+            with ThreadPoolExecutor() as executor:
+                for url in images:
+                    try:
+                        if os.path.exists(url):
+                            local_images.append(url)
+                            continue
+                    except Exception as e:
+                        logger.debug("Image is remote: %s, e: %s", url, e)
+                        pass
+                    # Append a placeholder
+                    local_images.append(None)
+
+                    def _fill_placeholder(_url, _index):
+                        response = requests.get(url, headers=headers)
+                        local_images[_index] = BytesIO(response.content)
+
+                    executor.submit(_fill_placeholder, url, len(local_images) - 1)
+            return local_images
+
+        if not isinstance(content, str):
+            # TODO(codingl2k1): Optimize _ensure_url
+
+            images = []
+            new_content = []
+            for c in content:
+                c_type = c.get("type")
+                if c_type == "image_url":
+                    images.append(_ensure_url(c["image_url"]["url"]))
+                elif c_type == "text":
+                    new_content.append(c["text"])
+            if images:
+                new_content.insert(0, "<image_placeholder>")
+                images = _download(images)
+            return "".join(new_content), images
+        return content, []
+
+    def chat(
+        self,
+        prompt: Union[str, List[Dict]],
+        system_prompt: Optional[str] = None,
+        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        generate_config: Optional[PytorchGenerateConfig] = None,
+    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+        if generate_config and generate_config.get("stream"):
+            raise Exception(
+                f"Chat with model {self.model_family.model_name} does not support stream."
+            )
+        prompt, images = self._message_content_to_deepseek(prompt)
+        prompt_messages: List[Dict[str, Any]] = [
+            {
+                "role": "User",
+                "content": prompt,
+            },
+            {"role": "Assistant", "content": ""},
+        ]
+        if images:
+            prompt_messages[0]["images"] = images
+
+        # Convert openai history to qwen vl history
+        deepseek_history = []
+        for h in chat_history or []:
+            role = h["role"]
+            if role == "user":
+                content, images = self._message_content_to_deepseek(h["content"])
+                msg: Dict[str, Any] = {
+                    "role": "User",
+                    "content": content,
+                }
+                if images:
+                    msg["images"] = images
+                deepseek_history.append(msg)
+            elif role == "assistant":
+                deepseek_history.append({"role": "Assistant", "content": h["content"]})
+            else:
+                logger.error("Unexpected msg in chat history: %s", h)
+
+        deepseek_history.extend(prompt_messages)
+
+        from ....thirdparty.deepseek_vl.utils.io import load_pil_images
+
+        # load images and prepare for inputs
+        pil_images = load_pil_images(deepseek_history)
+        prepare_inputs = self._vl_chat_processor(
+            conversations=deepseek_history, images=pil_images, force_batchify=True
+        ).to(self._model.device, self._model.dtype)
+
+        # run image encoder to get the image embeddings
+        inputs_embeds = self._model.prepare_inputs_embeds(**prepare_inputs)
+
+        # run the model to get the response
+        outputs = self._model.language_model.generate(
+            inputs_embeds=inputs_embeds,
+            attention_mask=prepare_inputs.attention_mask,
+            pad_token_id=self._tokenizer.eos_token_id,
+            bos_token_id=self._tokenizer.bos_token_id,
+            eos_token_id=self._tokenizer.eos_token_id,
+            max_new_tokens=512,
+            do_sample=True,
+            top_p=0.95,
+            temperature=0.2,
+            repetition_penalty=1.1,
+            use_cache=True,
+        )
+
+        answer = self._tokenizer.decode(
+            outputs[0].cpu().tolist(), skip_special_tokens=True
+        )
+
+        return ChatCompletion(
+            id="chat" + str(uuid.uuid1()),
+            object="chat.completion",
+            created=int(time.time()),
+            model=self.model_uid,
+            choices=[
+                ChatCompletionChoice(
+                    index=0,
+                    message={"role": "assistant", "content": answer},
+                    finish_reason="stop",
+                )
+            ],
+            usage=CompletionUsage(
+                prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
+            ),
+        )
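Note: the new DeepSeekVLChatModel accepts OpenAI vision-style content for the user message, a list of "text" and "image_url" parts where the image may be an http(s) URL or a base64 data URI. A minimal illustrative payload follows (a rough sketch; the file name and wording are made up, not taken from this diff):

    # Illustrative payload for _message_content_to_deepseek(); values are examples only.
    import base64

    with open("example.jpg", "rb") as f:  # hypothetical local image
        b64_image = base64.b64encode(f.read()).decode("utf-8")

    prompt = [
        {"type": "text", "text": "Describe this image."},
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{b64_image}"},
        },
    ]
    # The handler above converts this into the pair
    # ("<image_placeholder>Describe this image.", [<path of the decoded temp file>]).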
xinference/model/llm/pytorch/omnilmm.py
ADDED

@@ -0,0 +1,153 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import base64
+import json
+import logging
+import operator
+import tempfile
+import time
+import uuid
+from typing import Dict, Iterator, List, Optional, Tuple, Union
+
+from ....thirdparty.omnilmm.chat import OmniLMMChat, img2base64
+from ....types import (
+    ChatCompletion,
+    ChatCompletionChoice,
+    ChatCompletionChunk,
+    ChatCompletionMessage,
+    CompletionUsage,
+)
+from ...utils import select_device
+from ..llm_family import LLMFamilyV1, LLMSpecV1
+from .core import PytorchChatModel, PytorchGenerateConfig
+
+logger = logging.getLogger(__name__)
+
+
+class OmniLMMModel(PytorchChatModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._model = None
+
+    @classmethod
+    def match(
+        cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
+    ) -> bool:
+        if "OmniLMM" in model_family.model_name:
+            return True
+        return False
+
+    def load(self):
+        device = self._pytorch_model_config.get("device", "auto")
+        device = select_device(device)
+
+        self._model = OmniLMMChat(self.model_path, device_map=device)
+
+    def _message_content_to_OmniLMM(
+        self, content
+    ) -> Tuple[List[Dict[str, str]], List[Dict[str, str]]]:
+        def _ensure_url(_url):
+            if _url.startswith("data:"):
+                logging.info("Parse url by base64 decoder.")
+                # https://platform.openai.com/docs/guides/vision/uploading-base-64-encoded-images
+                # e.g. f"data:image/jpeg;base64,{base64_image}"
+                _type, data = _url.split(";")
+                _, ext = _type.split("/")
+                data = data[len("base64,") :]
+                data = base64.b64decode(data.encode("utf-8"))
+
+                with tempfile.NamedTemporaryFile(suffix=f".{ext}", delete=False) as f:
+                    f.write(data)
+                logging.info("Dump base64 data to %s", f.name)
+                return f.name
+            else:
+                if len(_url) > 2048:
+                    raise Exception(f"Image url is too long, {len(_url)} > 2048.")
+                return _url
+
+        if not isinstance(content, str):
+            images = []
+            other_content = []
+
+            for c in content:
+                if c.get("type") == "image_url":
+                    images.append(
+                        {"image": _ensure_url(c["image_url"]["url"]), "type": "image"}
+                    )
+                else:
+                    other_content.append(c)
+
+            images = sorted(images, key=operator.itemgetter("type"))
+            other_content = sorted(other_content, key=operator.itemgetter("type"))
+
+            return images, other_content
+        return [], [{"type": "text", "text": content}]
+
+    def chat(
+        self,
+        prompt: Union[str, List[Dict]],
+        system_prompt: Optional[str] = None,
+        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        generate_config: Optional[PytorchGenerateConfig] = None,
+    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+        if generate_config and generate_config.get("stream"):
+            raise Exception(
+                f"Chat with model {self.model_family.model_name} does not support stream."
+            )
+        image_first, prompt = self._message_content_to_OmniLMM(prompt)
+
+        msgs = []
+        query_to_response: List[Dict] = []
+        image_another = []
+        for h in chat_history or []:
+            role = h["role"]
+            image_tmp, content = self._message_content_to_OmniLMM(h["content"])
+            if image_tmp != []:
+                image_another = image_tmp
+            if len(query_to_response) == 0 and role == "user":
+                query_to_response.append(
+                    {"role": "user", "content": content[0]["text"]}
+                )
+            if len(query_to_response) == 1 and role == "assistant":
+                query_to_response.append(
+                    {"role": "assistant", "content": content[0]["text"]}
+                )
+            if len(query_to_response) == 2:
+                msgs.extend(query_to_response)
+                query_to_response = []
+        if image_first != []:
+            image = image_first
+        if image_another != []:
+            image = image_another
+        im_64 = img2base64(image[0]["image"])
+        msgs.append({"role": "user", "content": prompt[0]["text"]})
+        input = {"image": im_64, "question": json.dumps(msgs, ensure_ascii=True)}
+        answer = self._model.chat(input=input)
+
+        return ChatCompletion(
+            id="chat" + str(uuid.uuid1()),
+            object="chat.completion",
+            created=int(time.time()),
+            model=self.model_uid,
+            choices=[
+                ChatCompletionChoice(
+                    index=0,
+                    message={"role": "assistant", "content": answer},
+                    finish_reason="stop",
+                )
+            ],
+            usage=CompletionUsage(
+                prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
+            ),
+        )
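Both new multimodal handlers plug into the existing chat interface. The sketch below shows how a client might exercise one of them; the endpoint, the model name "deepseek-vl-chat", and the launch parameters are assumptions for illustration only, and the names actually registered live in the updated llm_family.json:

    # Rough client-side sketch, not part of the diff; names and endpoint are assumed.
    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")  # assumed local endpoint
    model_uid = client.launch_model(
        model_name="deepseek-vl-chat",  # assumed registered name
        model_format="pytorch",
    )
    model = client.get_model(model_uid)
    completion = model.chat(
        prompt=[
            {"type": "text", "text": "What is in this picture?"},
            {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
        ]
    )
    print(completion["choices"][0]["message"]["content"])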
xinference/model/llm/pytorch/qwen_vl.py
CHANGED

@@ -53,6 +53,8 @@ class QwenVLChatModel(PytorchChatModel):
 
         device = self._pytorch_model_config.get("device", "auto")
         device = select_device(device)
+        # for multiple GPU, set back to auto to make multiple devices work
+        device = "auto" if device == "cuda" else device
 
         self._tokenizer = AutoTokenizer.from_pretrained(
             self.model_path,
xinference/model/llm/pytorch/yi_vl.py
CHANGED

@@ -59,6 +59,8 @@ class YiVLChatModel(PytorchChatModel):
 
         self._device = self._pytorch_model_config.get("device", "auto")
         self._device = select_device(self._device)
+        # for multiple GPU, set back to auto to make multiple devices work
+        self._device = "auto" if self._device == "cuda" else self._device
 
         key_info["model_path"] = self.model_path
         # Default device_map is auto, it can loads model to multiple cards.
@@ -190,7 +192,7 @@ class YiVLChatModel(PytorchChatModel):
                 prompt, self._tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
             )
             .unsqueeze(0)
-            .to(self._device)
+            .to(self._model.device)
         )
 
         images = state.get_images(return_pil=True)
@@ -215,7 +217,7 @@ class YiVLChatModel(PytorchChatModel):
             "input_ids": input_ids,
             "images": image_tensor.unsqueeze(0)
             .to(dtype=torch.bfloat16)
-            .to(self._device),
+            .to(self._model.device),
             "streamer": streamer,
             "do_sample": True,
             "top_p": float(top_p),
xinference/model/llm/utils.py
CHANGED

@@ -163,7 +163,7 @@ class ChatModelMixin:
 
            for i, message in enumerate(chat_history):
                role = get_role(message["role"])
-                content = message["content"]
+                content = message.get("content")
                tool_calls = message.get("tool_calls")
                if tool_calls:
                    content = tool_calls[0]["function"]
@@ -248,7 +248,7 @@ Begin!"""
            ret = f"<|im_start|>system\n{prompt_style.system_prompt}<|im_end|>"
            for message in chat_history:
                role = get_role(message["role"])
-                content = message["content"]
+                content = message.get("content")
 
                ret += prompt_style.intra_message_sep
                if tools:
@@ -421,6 +421,16 @@ Begin!"""
                else:
                    ret += f"{role}".rstrip()
            return ret
+        elif prompt_style.style_name == "MINICPM-2B":
+            ret = ""
+            for message in chat_history:
+                content = message["content"] or ""
+                role = get_role(message["role"])
+                if role == "user":
+                    ret += "<用户>" + content.strip()
+                else:
+                    ret += "<AI>" + content.strip()
+            return ret
        else:
            raise ValueError(f"Invalid prompt style: {prompt_style.style_name}")
 
@@ -436,6 +446,11 @@ Begin!"""
                "index": i,
                "delta": {
                    "content": choice["text"],
+                    **(
+                        {"tool_calls": choice["tool_calls"]}
+                        if "tool_calls" in choice
+                        else {}
+                    ),
                },
                "finish_reason": choice["finish_reason"],
            }
@@ -582,10 +597,9 @@ Begin!"""
        return text, None, None
 
    @classmethod
-    def _tool_calls_completion(cls, model_family, model_uid, c, tools):
-        _id = str(uuid.uuid4())
+    def _eval_tool_arguments(cls, model_family, c, tools):
        family = model_family.model_family or model_family.model_name
-        if "gorilla-openfunctions-v1" == family:
+        if family in ["gorilla-openfunctions-v1", "gorilla-openfunctions-v2"]:
            content, func, args = cls._eval_gorilla_openfunctions_arguments(c, tools)
        elif "chatglm3" == family:
            content, func, args = cls._eval_chatglm3_arguments(c, tools)
@@ -596,7 +610,41 @@ Begin!"""
                f"Model {model_family.model_name} is not support tool calls."
            )
        logger.debug("Tool call content: %s, func: %s, args: %s", content, func, args)
+        return content, func, args
 
+    @classmethod
+    def _tools_token_filter(cls, model_family):
+        """
+        Generates a filter function for Qwen series models to retain outputs after "\nFinal Answer:".
+
+        Returns:
+            A function that takes tokens (string output by the model so far) as input
+            returns True if current token is after "\nFinal Answer:", else False.
+        """
+        family = model_family.model_family or model_family.model_name
+        if family in ["qwen-chat", "qwen1.5-chat"]:
+            # Encapsulating function to reset 'found' after each call
+            found = False
+
+            def process_token(tokens: str):
+                nonlocal found
+                # Once "Final Answer:" is found, future tokens are allowed.
+                if found:
+                    return True
+                # Check if the token ends with "\nFinal Answer:" and update `found`.
+                if tokens.endswith("\nFinal Answer:"):
+                    found = True
+                return False
+
+            return process_token
+        else:
+            # For other families, allow all tokens.
+            return lambda tokens: True
+
+    @classmethod
+    def _tool_calls_completion(cls, model_family, model_uid, c, tools):
+        _id = str(uuid.uuid4())
+        content, func, args = cls._eval_tool_arguments(model_family, c, tools)
        if func:
            m = {
                "role": "assistant",
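The new _tools_token_filter helper returns a closure that, for Qwen-family models, suppresses streamed text until the cumulative output ends with "\nFinal Answer:"; everything produced before that marker (the ReAct thought/action trace) is filtered out, and everything after it passes through. A standalone restatement of that logic, for illustration only:

    # Mirrors the filter semantics added above; not part of the diff.
    def make_qwen_filter():
        found = False

        def process_token(tokens: str) -> bool:
            nonlocal found
            if found:  # marker already seen: let everything through
                return True
            if tokens.endswith("\nFinal Answer:"):
                found = True  # marker just completed; this call itself is suppressed
            return False

        return process_token

    allow = make_qwen_filter()
    assert allow("Thought: I need no tool.") is False
    assert allow("Thought: I need no tool.\nFinal Answer:") is False
    assert allow("Thought: I need no tool.\nFinal Answer: 42") is True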
xinference/model/llm/vllm/core.py
CHANGED

@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import json
 import logging
 import multiprocessing
 import time
@@ -36,6 +37,8 @@ from ....types import (
     CompletionChoice,
     CompletionChunk,
     CompletionUsage,
+    ToolCallFunction,
+    ToolCalls,
 )
 from .. import LLM, LLMFamilyV1, LLMSpecV1
 from ..llm_family import CustomLLMFamilyV1
@@ -80,25 +83,36 @@ try:
 except ImportError:
     VLLM_INSTALLED = False
 
-VLLM_SUPPORTED_MODELS = [
+VLLM_SUPPORTED_MODELS = [
+    "llama-2",
+    "baichuan",
+    "internlm-16k",
+    "mistral-v0.1",
+    "Yi",
+    "code-llama",
+    "code-llama-python",
+]
 VLLM_SUPPORTED_CHAT_MODELS = [
     "llama-2-chat",
     "vicuna-v1.3",
     "vicuna-v1.5",
     "baichuan-chat",
+    "baichuan-2-chat",
     "internlm-chat-7b",
     "internlm-chat-8k",
     "internlm-chat-20b",
+    "internlm2-chat",
     "qwen-chat",
-    "Yi",
     "Yi-chat",
-    "code-llama",
-    "code-llama-python",
     "code-llama-instruct",
     "mistral-instruct-v0.1",
     "mistral-instruct-v0.2",
     "mixtral-instruct-v0.1",
     "chatglm3",
+    "chatglm3-32k",
+    "chatglm3-128k",
+    "deepseek-chat",
+    "deepseek-coder-instruct",
 ]
 if VLLM_INSTALLED and vllm.__version__ >= "0.3.0":
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen1.5-chat")
@@ -110,6 +124,9 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.3.3":
     VLLM_SUPPORTED_CHAT_MODELS.append("orion-chat")
     VLLM_SUPPORTED_CHAT_MODELS.append("orion-chat-rag")
 
+if VLLM_INSTALLED and vllm.__version__ >= "0.4.0":
+    VLLM_SUPPORTED_CHAT_MODELS.append("qwen1.5-moe-chat")
+
 
 class VLLMModel(LLM):
     def __init__(
@@ -290,6 +307,7 @@ class VLLMModel(LLM):
         self,
         prompt: str,
         generate_config: Optional[Dict] = None,
+        tools: object = False,
     ) -> Union[Completion, AsyncGenerator[CompletionChunk, None]]:
         try:
             from vllm.sampling_params import SamplingParams
@@ -316,16 +334,46 @@ class VLLMModel(LLM):
 
         async def stream_results() -> AsyncGenerator[CompletionChunk, None]:
             previous_texts = [""] * sanitized_generate_config["n"]
+            tools_token_filter = ChatModelMixin._tools_token_filter(self.model_family)
             async for _request_output in results_generator:
                 chunk = self._convert_request_output_to_completion_chunk(
                     request_id=request_id,
                     model=self.model_uid,
                     request_output=_request_output,
                 )
+
                 for i, choice in enumerate(chunk["choices"]):
                     delta = choice["text"][len(previous_texts[i]) :]
                     previous_texts[i] = choice["text"]
                     choice["text"] = delta
+
+                if tools:
+                    # only handle the first choice
+                    choice = chunk["choices"][0]
+                    if choice["finish_reason"] is not None:
+                        # use previous text for evaluation temporarily
+                        choice_delta = choice["text"]
+                        choice["text"] = previous_texts[0]
+                        _content, func, args = ChatModelMixin._eval_tool_arguments(
+                            self.model_family, chunk, tools
+                        )
+                        choice["text"] = choice_delta
+                        if func is not None:
+                            choice["text"] = None
+                            choice["finish_reason"] = "tool_calls"
+                            choice["tool_calls"] = [
+                                ToolCalls(
+                                    id=str(uuid.uuid4()),
+                                    type="function",
+                                    function=ToolCallFunction(
+                                        name=func,
+                                        arguments=json.dumps(args, ensure_ascii=False),
+                                    ),
+                                )
+                            ]
+                    # use a filter function to skip Qwen's react thought process
+                    elif not tools_token_filter(previous_texts[0]):
+                        continue
                 prompt_tokens = len(_request_output.prompt_token_ids)
                 completion_tokens = sum(
                     len(output.token_ids) for output in _request_output.outputs
@@ -413,7 +461,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         generate_config = self._sanitize_chat_config(generate_config)
         # TODO(codingl2k1): qwen hacky to set stop for function call.
         model_family = self.model_family.model_family or self.model_family.model_name
-        if tools and "qwen-chat" == model_family:
+        if tools and model_family in ["qwen-chat", "qwen1.5-chat"]:
             stop = generate_config.get("stop")
             if isinstance(stop, str):
                 generate_config["stop"] = [stop, "Observation:"]
@@ -426,7 +474,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         stream = generate_config.get("stream", None)
 
         if stream:
-            agen = await self.async_generate(full_prompt, generate_config)
+            agen = await self.async_generate(full_prompt, generate_config, tools)
             assert isinstance(agen, AsyncGenerator)
             return self._async_to_chat_completion_chunks(agen)
         else:
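With the changes above, when a chat request passes tools and the final streamed text parses as a function call, the first choice of the closing chunk is rewritten: its text is cleared, finish_reason becomes "tool_calls", and a tool_calls entry is attached. Once serialized to the client it roughly takes the shape below; the function name and arguments are invented for illustration:

    # Approximate shape of the rewritten final streaming choice; values are examples.
    example_choice = {
        "text": None,
        "finish_reason": "tool_calls",
        "tool_calls": [
            {
                "id": "c0ffee00-0000-4000-8000-000000000000",  # uuid4 placeholder
                "type": "function",
                "function": {
                    "name": "get_current_weather",  # example name
                    "arguments": "{\"location\": \"Paris\"}",  # JSON-encoded args
                },
            }
        ],
    }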
xinference/model/rerank/core.py
CHANGED

@@ -134,8 +134,11 @@ class RerankModel:
         top_n: Optional[int],
         max_chunks_per_doc: Optional[int],
         return_documents: Optional[bool],
+        **kwargs,
     ) -> Rerank:
         assert self._model is not None
+        if kwargs:
+            raise ValueError("rerank hasn't support extra parameter.")
         if max_chunks_per_doc is not None:
             raise ValueError("rerank hasn't support `max_chunks_per_doc` parameter.")
         sentence_combinations = [[query, doc] for doc in documents]