xinference 0.8.1__py3-none-any.whl → 0.8.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of xinference has been flagged as a potentially problematic release.

Files changed (95)
  1. xinference/_version.py +3 -3
  2. xinference/api/oauth2/auth_service.py +132 -0
  3. xinference/api/restful_api.py +282 -78
  4. xinference/client/handlers.py +3 -0
  5. xinference/client/restful/restful_client.py +108 -75
  6. xinference/constants.py +14 -4
  7. xinference/core/cache_tracker.py +102 -0
  8. xinference/core/chat_interface.py +10 -4
  9. xinference/core/event.py +56 -0
  10. xinference/core/model.py +44 -0
  11. xinference/core/resource.py +19 -12
  12. xinference/core/status_guard.py +4 -0
  13. xinference/core/supervisor.py +278 -87
  14. xinference/core/utils.py +68 -3
  15. xinference/core/worker.py +98 -8
  16. xinference/deploy/cmdline.py +6 -3
  17. xinference/deploy/local.py +2 -2
  18. xinference/deploy/supervisor.py +2 -2
  19. xinference/model/audio/__init__.py +27 -0
  20. xinference/model/audio/core.py +161 -0
  21. xinference/model/audio/model_spec.json +79 -0
  22. xinference/model/audio/utils.py +18 -0
  23. xinference/model/audio/whisper.py +132 -0
  24. xinference/model/core.py +18 -13
  25. xinference/model/embedding/__init__.py +27 -2
  26. xinference/model/embedding/core.py +43 -3
  27. xinference/model/embedding/model_spec.json +24 -0
  28. xinference/model/embedding/model_spec_modelscope.json +24 -0
  29. xinference/model/embedding/utils.py +18 -0
  30. xinference/model/image/__init__.py +12 -1
  31. xinference/model/image/core.py +63 -9
  32. xinference/model/image/utils.py +26 -0
  33. xinference/model/llm/__init__.py +20 -1
  34. xinference/model/llm/core.py +43 -2
  35. xinference/model/llm/ggml/chatglm.py +15 -6
  36. xinference/model/llm/llm_family.json +197 -6
  37. xinference/model/llm/llm_family.py +9 -7
  38. xinference/model/llm/llm_family_modelscope.json +189 -4
  39. xinference/model/llm/pytorch/chatglm.py +3 -3
  40. xinference/model/llm/pytorch/core.py +4 -2
  41. xinference/model/{multimodal → llm/pytorch}/qwen_vl.py +10 -8
  42. xinference/model/llm/pytorch/utils.py +21 -9
  43. xinference/model/llm/pytorch/yi_vl.py +246 -0
  44. xinference/model/llm/utils.py +57 -4
  45. xinference/model/llm/vllm/core.py +5 -4
  46. xinference/model/rerank/__init__.py +25 -2
  47. xinference/model/rerank/core.py +51 -9
  48. xinference/model/rerank/model_spec.json +6 -0
  49. xinference/model/rerank/model_spec_modelscope.json +7 -0
  50. xinference/{api/oauth2/common.py → model/rerank/utils.py} +6 -2
  51. xinference/model/utils.py +5 -3
  52. xinference/thirdparty/__init__.py +0 -0
  53. xinference/thirdparty/llava/__init__.py +1 -0
  54. xinference/thirdparty/llava/conversation.py +205 -0
  55. xinference/thirdparty/llava/mm_utils.py +122 -0
  56. xinference/thirdparty/llava/model/__init__.py +1 -0
  57. xinference/thirdparty/llava/model/clip_encoder/__init__.py +0 -0
  58. xinference/thirdparty/llava/model/clip_encoder/builder.py +11 -0
  59. xinference/thirdparty/llava/model/clip_encoder/clip_encoder.py +86 -0
  60. xinference/thirdparty/llava/model/constants.py +6 -0
  61. xinference/thirdparty/llava/model/llava_arch.py +385 -0
  62. xinference/thirdparty/llava/model/llava_llama.py +163 -0
  63. xinference/thirdparty/llava/model/multimodal_projector/__init__.py +0 -0
  64. xinference/thirdparty/llava/model/multimodal_projector/builder.py +64 -0
  65. xinference/types.py +1 -1
  66. xinference/web/ui/build/asset-manifest.json +3 -3
  67. xinference/web/ui/build/index.html +1 -1
  68. xinference/web/ui/build/static/js/main.15822aeb.js +3 -0
  69. xinference/web/ui/build/static/js/main.15822aeb.js.map +1 -0
  70. xinference/web/ui/node_modules/.cache/babel-loader/139e5e4adf436923107d2b02994c7ff6dba2aac1989e9b6638984f0dfe782c4a.json +1 -0
  71. xinference/web/ui/node_modules/.cache/babel-loader/52aa27272b4b9968f62666262b47661cb1992336a2aff3b13994cc36877b3ec3.json +1 -0
  72. xinference/web/ui/node_modules/.cache/babel-loader/64accc515dc6cd584a2873796cd7da6f93de57f7e465eb5423cca9a2f3fe3eff.json +1 -0
  73. xinference/web/ui/node_modules/.cache/babel-loader/65ca3ba225b8c8dac907210545b51f2fcdb2591f0feeb7195f1c037f2bc956a0.json +1 -0
  74. xinference/web/ui/node_modules/.cache/babel-loader/b80db1012318b97c329c4e3e72454f7512fb107e57c444b437dbe4ba1a3faa5a.json +1 -0
  75. {xinference-0.8.1.dist-info → xinference-0.8.3.dist-info}/METADATA +33 -23
  76. {xinference-0.8.1.dist-info → xinference-0.8.3.dist-info}/RECORD +81 -64
  77. xinference/api/oauth2/core.py +0 -93
  78. xinference/model/multimodal/__init__.py +0 -52
  79. xinference/model/multimodal/core.py +0 -467
  80. xinference/model/multimodal/model_spec.json +0 -43
  81. xinference/model/multimodal/model_spec_modelscope.json +0 -45
  82. xinference/web/ui/build/static/js/main.b83095c2.js +0 -3
  83. xinference/web/ui/build/static/js/main.b83095c2.js.map +0 -1
  84. xinference/web/ui/node_modules/.cache/babel-loader/101923c539819f26ad11fbcbd6f6e56436b285efbb090dcc7dd648c6e924c4a8.json +0 -1
  85. xinference/web/ui/node_modules/.cache/babel-loader/4942da6bc03bf7373af068e22f916341aabc5b5df855d73c1d348c696724ce37.json +0 -1
  86. xinference/web/ui/node_modules/.cache/babel-loader/52a6136cb2dbbf9c51d461724d9b283ebe74a73fb19d5df7ba8e13c42bd7174d.json +0 -1
  87. xinference/web/ui/node_modules/.cache/babel-loader/71493aadd34d568fbe605cacaba220aa69bd09273251ee4ba27930f8d01fccd8.json +0 -1
  88. xinference/web/ui/node_modules/.cache/babel-loader/8b071db2a5a9ef68dc14d5f606540bd23d9785e365a11997c510656764d2dccf.json +0 -1
  89. xinference/web/ui/node_modules/.cache/babel-loader/a4d72d3b806ba061919115f0c513738726872e3c79cf258f007519d3f91d1a16.json +0 -1
  90. xinference/web/ui/node_modules/.cache/babel-loader/f037ffef5992af0892d6d991053c1dace364cd39a3f11f1a41f92776e8a59459.json +0 -1
  91. /xinference/web/ui/build/static/js/{main.b83095c2.js.LICENSE.txt → main.15822aeb.js.LICENSE.txt} +0 -0
  92. {xinference-0.8.1.dist-info → xinference-0.8.3.dist-info}/LICENSE +0 -0
  93. {xinference-0.8.1.dist-info → xinference-0.8.3.dist-info}/WHEEL +0 -0
  94. {xinference-0.8.1.dist-info → xinference-0.8.3.dist-info}/entry_points.txt +0 -0
  95. {xinference-0.8.1.dist-info → xinference-0.8.3.dist-info}/top_level.txt +0 -0
xinference/model/{multimodal → llm/pytorch}/qwen_vl.py

@@ -19,19 +19,21 @@ import time
 import uuid
 from typing import Dict, Iterator, List, Optional, Union
 
-from ...types import (
+from ....model.utils import select_device
+from ....types import (
     ChatCompletion,
     ChatCompletionChoice,
     ChatCompletionChunk,
+    ChatCompletionMessage,
     CompletionUsage,
 )
-from ..utils import select_device
-from .core import LVLM, LVLMFamilyV1, LVLMSpecV1
+from ..llm_family import LLMFamilyV1, LLMSpecV1
+from .core import PytorchChatModel, PytorchGenerateConfig
 
 logger = logging.getLogger(__name__)
 
 
-class QwenVLChat(LVLM):
+class QwenVLChatModel(PytorchChatModel):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self._tokenizer = None
@@ -39,7 +41,7 @@ class QwenVLChat(LVLM):
 
     @classmethod
     def match(
-        cls, model_family: "LVLMFamilyV1", model_spec: "LVLMSpecV1", quantization: str
+        cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if "qwen" in model_family.model_name:
             return True
@@ -49,7 +51,7 @@ class QwenVLChat(LVLM):
         from transformers import AutoModelForCausalLM, AutoTokenizer
         from transformers.generation import GenerationConfig
 
-        device = self.kwargs.get("device", "auto")
+        device = self._pytorch_model_config.get("device", "auto")
         device = select_device(device)
 
         self._tokenizer = AutoTokenizer.from_pretrained(
@@ -106,8 +108,8 @@ class QwenVLChat(LVLM):
         self,
         prompt: Union[str, List[Dict]],
         system_prompt: Optional[str] = None,
-        chat_history: Optional[List[Dict]] = None,
-        generate_config: Optional[Dict] = None,
+        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         if generate_config and generate_config.get("stream"):
             raise Exception(
xinference/model/llm/pytorch/utils.py

@@ -29,7 +29,12 @@ from transformers.generation.logits_process import (
     TopPLogitsWarper,
 )
 
-from ....types import CompletionChoice, CompletionChunk, CompletionUsage
+from ....types import (
+    CompletionChoice,
+    CompletionChunk,
+    CompletionUsage,
+    max_tokens_field,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -54,16 +59,21 @@ def get_context_length(config):
         hasattr(config, "max_sequence_length")
         and config.max_sequence_length is not None
     ):
-        return config.max_sequence_length
-    elif hasattr(config, "seq_length") and config.seq_length is not None:
-        return config.seq_length
-    elif (
+        max_sequence_length = config.max_sequence_length
+    else:
+        max_sequence_length = 2048
+    if hasattr(config, "seq_length") and config.seq_length is not None:
+        seq_length = config.seq_length
+    else:
+        seq_length = 2048
+    if (
         hasattr(config, "max_position_embeddings")
         and config.max_position_embeddings is not None
     ):
-        return config.max_position_embeddings
+        max_position_embeddings = config.max_position_embeddings
     else:
-        return 2048
+        max_position_embeddings = 2048
+    return max(max_sequence_length, seq_length, max_position_embeddings)
 
 
 def prepare_logits_processor(
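Net effect of the get_context_length change: instead of returning the first length attribute it finds, the helper now takes the maximum of max_sequence_length, seq_length and max_position_embeddings, each falling back to 2048. A quick behavioral check, sketched under the assumption that the helper is importable from xinference.model.llm.pytorch.utils (transformers-style configs are stubbed with SimpleNamespace):

from types import SimpleNamespace

from xinference.model.llm.pytorch.utils import get_context_length

# ChatGLM-style config: only seq_length is declared.
assert get_context_length(SimpleNamespace(seq_length=32768)) == 32768
# LLaMA-style config: only max_position_embeddings is declared.
assert get_context_length(SimpleNamespace(max_position_embeddings=4096)) == 4096
# Nothing declared: every candidate falls back to 2048.
assert get_context_length(SimpleNamespace()) == 2048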
xinference/model/llm/pytorch/utils.py (continued)

@@ -102,7 +112,7 @@ def generate_stream(
     repetition_penalty = float(generate_config.get("repetition_penalty", 1.0))
     top_p = float(generate_config.get("top_p", 1.0))
     top_k = int(generate_config.get("top_k", -1))  # -1 means disable
-    max_new_tokens = int(generate_config.get("max_tokens", 256))
+    max_new_tokens = int(generate_config.get("max_tokens", max_tokens_field.default))
     echo = bool(generate_config.get("echo", False))
     stop_str = generate_config.get("stop", None)
     stop_token_ids = generate_config.get("stop_token_ids", None) or []
@@ -123,6 +133,8 @@ def generate_stream(
         max_src_len = context_len
     else:
         max_src_len = context_len - max_new_tokens - 8
+        if max_src_len < 0:
+            raise ValueError("Max tokens exceeds model's max length")
 
     input_ids = input_ids[-max_src_len:]
     input_echo_len = len(input_ids)
@@ -346,7 +358,7 @@ def generate_stream_falcon(
     repetition_penalty = float(generate_config.get("repetition_penalty", 1.0))
     top_p = float(generate_config.get("top_p", 1.0))
     top_k = int(generate_config.get("top_k", 50))  # -1 means disable
-    max_new_tokens = int(generate_config.get("max_tokens", 256))
+    max_new_tokens = int(generate_config.get("max_tokens", max_tokens_field.default))
     echo = bool(generate_config.get("echo", False))
     stop_str = generate_config.get("stop", None)
     stop_token_ids = generate_config.get("stop_token_ids", None) or []
xinference/model/llm/pytorch/yi_vl.py (new file)

@@ -0,0 +1,246 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import base64
+import logging
+import time
+import uuid
+from concurrent.futures import ThreadPoolExecutor
+from io import BytesIO
+from threading import Thread
+from typing import Dict, Iterator, List, Optional, Union
+
+import requests
+import torch
+from PIL import Image
+
+from ....model.utils import select_device
+from ....types import (
+    ChatCompletion,
+    ChatCompletionChoice,
+    ChatCompletionChunk,
+    ChatCompletionMessage,
+    CompletionUsage,
+)
+from ..llm_family import LLMFamilyV1, LLMSpecV1
+from .core import PytorchChatModel, PytorchGenerateConfig
+
+logger = logging.getLogger(__name__)
+
+
+class YiVLChatModel(PytorchChatModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._tokenizer = None
+        self._model = None
+        self._image_processor = None
+
+    @classmethod
+    def match(
+        cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
+    ) -> bool:
+        if "yi" in model_family.model_name:
+            return True
+        return False
+
+    def load(self):
+        from ....thirdparty.llava.mm_utils import load_pretrained_model
+        from ....thirdparty.llava.model.constants import key_info
+
+        device = self._pytorch_model_config.get("device", "auto")
+        device = select_device(device)
+
+        key_info["model_path"] = self.model_path
+        (
+            self._tokenizer,
+            self._model,
+            self._image_processor,
+            _,
+        ) = load_pretrained_model(self.model_path, device_map=device)
+
+    @staticmethod
+    def _message_content_to_yi(content) -> Union[str, tuple]:
+        def _load_image(_url):
+            if _url.startswith("data:"):
+                logging.info("Parse url by base64 decoder.")
+                # https://platform.openai.com/docs/guides/vision/uploading-base-64-encoded-images
+                # e.g. f"data:image/jpeg;base64,{base64_image}"
+                _type, data = _url.split(";")
+                _, ext = _type.split("/")
+                data = data[len("base64,") :]
+                data = base64.b64decode(data.encode("utf-8"))
+
+                return Image.open(BytesIO(data))
+            else:
+                try:
+                    response = requests.get(_url)
+                except requests.exceptions.MissingSchema:
+                    return Image.open(_url)
+                else:
+                    return Image.open(BytesIO(response.content))
+
+        if not isinstance(content, str):
+            from ....thirdparty.llava.model.constants import DEFAULT_IMAGE_TOKEN
+
+            texts = []
+            image_urls = []
+            for c in content:
+                c_type = c.get("type")
+                if c_type == "text":
+                    texts.append(c["text"])
+                elif c_type == "image_url":
+                    image_urls.append(c["image_url"]["url"])
+            image_futures = []
+            with ThreadPoolExecutor() as executor:
+                for image_url in image_urls:
+                    fut = executor.submit(_load_image, image_url)
+                    image_futures.append(fut)
+            images = [fut.result() for fut in image_futures]
+            text = " ".join(texts)
+            if DEFAULT_IMAGE_TOKEN not in text:
+                text = DEFAULT_IMAGE_TOKEN + "\n" + text
+            if len(images) == 0:
+                return text
+            elif len(images) == 1:
+                return text, images[0], "Pad"
+            else:
+                raise RuntimeError("Only one image per message is supported by Yi VL.")
+        return content
+
+    @staticmethod
+    def _parse_text(text):
+        lines = text.split("\n")
+        lines = [line for line in lines if line != ""]
+        count = 0
+        for i, line in enumerate(lines):
+            if "```" in line:
+                count += 1
+                items = line.split("`")
+                if count % 2 == 1:
+                    lines[i] = f'<pre><code class="language-{items[-1]}">'
+                else:
+                    lines[i] = f"<br></code></pre>"
+            else:
+                if i > 0:
+                    if count % 2 == 1:
+                        line = line.replace("`", r"\`")
+                        line = line.replace("<", "&lt;")
+                        line = line.replace(">", "&gt;")
+                        line = line.replace(" ", "&nbsp;")
+                        line = line.replace("*", "&ast;")
+                        line = line.replace("_", "&lowbar;")
+                        line = line.replace("-", "&#45;")
+                        line = line.replace(".", "&#46;")
+                        line = line.replace("!", "&#33;")
+                        line = line.replace("(", "&#40;")
+                        line = line.replace(")", "&#41;")
+                        line = line.replace("$", "&#36;")
+                    lines[i] = "<br>" + line
+        text = "".join(lines)
+        return text
+
+    def chat(
+        self,
+        prompt: Union[str, List[Dict]],
+        system_prompt: Optional[str] = None,
+        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        generate_config: Optional[PytorchGenerateConfig] = None,
+    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+        from transformers import TextIteratorStreamer
+
+        # TODO(codingl2k1): implement stream mode.
+        if generate_config and generate_config.get("stream"):
+            raise Exception(
+                f"Chat with model {self.model_family.model_name} does not support stream."
+            )
+        if not generate_config:
+            generate_config = {}
+        from ....thirdparty.llava.conversation import conv_templates
+        from ....thirdparty.llava.mm_utils import (
+            KeywordsStoppingCriteria,
+            tokenizer_image_token,
+        )
+        from ....thirdparty.llava.model.constants import IMAGE_TOKEN_INDEX
+
+        # Convert chat history to llava state
+        state = conv_templates["mm_default"].copy()
+        for message in chat_history or []:
+            content = self._message_content_to_yi(message["content"])
+            state.append_message(message["role"], content)
+        state.append_message(state.roles[0], self._message_content_to_yi(prompt))
+        state.append_message(state.roles[1], None)
+
+        prompt = state.get_prompt()
+
+        input_ids = (
+            tokenizer_image_token(
+                prompt, self._tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
+            )
+            .unsqueeze(0)
+            .cuda()
+        )
+
+        images = state.get_images(return_pil=True)
+        image = images[0]
+
+        image_tensor = self._image_processor.preprocess(image, return_tensors="pt")[
+            "pixel_values"
+        ][0]
+
+        stop_str = state.sep
+        keywords = [stop_str]
+        stopping_criteria = KeywordsStoppingCriteria(
+            keywords, self._tokenizer, input_ids
+        )
+        streamer = TextIteratorStreamer(
+            self._tokenizer, timeout=60, skip_prompt=True, skip_special_tokens=True
+        )
+        top_p = generate_config.get("top_p", 0.7)
+        temperature = generate_config.get("temperature", 0.2)
+        max_new_tokens = generate_config.get("max_tokens", 512)
+        generate_kwargs = {
+            "input_ids": input_ids,
+            "images": image_tensor.unsqueeze(0).to(dtype=torch.bfloat16).cuda(),
+            "streamer": streamer,
+            "do_sample": True,
+            "top_p": float(top_p),
+            "temperature": float(temperature),
+            "stopping_criteria": [stopping_criteria],
+            "use_cache": True,
+            "max_new_tokens": min(int(max_new_tokens), 1536),
+        }
+        t = Thread(target=self._model.generate, kwargs=generate_kwargs)
+        t.start()
+
+        generated_text = ""
+        for new_text in streamer:
+            generated_text += new_text
+        if generated_text.endswith(stop_str):
+            generated_text = generated_text[: -len(stop_str)]
+        r = self._parse_text(generated_text)
+        return ChatCompletion(
+            id="chat" + str(uuid.uuid1()),
+            object="chat.completion",
+            created=int(time.time()),
+            model=self.model_uid,
+            choices=[
+                ChatCompletionChoice(
+                    index=0,
+                    message={"role": "assistant", "content": r},
+                    finish_reason="stop",
+                )
+            ],
+            usage=CompletionUsage(
+                prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
+            ),
+        )
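The new YiVLChatModel accepts OpenAI vision-style content: a prompt (or a history message's "content") can be a plain string or a list of text and image_url parts, where the URL may be an ordinary link or a base64 data URL, and at most one image per message is supported. A minimal sketch of such a prompt, assuming a local example.jpg:

import base64

with open("example.jpg", "rb") as f:
    b64_image = base64.b64encode(f.read()).decode("utf-8")

# This list is what YiVLChatModel.chat() receives as `prompt`
# (or as the "content" field of a chat_history message).
prompt = [
    {"type": "text", "text": "What is shown in this image?"},
    # A plain URL works too; Yi VL allows only one image per message.
    {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{b64_image}"},
    },
]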
xinference/model/llm/utils.py

@@ -14,11 +14,10 @@
 import functools
 import json
 import logging
+import os
 import time
 import uuid
-from typing import AsyncGenerator, Dict, Iterator, List, Optional, cast
-
-from xinference.model.llm.llm_family import PromptStyleV1
+from typing import AsyncGenerator, Dict, Iterator, List, Optional, Tuple, cast
 
 from ...types import (
     SPECIAL_TOOL_PROMPT,
@@ -28,6 +27,14 @@ from ...types import (
     Completion,
     CompletionChunk,
 )
+from .llm_family import (
+    GgmlLLMSpecV1,
+    LLMFamilyV1,
+    LLMSpecV1,
+    PromptStyleV1,
+    _get_cache_dir,
+    get_cache_status,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -303,7 +310,7 @@ Begin!"""
             ret = (
                 "<s>"
                 if prompt_style.system_prompt == ""
-                else "<s>[UNUSED_TOKEN_146]system\n"
+                else "<s><|im_start|>system\n"
                 + prompt_style.system_prompt
                 + prompt_style.intra_message_sep
                 + "\n"
@@ -373,6 +380,20 @@ Begin!"""
                 return f"USER: <<question>> {prompt} <<function>> {tools_string}\nASSISTANT: "
             else:
                 return f"USER: <<question>> {prompt}\nASSISTANT: "
+        elif prompt_style.style_name == "orion":
+            ret = "<s>"
+            for i, message in enumerate(chat_history):
+                content = message["content"]
+                role = message["role"]
+                if i % 2 == 0:  # Human
+                    assert content is not None
+                    ret += role + ": " + content + "\n\n"
+                else:  # Assistant
+                    if content:
+                        ret += role + ": </s>" + content + "</s>"
+                    else:
+                        ret += role + ": </s>"
+            return ret
         else:
             raise ValueError(f"Invalid prompt style: {prompt_style.style_name}")
 
@@ -573,3 +594,35 @@
                 "total_tokens": -1,
             },
         }
+
+
+def get_file_location(
+    llm_family: LLMFamilyV1, spec: LLMSpecV1, quantization: str
+) -> Tuple[str, bool]:
+    cache_dir = _get_cache_dir(llm_family, spec, create_if_not_exist=False)
+    cache_status = get_cache_status(llm_family, spec)
+    if isinstance(cache_status, list):
+        is_cached = None
+        for q, cs in zip(spec.quantizations, cache_status):
+            if q == quantization:
+                is_cached = cs
+                break
+    else:
+        is_cached = cache_status
+    assert isinstance(is_cached, bool)
+
+    if spec.model_format in ["pytorch", "gptq", "awq"]:
+        return cache_dir, is_cached
+    elif spec.model_format in ["ggmlv3", "ggufv2"]:
+        assert isinstance(spec, GgmlLLMSpecV1)
+        filename = spec.model_file_name_template.format(quantization=quantization)
+        model_path = os.path.join(cache_dir, filename)
+        return model_path, is_cached
+    else:
+        raise ValueError(f"Not supported model format {spec.model_format}")
+
+
+def get_model_version(
+    llm_family: LLMFamilyV1, llm_spec: LLMSpecV1, quantization: str
+) -> str:
+    return f"{llm_family.model_name}--{llm_spec.model_size_in_billions}B--{llm_spec.model_format}--{quantization}"
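Two worked illustrations of what this file now produces. First, the new "orion" prompt style for a one-turn history; this sketch mirrors the loop above, and the role names "Human" and "Assistant" are assumptions, not values taken from this diff:

# Single Human turn plus a pending (empty) Assistant turn.
chat_history = [
    {"role": "Human", "content": "What is Orion-14B?"},
    {"role": "Assistant", "content": ""},
]
ret = "<s>"
for i, message in enumerate(chat_history):
    content, role = message["content"], message["role"]
    if i % 2 == 0:  # Human
        ret += role + ": " + content + "\n\n"
    else:  # Assistant; empty content just opens the reply
        ret += role + ": </s>" + (content + "</s>" if content else "")
print(repr(ret))  # '<s>Human: What is Orion-14B?\n\nAssistant: </s>'

Second, the helpers appended at the bottom back the new model-version bookkeeping: get_file_location resolves either the cache directory (pytorch/gptq/awq) or the concrete GGML/GGUF file path, while get_model_version flattens a spec into an identifier of the shape shown below (the spec values here are made up for illustration):

# "<model_name>--<size_in_billions>B--<model_format>--<quantization>"
model_name, size, fmt, quantization = "llama-2-chat", 13, "ggufv2", "Q4_K_M"
print(f"{model_name}--{size}B--{fmt}--{quantization}")
# llama-2-chat--13B--ggufv2--Q4_K_M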
xinference/model/llm/vllm/core.py

@@ -95,6 +95,7 @@ VLLM_SUPPORTED_CHAT_MODELS = [
     "code-llama-instruct",
     "mistral-instruct-v0.1",
     "mistral-instruct-v0.2",
+    "mixtral-instruct-v0.1",
     "chatglm3",
 ]
 
@@ -190,12 +191,12 @@ class VLLMModel(LLM):
             return False
         if not cls._is_linux():
             return False
-        if llm_spec.model_format not in ["pytorch", "gptq"]:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
             return False
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):
                 return False
-        if llm_spec.model_format == "gptq":
+        if llm_spec.model_format in ["gptq", "awq"]:
             # Currently, only 4-bit weight quantization is supported for GPTQ, but got 8 bits.
             if "4" not in quantization:
                 return False
@@ -336,12 +337,12 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
     ) -> bool:
         if XINFERENCE_DISABLE_VLLM:
             return False
-        if llm_spec.model_format not in ["pytorch", "gptq"]:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
             return False
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):
                 return False
-        if llm_spec.model_format == "gptq":
+        if llm_spec.model_format in ["gptq", "awq"]:
             # Currently, only 4-bit weight quantization is supported for GPTQ, but got 8 bits.
             if "4" not in quantization:
                 return False
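With "awq" accepted by both match() checks (4-bit only, same as GPTQ) and mixtral-instruct-v0.1 added to VLLM_SUPPORTED_CHAT_MODELS, AWQ checkpoints can now be routed to the vLLM backend on Linux. A hedged launch sketch via the Python client; the endpoint and the quantization label are illustrative assumptions, not values taken from this diff:

from xinference.client import Client

client = Client("http://127.0.0.1:9997")  # assumed local endpoint
model_uid = client.launch_model(
    model_name="mixtral-instruct-v0.1",
    model_format="awq",   # now passes the vLLM match() checks on Linux
    quantization="Int4",  # AWQ, like GPTQ, must be 4-bit per the check above
)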
xinference/model/rerank/__init__.py

@@ -17,8 +17,20 @@ import json
 import os
 
 from ...constants import XINFERENCE_MODEL_DIR
-from .core import MODEL_NAME_TO_REVISION, RerankModelSpec, get_cache_status
-from .custom import CustomRerankModelSpec, register_rerank
+from .core import (
+    MODEL_NAME_TO_REVISION,
+    RERANK_MODEL_DESCRIPTIONS,
+    RerankModelSpec,
+    generate_rerank_description,
+    get_cache_status,
+    get_rerank_model_descriptions,
+)
+from .custom import (
+    CustomRerankModelSpec,
+    get_user_defined_reranks,
+    register_rerank,
+    unregister_rerank,
+)
 
 _model_spec_json = os.path.join(os.path.dirname(__file__), "model_spec.json")
 _model_spec_modelscope_json = os.path.join(
@@ -30,6 +42,7 @@ BUILTIN_RERANK_MODELS = dict(
 )
 for model_name, model_spec in BUILTIN_RERANK_MODELS.items():
     MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
+
 MODELSCOPE_RERANK_MODELS = dict(
     (spec["model_name"], RerankModelSpec(**spec))
     for spec in json.load(
@@ -39,6 +52,12 @@ MODELSCOPE_RERANK_MODELS = dict(
 for model_name, model_spec in MODELSCOPE_RERANK_MODELS.items():
     MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
 
+# register model description after recording model revision
+for model_spec_info in [BUILTIN_RERANK_MODELS, MODELSCOPE_RERANK_MODELS]:
+    for model_name, model_spec in model_spec_info.items():
+        if model_spec.model_name not in RERANK_MODEL_DESCRIPTIONS:
+            RERANK_MODEL_DESCRIPTIONS.update(generate_rerank_description(model_spec))
+
 # if persist=True, load them when init
 user_defined_rerank_dir = os.path.join(XINFERENCE_MODEL_DIR, "rerank")
 if os.path.isdir(user_defined_rerank_dir):
@@ -49,5 +68,9 @@ if os.path.isdir(user_defined_rerank_dir):
             user_defined_rerank_spec = CustomRerankModelSpec.parse_obj(json.load(fd))
             register_rerank(user_defined_rerank_spec, persist=False)
 
+# register model description
+for ud_rerank in get_user_defined_reranks():
+    RERANK_MODEL_DESCRIPTIONS.update(generate_rerank_description(ud_rerank))
+
 del _model_spec_json
 del _model_spec_modelscope_json
xinference/model/rerank/core.py

@@ -36,6 +36,15 @@ MODEL_NAME_TO_REVISION: Dict[str, List[str]] = defaultdict(list)
 SUPPORTED_SCHEMES = ["s3"]
 
 
+RERANK_MODEL_DESCRIPTIONS: Dict[str, List[Dict]] = defaultdict(list)
+
+
+def get_rerank_model_descriptions():
+    import copy
+
+    return copy.deepcopy(RERANK_MODEL_DESCRIPTIONS)
+
+
 class RerankModelSpec(BaseModel):
     model_name: str
     language: List[str]
@@ -50,8 +59,9 @@ class RerankModelDescription(ModelDescription):
         address: Optional[str],
         devices: Optional[List[str]],
         model_spec: RerankModelSpec,
+        model_path: Optional[str] = None,
     ):
-        super().__init__(address, devices)
+        super().__init__(address, devices, model_path=model_path)
         self._model_spec = model_spec
 
     def to_dict(self):
@@ -64,6 +74,31 @@ class RerankModelDescription(ModelDescription):
             "model_revision": self._model_spec.model_revision,
         }
 
+    def to_version_info(self):
+        from .utils import get_model_version
+
+        if self._model_path is None:
+            is_cached = get_cache_status(self._model_spec)
+            file_location = get_cache_dir(self._model_spec)
+        else:
+            is_cached = True
+            file_location = self._model_path
+
+        return {
+            "model_version": get_model_version(self._model_spec),
+            "model_file_location": file_location,
+            "cache_status": is_cached,
+            "language": self._model_spec.language,
+        }
+
+
+def generate_rerank_description(model_spec: RerankModelSpec) -> Dict[str, List[Dict]]:
+    res = defaultdict(list)
+    res[model_spec.model_name].append(
+        RerankModelDescription(None, None, model_spec).to_version_info()
+    )
+    return res
+
 
 class RerankModel:
     def __init__(
@@ -71,12 +106,14 @@ class RerankModel:
         model_uid: str,
         model_path: str,
         device: Optional[str] = None,
+        use_fp16: bool = False,
         model_config: Optional[Dict] = None,
     ):
         self._model_uid = model_uid
         self._model_path = model_path
         self._device = device
         self._model_config = model_config or dict()
+        self._use_fp16 = use_fp16
         self._model = None
 
     def load(self):
@@ -93,6 +130,8 @@ class RerankModel:
         self._model = CrossEncoder(
             self._model_path, device=self._device, **self._model_config
         )
+        if self._use_fp16:
+            self._model.model.half()
 
     def rerank(
         self,
@@ -131,6 +170,10 @@ class RerankModel:
         return Rerank(id=str(uuid.uuid1()), results=docs)
 
 
+def get_cache_dir(model_spec: RerankModelSpec):
+    return os.path.realpath(os.path.join(XINFERENCE_CACHE_DIR, model_spec.model_name))
+
+
 def get_cache_status(
     model_spec: RerankModelSpec,
 ) -> bool:
@@ -145,9 +188,7 @@ def cache_from_uri(
 
     from ..utils import copy_from_src_to_dst, parse_uri
 
-    cache_dir = os.path.realpath(
-        os.path.join(XINFERENCE_CACHE_DIR, model_spec.model_name)
-    )
+    cache_dir = get_cache_dir(model_spec)
     if os.path.exists(cache_dir):
         logger.info(f"Rerank cache {cache_dir} exists")
         return cache_dir
@@ -227,9 +268,7 @@ def cache(model_spec: RerankModelSpec):
         logger.info(f"Rerank model caching from URI: {model_spec.model_uri}")
         return cache_from_uri(model_spec=model_spec)
 
-    cache_dir = os.path.realpath(
-        os.path.join(XINFERENCE_CACHE_DIR, model_spec.model_name)
-    )
+    cache_dir = get_cache_dir(model_spec)
    if not os.path.exists(cache_dir):
         os.makedirs(cache_dir, exist_ok=True)
     meta_path = os.path.join(cache_dir, "__valid_download")
@@ -312,6 +351,9 @@ def create_rerank_model_instance(
         )
 
     model_path = cache(model_spec)
-    model = RerankModel(model_uid, model_path, **kwargs)
-    model_description = RerankModelDescription(subpool_addr, devices, model_spec)
+    use_fp16 = kwargs.pop("use_fp16", False)
+    model = RerankModel(model_uid, model_path, use_fp16=use_fp16, model_config=kwargs)
+    model_description = RerankModelDescription(
+        subpool_addr, devices, model_spec, model_path=model_path
+    )
     return model, model_description
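The rerank path now threads a use_fp16 flag from the launch kwargs into RerankModel (halving the CrossEncoder weights after load) and passes model_path into the description so the new to_version_info() can report a concrete file location. A minimal direct-use sketch, assuming an already downloaded reranker checkpoint directory:

from xinference.model.rerank.core import RerankModel

model = RerankModel(
    model_uid="my-reranker",
    model_path="/path/to/bge-reranker-base",  # assumed local cache directory
    use_fp16=True,  # new flag: calls self._model.model.half() after loading
)
model.load()  # builds the sentence-transformers CrossEncoder, then halves it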
xinference/model/rerank/model_spec.json

@@ -10,5 +10,11 @@
     "language": ["en", "zh"],
     "model_id": "BAAI/bge-reranker-base",
     "model_revision": "465b4b7ddf2be0a020c8ad6e525b9bb1dbb708ae"
+  },
+  {
+    "model_name": "bce-reranker-base_v1",
+    "language": ["en", "zh"],
+    "model_id": "maidalun1020/bce-reranker-base_v1",
+    "model_revision": "eaa31a577a0574e87a08959bd229ca14ce1b5496"
   }
 ]