xinference 1.7.1__py3-none-any.whl → 1.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/client/restful/async_restful_client.py +8 -13
- xinference/client/restful/restful_client.py +6 -2
- xinference/core/chat_interface.py +6 -4
- xinference/core/media_interface.py +5 -0
- xinference/core/model.py +1 -5
- xinference/core/supervisor.py +117 -68
- xinference/core/worker.py +49 -37
- xinference/deploy/test/test_cmdline.py +2 -6
- xinference/model/audio/__init__.py +26 -23
- xinference/model/audio/chattts.py +3 -2
- xinference/model/audio/core.py +49 -98
- xinference/model/audio/cosyvoice.py +3 -2
- xinference/model/audio/custom.py +28 -73
- xinference/model/audio/f5tts.py +3 -2
- xinference/model/audio/f5tts_mlx.py +3 -2
- xinference/model/audio/fish_speech.py +3 -2
- xinference/model/audio/funasr.py +17 -4
- xinference/model/audio/kokoro.py +3 -2
- xinference/model/audio/megatts.py +3 -2
- xinference/model/audio/melotts.py +3 -2
- xinference/model/audio/model_spec.json +572 -171
- xinference/model/audio/utils.py +0 -6
- xinference/model/audio/whisper.py +3 -2
- xinference/model/audio/whisper_mlx.py +3 -2
- xinference/model/cache_manager.py +141 -0
- xinference/model/core.py +6 -49
- xinference/model/custom.py +174 -0
- xinference/model/embedding/__init__.py +67 -56
- xinference/model/embedding/cache_manager.py +35 -0
- xinference/model/embedding/core.py +104 -84
- xinference/model/embedding/custom.py +55 -78
- xinference/model/embedding/embed_family.py +80 -31
- xinference/model/embedding/flag/core.py +21 -5
- xinference/model/embedding/llama_cpp/__init__.py +0 -0
- xinference/model/embedding/llama_cpp/core.py +234 -0
- xinference/model/embedding/model_spec.json +968 -103
- xinference/model/embedding/sentence_transformers/core.py +30 -20
- xinference/model/embedding/vllm/core.py +11 -5
- xinference/model/flexible/__init__.py +8 -2
- xinference/model/flexible/core.py +26 -119
- xinference/model/flexible/custom.py +69 -0
- xinference/model/flexible/launchers/image_process_launcher.py +1 -0
- xinference/model/flexible/launchers/modelscope_launcher.py +5 -1
- xinference/model/flexible/launchers/transformers_launcher.py +15 -3
- xinference/model/flexible/launchers/yolo_launcher.py +5 -1
- xinference/model/image/__init__.py +20 -20
- xinference/model/image/cache_manager.py +62 -0
- xinference/model/image/core.py +70 -182
- xinference/model/image/custom.py +28 -72
- xinference/model/image/model_spec.json +402 -119
- xinference/model/image/ocr/got_ocr2.py +3 -2
- xinference/model/image/stable_diffusion/core.py +22 -7
- xinference/model/image/stable_diffusion/mlx.py +6 -6
- xinference/model/image/utils.py +2 -2
- xinference/model/llm/__init__.py +71 -94
- xinference/model/llm/cache_manager.py +292 -0
- xinference/model/llm/core.py +37 -111
- xinference/model/llm/custom.py +88 -0
- xinference/model/llm/llama_cpp/core.py +5 -7
- xinference/model/llm/llm_family.json +16260 -8151
- xinference/model/llm/llm_family.py +138 -839
- xinference/model/llm/lmdeploy/core.py +5 -7
- xinference/model/llm/memory.py +3 -4
- xinference/model/llm/mlx/core.py +6 -8
- xinference/model/llm/reasoning_parser.py +3 -1
- xinference/model/llm/sglang/core.py +32 -14
- xinference/model/llm/transformers/chatglm.py +3 -7
- xinference/model/llm/transformers/core.py +49 -27
- xinference/model/llm/transformers/deepseek_v2.py +2 -2
- xinference/model/llm/transformers/gemma3.py +2 -2
- xinference/model/llm/transformers/multimodal/cogagent.py +2 -2
- xinference/model/llm/transformers/multimodal/deepseek_vl2.py +2 -2
- xinference/model/llm/transformers/multimodal/gemma3.py +2 -2
- xinference/model/llm/transformers/multimodal/glm4_1v.py +167 -0
- xinference/model/llm/transformers/multimodal/glm4v.py +2 -2
- xinference/model/llm/transformers/multimodal/intern_vl.py +2 -2
- xinference/model/llm/transformers/multimodal/minicpmv26.py +3 -3
- xinference/model/llm/transformers/multimodal/ovis2.py +2 -2
- xinference/model/llm/transformers/multimodal/qwen-omni.py +2 -2
- xinference/model/llm/transformers/multimodal/qwen2_audio.py +2 -2
- xinference/model/llm/transformers/multimodal/qwen2_vl.py +2 -2
- xinference/model/llm/transformers/opt.py +3 -7
- xinference/model/llm/utils.py +34 -49
- xinference/model/llm/vllm/core.py +77 -27
- xinference/model/llm/vllm/xavier/engine.py +5 -3
- xinference/model/llm/vllm/xavier/scheduler.py +10 -6
- xinference/model/llm/vllm/xavier/transfer.py +1 -1
- xinference/model/rerank/__init__.py +26 -25
- xinference/model/rerank/core.py +47 -87
- xinference/model/rerank/custom.py +25 -71
- xinference/model/rerank/model_spec.json +158 -33
- xinference/model/rerank/utils.py +2 -2
- xinference/model/utils.py +115 -54
- xinference/model/video/__init__.py +13 -17
- xinference/model/video/core.py +44 -102
- xinference/model/video/diffusers.py +4 -3
- xinference/model/video/model_spec.json +90 -21
- xinference/types.py +5 -3
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.7d24df53.js +3 -0
- xinference/web/ui/build/static/js/main.7d24df53.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2704ff66a5f73ca78b341eb3edec60154369df9d87fbc8c6dd60121abc5e1b0a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/607dfef23d33e6b594518c0c6434567639f24f356b877c80c60575184ec50ed0.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9be3d56173aacc3efd0b497bcb13c4f6365de30069176ee9403b40e717542326.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9f9dd6c32c78a222d07da5987ae902effe16bcf20aac00774acdccc4de3c9ff2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b2ab5ee972c60d15eb9abf5845705f8ab7e1d125d324d9a9b1bcae5d6fd7ffb2.json +1 -0
- xinference/web/ui/src/locales/en.json +0 -1
- xinference/web/ui/src/locales/ja.json +0 -1
- xinference/web/ui/src/locales/ko.json +0 -1
- xinference/web/ui/src/locales/zh.json +0 -1
- {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/METADATA +9 -11
- {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/RECORD +119 -119
- xinference/model/audio/model_spec_modelscope.json +0 -231
- xinference/model/embedding/model_spec_modelscope.json +0 -293
- xinference/model/embedding/utils.py +0 -18
- xinference/model/image/model_spec_modelscope.json +0 -375
- xinference/model/llm/llama_cpp/memory.py +0 -457
- xinference/model/llm/llm_family_csghub.json +0 -56
- xinference/model/llm/llm_family_modelscope.json +0 -8700
- xinference/model/llm/llm_family_openmind_hub.json +0 -1019
- xinference/model/rerank/model_spec_modelscope.json +0 -85
- xinference/model/video/model_spec_modelscope.json +0 -184
- xinference/web/ui/build/static/js/main.9b12b7f9.js +0 -3
- xinference/web/ui/build/static/js/main.9b12b7f9.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/1460361af6975e63576708039f1cb732faf9c672d97c494d4055fc6331460be0.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/4efd8dda58fda83ed9546bf2f587df67f8d98e639117bee2d9326a9a1d9bebb2.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/55b9fb40b57fa926e8f05f31c2f96467e76e5ad62f033dca97c03f9e8c4eb4fe.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5b2dafe5aa9e1105e0244a2b6751807342fa86aa0144b4e84d947a1686102715.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/611fa2c6c53b66039991d06dfb0473b5ab37fc63b4564e0f6e1718523768a045.json +0 -1
- /xinference/web/ui/build/static/js/{main.9b12b7f9.js.LICENSE.txt → main.7d24df53.js.LICENSE.txt} +0 -0
- {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/WHEEL +0 -0
- {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.7.1.dist-info → xinference-1.8.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/vllm/core.py
CHANGED

@@ -50,9 +50,9 @@ from ....types import (
     CompletionUsage,
     LoRA,
 )
-from .. import LLM,
+from .. import BUILTIN_LLM_FAMILIES, LLM, LLMFamilyV2, LLMSpecV1
 from ..core import chat_context_var
-from ..llm_family import
+from ..llm_family import CustomLLMFamilyV2, cache_model_tokenizer_and_config
 from ..utils import (
     DEEPSEEK_TOOL_CALL_FAMILY,
     QWEN_TOOL_CALL_FAMILY,

@@ -117,6 +117,11 @@ class VLLMGenerateConfig(TypedDict, total=False):
 try:
     import vllm  # noqa: F401

+    if not getattr(vllm, "__version__", None):
+        raise ImportError(
+            "vllm not installed properly, or wrongly be found in sys.path"
+        )
+
     VLLM_INSTALLED = True
 except ImportError:
     VLLM_INSTALLED = False
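Note: the guard added above covers the case where `import vllm` succeeds but resolves to something that is not a real installation (for example, a stray `vllm/` directory on `sys.path` picked up as a namespace package); without it, the later `vllm.__version__ >= ...` checks would raise AttributeError. A minimal standalone sketch of the same idea (the `has_usable_vllm` helper is illustrative, not part of xinference):

def has_usable_vllm() -> bool:
    # Importable is not enough: a genuine install must also expose a
    # non-empty __version__, mirroring the check added in this release.
    try:
        import vllm  # noqa: F401

        if not getattr(vllm, "__version__", None):
            raise ImportError(
                "vllm not installed properly, or wrongly be found in sys.path"
            )
        return True
    except ImportError:
        return False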
@@ -257,14 +262,16 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.8.5":
 if VLLM_INSTALLED and vllm.__version__ >= "0.9.1":
     VLLM_SUPPORTED_CHAT_MODELS.append("minicpm4")

+if VLLM_INSTALLED and vllm.__version__ >= "0.9.2":
+    VLLM_SUPPORTED_CHAT_MODELS.append("Ernie4.5")
+    VLLM_SUPPORTED_VISION_MODEL_LIST.append("glm-4.1v-thinking")
+

 class VLLMModel(LLM):
     def __init__(
         self,
         model_uid: str,
-        model_family: "
-        model_spec: "LLMSpecV1",
-        quantization: str,
+        model_family: "LLMFamilyV2",
         model_path: str,
         model_config: Optional[VLLMModelConfig],
         peft_model: Optional[List[LoRA]] = None,

@@ -279,7 +286,7 @@ class VLLMModel(LLM):
             ]

             raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
-        super().__init__(model_uid, model_family,
+        super().__init__(model_uid, model_family, model_path)
         self._model_config = model_config
         self._engine = None
         self.lora_modules = peft_model

@@ -349,7 +356,7 @@ class VLLMModel(LLM):

             raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")

-        from ..llm_family import
+        from ..llm_family import LlamaCppLLMSpecV2

         if "0.3.1" <= vllm.__version__ <= "0.3.3":
             # from vllm v0.3.1 to v0.3.3, it uses cupy as NCCL backend

@@ -368,7 +375,7 @@ class VLLMModel(LLM):
         )

         if (
-            isinstance(self.model_spec,
+            isinstance(self.model_spec, LlamaCppLLMSpecV2)
             and self.model_spec.model_format == "ggufv2"
         ):
             # gguf

@@ -592,20 +599,25 @@ class VLLMModel(LLM):

             if "tokenizer" not in self._model_config:
                 # find pytorch format without quantization
+                family = next(
+                    family
+                    for family in BUILTIN_LLM_FAMILIES
+                    if family.model_name == self.model_family.model_name
+                ).copy()
                 non_quant_spec = next(
                     spec
-                    for spec in
-                    if spec.
-                    and "none" in spec.quantizations
+                    for spec in family.model_specs
+                    if spec.quantization == "none"
                     and spec.model_size_in_billions
                     == self.model_spec.model_size_in_billions
+                    and spec.model_hub == self.model_spec.model_hub
                 )
-
-                path = cache_model_tokenizer_and_config(
+                family.model_specs = [non_quant_spec]
+                path = cache_model_tokenizer_and_config(family)
                 # other than gguf file, vllm requires to provide tokenizer and hf_config_path
-                self._model_config["tokenizer"] = self._model_config[
-
-
+                self._model_config["tokenizer"] = self._model_config["hf_config_path"] = (
+                    path
+                )

             if not os.path.isfile(self.model_path):
                 self.model_path = os.path.realpath(

@@ -791,7 +803,7 @@ class VLLMModel(LLM):

     @classmethod
     def match_json(
-        cls, llm_family: "
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if not cls._has_cuda_device():
             return False

@@ -813,7 +825,7 @@ class VLLMModel(LLM):
         else:
             if "4" not in quantization:
                 return False
-        if isinstance(llm_family,
+        if isinstance(llm_family, CustomLLMFamilyV2):
             if llm_family.model_family not in VLLM_SUPPORTED_MODELS:
                 return False
         else:

@@ -1090,7 +1102,7 @@ class VLLMModel(LLM):
 class VLLMChatModel(VLLMModel, ChatModelMixin):
     @classmethod
     def match_json(
-        cls, llm_family: "
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8", "ggufv2"]:
             return False

@@ -1111,7 +1123,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         if llm_spec.model_format == "ggufv2":
             if not (VLLM_INSTALLED and vllm.__version__ >= "0.8.2"):
                 return False
-        if isinstance(llm_family,
+        if isinstance(llm_family, CustomLLMFamilyV2):
             if llm_family.model_family not in VLLM_SUPPORTED_CHAT_MODELS:
                 return False
         else:

@@ -1137,9 +1149,9 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
             not generate_config.get("stop_token_ids")
             and self.model_family.stop_token_ids
         ):
-            generate_config[
-
-
+            generate_config["stop_token_ids"] = (
+                self.model_family.stop_token_ids.copy()
+            )
         return generate_config

     @staticmethod

@@ -1150,17 +1162,50 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         def is_tool_call_chunk_end(chunk):
             return chunk["choices"][0]["text"].endswith(QWEN_TOOL_CALL_SYMBOLS[1])

+    @staticmethod
+    def prefill_messages(messages: List[Dict]) -> List[Dict]:
+        """
+        Preprocess messages to ensure content is not None
+
+        Args:
+            messages: Original message list
+
+        Returns:
+            Processed message list, where content is not None
+        """
+        processed_messages = []
+
+        for msg in messages:
+            if isinstance(msg, dict):
+                if msg.get("content") is None:
+                    msg_copy = msg.copy()
+                    msg_copy["content"] = ""  # Replace None with empty string
+                    processed_messages.append(msg_copy)
+                else:
+                    processed_messages.append(msg)
+            else:
+                processed_messages.append(msg)
+
+        return processed_messages
+
     async def _async_to_tool_completion_chunks(
         self,
         chunks: AsyncGenerator[CompletionChunk, None],
+        ctx: Optional[Dict[str, Any]] = {},
     ) -> AsyncGenerator[ChatCompletionChunk, None]:
+        def set_context():
+            if ctx:
+                chat_context_var.set(ctx)
+
         i = 0
         previous_texts = [""]
         tool_call = False
         tool_call_texts = [""]
         if self.reasoning_parser:
+            set_context()
             chunks = self.reasoning_parser.prepare_reasoning_content_streaming(chunks)
         async for chunk in chunks:
+            set_context()
             if i == 0:
                 for first_chunk in self._get_first_chat_completion_chunk(
                     chunk, self.reasoning_parser
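The new `prefill_messages` helper normalizes OpenAI-style message lists before chat templating, so assistant turns whose `content` is `None` (typical for pure tool-call turns) no longer trip up template rendering. A rough usage sketch with made-up message data (the import path is the module this diff modifies):

from xinference.model.llm.vllm.core import VLLMChatModel

messages = [
    {"role": "user", "content": "What's the weather in Paris?"},
    {
        "role": "assistant",
        "content": None,  # pure tool-call turns often carry no text
        "tool_calls": [
            {
                "id": "call_1",
                "type": "function",
                "function": {"name": "get_weather", "arguments": '{"city": "Paris"}'},
            }
        ],
    },
    {"role": "tool", "content": "18C and sunny"},
]

normalized = VLLMChatModel.prefill_messages(messages)
assert normalized[1]["content"] == ""  # None replaced with an empty string
assert messages[1]["content"] is None  # the original list is not mutated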
@@ -1200,6 +1245,9 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         generate_config: Optional[Dict] = None,
         request_id: Optional[str] = None,
     ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]:
+        # Preprocess messages to ensure content is not None
+        messages = self.prefill_messages(messages)
+
         tools = generate_config.pop("tools", []) if generate_config else None
         model_family = self.model_family.model_family or self.model_family.model_name
         chat_template_kwargs = (

@@ -1230,8 +1278,10 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
             )
             assert isinstance(agen, AsyncGenerator)
             if tools:
-                return self._async_to_tool_completion_chunks(agen)
-            return self._async_to_chat_completion_chunks(
+                return self._async_to_tool_completion_chunks(agen, chat_template_kwargs)
+            return self._async_to_chat_completion_chunks(
+                agen, self.reasoning_parser, chat_template_kwargs
+            )
         else:
             c = await self.async_generate(
                 full_prompt, generate_config, request_id=request_id

@@ -1247,7 +1297,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
 class VLLMVisionModel(VLLMModel, ChatModelMixin):
     @classmethod
     def match_json(
-        cls, llm_family: "
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if not cls._has_cuda_device():
             return False

@@ -1269,7 +1319,7 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
         else:
             if "4" not in quantization:
                 return False
-        if isinstance(llm_family,
+        if isinstance(llm_family, CustomLLMFamilyV2):
             if llm_family.model_family not in VLLM_SUPPORTED_VISION_MODEL_LIST:
                 return False
         else:
xinference/model/llm/vllm/xavier/engine.py
CHANGED

@@ -39,9 +39,11 @@ class XavierInternalEngine(_AsyncLLMEngine):
             self.cache_config,
             self.lora_config,
             self.parallel_config.pipeline_parallel_size,
-
-
-
+            (
+                self.async_callbacks[v_id]
+                if self.model_config.use_async_output_proc
+                else None
+            ),
             xavier_config=self._xavier_config,
             virtual_engine=v_id,
         )
xinference/model/llm/vllm/xavier/scheduler.py
CHANGED

@@ -352,12 +352,16 @@ class XavierScheduler(Scheduler):
                 # between engine and worker.
                 # the subsequent comms can still use delta, but
                 # `multi_modal_data` will be None.
-                multi_modal_data=
-
-
-
-
+                multi_modal_data=(
+                    seq_group.multi_modal_data
+                    if scheduler_outputs.num_prefill_groups > 0
+                    else None
+                ),
+                multi_modal_placeholders=(
+                    seq_group.multi_modal_placeholders
+                    if scheduler_outputs.num_prefill_groups > 0
+                    else None
+                ),
                 mm_processor_kwargs=seq_group.mm_processor_kwargs,
                 prompt_adapter_request=seq_group.prompt_adapter_request,
             )
xinference/model/llm/vllm/xavier/transfer.py
CHANGED

@@ -30,7 +30,7 @@ logger = logging.getLogger(__name__)

 class BufferTransferMixin:
     def __init__(self):
-        self.num_buffer: int = 0
+        self.num_buffer: int = 0  # type: ignore
         self.buffers: List[torch.Tensor] = []  # type: ignore
         self.buffer_queue: Optional[Queue] = None  # type: ignore
         self.transfer_block_num = 0
xinference/model/rerank/__init__.py
CHANGED

@@ -16,38 +16,41 @@ import codecs
 import json
 import os
 import warnings
-from typing import
+from typing import Dict, List

 from ...constants import XINFERENCE_MODEL_DIR
+from ..utils import flatten_model_src
 from .core import (
-    MODEL_NAME_TO_REVISION,
     RERANK_MODEL_DESCRIPTIONS,
-
+    RerankModelFamilyV2,
     generate_rerank_description,
-    get_cache_status,
     get_rerank_model_descriptions,
 )
 from .custom import (
-
+    CustomRerankModelFamilyV2,
     get_user_defined_reranks,
     register_rerank,
     unregister_rerank,
 )

-BUILTIN_RERANK_MODELS: Dict[str,
-MODELSCOPE_RERANK_MODELS: Dict[str, Any] = {}
+BUILTIN_RERANK_MODELS: Dict[str, List["RerankModelFamilyV2"]] = {}


 def register_custom_model():
+    from ..custom import migrate_from_v1_to_v2
+
+    # migrate from v1 to v2 first
+    migrate_from_v1_to_v2("rerank", CustomRerankModelFamilyV2)
+
     # if persist=True, load them when init
-    user_defined_rerank_dir = os.path.join(XINFERENCE_MODEL_DIR, "rerank")
+    user_defined_rerank_dir = os.path.join(XINFERENCE_MODEL_DIR, "v2", "rerank")
     if os.path.isdir(user_defined_rerank_dir):
         for f in os.listdir(user_defined_rerank_dir):
             try:
                 with codecs.open(
                     os.path.join(user_defined_rerank_dir, f), encoding="utf-8"
                 ) as fd:
-                    user_defined_rerank_spec =
+                    user_defined_rerank_spec = CustomRerankModelFamilyV2.parse_obj(
                         json.load(fd)
                     )
                     register_rerank(user_defined_rerank_spec, persist=False)

@@ -57,15 +60,11 @@ def register_custom_model():

 def _install():
     load_model_family_from_json("model_spec.json", BUILTIN_RERANK_MODELS)
-    load_model_family_from_json("model_spec_modelscope.json", MODELSCOPE_RERANK_MODELS)

-
-
-
-
-    RERANK_MODEL_DESCRIPTIONS.update(
-        generate_rerank_description(model_spec)
-    )
+    for model_name, model_specs in BUILTIN_RERANK_MODELS.items():
+        model_spec = [x for x in model_specs if x.model_hub == "huggingface"][0]
+        if model_spec.model_name not in RERANK_MODEL_DESCRIPTIONS:
+            RERANK_MODEL_DESCRIPTIONS.update(generate_rerank_description(model_spec))

     register_custom_model()


@@ -76,12 +75,14 @@ def _install():

 def load_model_family_from_json(json_filename, target_families):
     _model_spec_json = os.path.join(os.path.dirname(__file__), json_filename)
-
-
-
-
-
-
-
-
+    flattened_model_specs = []
+    for spec in json.load(codecs.open(_model_spec_json, "r", encoding="utf-8")):
+        flattened_model_specs.extend(flatten_model_src(spec))
+
+    for spec in flattened_model_specs:
+        if spec["model_name"] not in target_families:
+            target_families[spec["model_name"]] = [RerankModelFamilyV2(**spec)]
+        else:
+            target_families[spec["model_name"]].append(RerankModelFamilyV2(**spec))
+
     del _model_spec_json
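With the separate model_spec_modelscope.json removed, a single model_spec.json now lists per-hub sources that `flatten_model_src` expands, and `BUILTIN_RERANK_MODELS` maps each model name to a list of per-hub `RerankModelFamilyV2` specs. A self-contained sketch of the grouping and hub preference this enables (the dict entries below are placeholders, not real spec records):

from collections import defaultdict
from typing import Dict, List

# Placeholder stand-ins for flattened spec entries; the real ones come from
# model_spec.json after flatten_model_src() emits one record per model hub.
flattened_specs: List[Dict] = [
    {"model_name": "demo-reranker", "model_hub": "huggingface"},
    {"model_name": "demo-reranker", "model_hub": "modelscope"},
]

target_families: Dict[str, List[Dict]] = defaultdict(list)
for spec in flattened_specs:
    target_families[spec["model_name"]].append(spec)

# Hub preference as in create_rerank_model_instance(): ModelScope first when
# configured, otherwise fall back to the Hugging Face entry.
candidates = target_families["demo-reranker"]
prefer_modelscope = True
if prefer_modelscope:
    chosen = (
        [s for s in candidates if s["model_hub"] == "modelscope"]
        + [s for s in candidates if s["model_hub"] == "huggingface"]
    )[0]
else:
    chosen = [s for s in candidates if s["model_hub"] == "huggingface"][0]
print(chosen["model_hub"])  # -> modelscope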
xinference/model/rerank/core.py
CHANGED
@@ -21,24 +21,22 @@ import threading
 import uuid
 from collections import defaultdict
 from collections.abc import Sequence
-from typing import Dict, List, Literal, Optional
+from typing import Dict, List, Literal, Optional

 import numpy as np
 import torch
 import torch.nn as nn

-from ...
-from ...device_utils import empty_cache
+from ...device_utils import empty_cache, is_device_available
 from ...types import Document, DocumentObj, Rerank, RerankTokens
-from ..core import CacheableModelSpec,
-from ..utils import
+from ..core import CacheableModelSpec, VirtualEnvSettings
+from ..utils import ModelInstanceInfoMixin
 from .utils import preprocess_sentence

 logger = logging.getLogger(__name__)

 # Used for check whether the model is cached.
 # Init when registering all the builtin models.
-MODEL_NAME_TO_REVISION: Dict[str, List[str]] = defaultdict(list)
 RERANK_MODEL_DESCRIPTIONS: Dict[str, List[Dict]] = defaultdict(list)
 RERANK_EMPTY_CACHE_COUNT = int(os.getenv("XINFERENCE_RERANK_EMPTY_CACHE_COUNT", "10"))
 assert RERANK_EMPTY_CACHE_COUNT > 0

@@ -50,7 +48,8 @@ def get_rerank_model_descriptions():
     return copy.deepcopy(RERANK_MODEL_DESCRIPTIONS)


-class
+class RerankModelFamilyV2(CacheableModelSpec, ModelInstanceInfoMixin):
+    version: Literal[2]
     model_name: str
     language: List[str]
     type: Optional[str] = "unknown"

@@ -60,56 +59,37 @@ class RerankModelSpec(CacheableModelSpec):
     model_hub: str = "huggingface"
     virtualenv: Optional[VirtualEnvSettings]

+    class Config:
+        extra = "allow"

-
-    def __init__(
-        self,
-        address: Optional[str],
-        devices: Optional[List[str]],
-        model_spec: RerankModelSpec,
-        model_path: Optional[str] = None,
-    ):
-        super().__init__(address, devices, model_path=model_path)
-        self._model_spec = model_spec
-
-    @property
-    def spec(self):
-        return self._model_spec
-
-    def to_dict(self):
+    def to_description(self):
         return {
             "model_type": "rerank",
-            "address": self
-            "accelerators": self
-            "type": self.
-            "model_name": self.
-            "language": self.
-            "model_revision": self.
+            "address": getattr(self, "address", None),
+            "accelerators": getattr(self, "accelerators", None),
+            "type": self.type,
+            "model_name": self.model_name,
+            "language": self.language,
+            "model_revision": self.model_revision,
         }

     def to_version_info(self):
-        from
-
-        if self._model_path is None:
-            is_cached = get_cache_status(self._model_spec)
-            file_location = get_cache_dir(self._model_spec)
-        else:
-            is_cached = True
-            file_location = self._model_path
+        from ..cache_manager import CacheManager

+        cache_manager = CacheManager(self)
         return {
-            "model_version":
-            "model_file_location":
-            "cache_status":
-            "language": self.
+            "model_version": self.model_name,
+            "model_file_location": cache_manager.get_cache_dir(),
+            "cache_status": cache_manager.get_cache_status(),
+            "language": self.language,
         }


-def generate_rerank_description(
+def generate_rerank_description(
+    model_spec: RerankModelFamilyV2,
+) -> Dict[str, List[Dict]]:
     res = defaultdict(list)
-    res[model_spec.model_name].append(
-        RerankModelDescription(None, None, model_spec).to_version_info()
-    )
+    res[model_spec.model_name].append(model_spec.to_version_info())
     return res

@@ -145,13 +125,14 @@ class _ModelWrapper(nn.Module):
 class RerankModel:
     def __init__(
         self,
-        model_spec:
+        model_spec: RerankModelFamilyV2,
         model_uid: str,
         model_path: Optional[str] = None,
         device: Optional[str] = None,
         use_fp16: bool = False,
         model_config: Optional[Dict] = None,
     ):
+        self.model_family = model_spec
         self._model_spec = model_spec
         self._model_uid = model_uid
         self._model_path = model_path

@@ -252,7 +233,9 @@ class RerankModel:
         tokenizer = AutoTokenizer.from_pretrained(
             self._model_path, padding_side="left"
         )
-        enable_flash_attn = self._model_config.
+        enable_flash_attn = self._model_config.pop(
+            "enable_flash_attn", is_device_available("cuda")
+        )
         model_kwargs = {"device_map": "auto"}
         if flash_attn_installed and enable_flash_attn:
             model_kwargs["attn_implementation"] = "flash_attention_2"

@@ -448,25 +431,7 @@ class RerankModel:
         return Rerank(id=str(uuid.uuid1()), results=docs, meta=metadata)


-def get_cache_dir(model_spec: RerankModelSpec):
-    return os.path.realpath(os.path.join(XINFERENCE_CACHE_DIR, model_spec.model_name))
-
-
-def get_cache_status(
-    model_spec: RerankModelSpec,
-) -> bool:
-    return is_model_cached(model_spec, MODEL_NAME_TO_REVISION)
-
-
-def cache(model_spec: RerankModelSpec):
-    from ..utils import cache
-
-    return cache(model_spec, RerankModelDescription)
-
-
 def create_rerank_model_instance(
-    subpool_addr: str,
-    devices: List[str],
     model_uid: str,
     model_name: str,
     download_hub: Optional[

@@ -474,9 +439,10 @@ def create_rerank_model_instance(
     ] = None,
     model_path: Optional[str] = None,
     **kwargs,
-) ->
+) -> RerankModel:
+    from ..cache_manager import CacheManager
     from ..utils import download_from_modelscope
-    from . import BUILTIN_RERANK_MODELS
+    from . import BUILTIN_RERANK_MODELS
     from .custom import get_user_defined_reranks

     model_spec = None

@@ -486,31 +452,25 @@ def create_rerank_model_instance(
             break

     if model_spec is None:
-        if
-
-
-
-
-
-
-
-
-        elif model_name in BUILTIN_RERANK_MODELS:
-            logger.debug(f"Rerank model {model_name} found in Huggingface.")
-            model_spec = BUILTIN_RERANK_MODELS[model_name]
+        if model_name in BUILTIN_RERANK_MODELS:
+            model_specs = BUILTIN_RERANK_MODELS[model_name]
+            if download_hub == "modelscope" or download_from_modelscope():
+                model_spec = (
+                    [x for x in model_specs if x.model_hub == "modelscope"]
+                    + [x for x in model_specs if x.model_hub == "huggingface"]
+                )[0]
+            else:
+                model_spec = [x for x in model_specs if x.model_hub == "huggingface"][0]
         else:
             raise ValueError(
-                f"Rerank model {model_name} not found, available"
-                f"
-                f"ModelScope: {MODELSCOPE_RERANK_MODELS.keys()}"
+                f"Rerank model {model_name} not found, available "
+                f"model: {BUILTIN_RERANK_MODELS.keys()}"
             )
     if not model_path:
-
+        cache_manager = CacheManager(model_spec)
+        model_path = cache_manager.cache()
     use_fp16 = kwargs.pop("use_fp16", False)
    model = RerankModel(
         model_spec, model_uid, model_path, use_fp16=use_fp16, model_config=kwargs
     )
-
-        subpool_addr, devices, model_spec, model_path=model_path
-    )
-    return model, model_description
+    return model