xinference 0.15.2__py3-none-any.whl → 0.15.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xinference/_version.py +3 -3
- xinference/model/embedding/core.py +14 -5
- xinference/model/embedding/model_spec.json +7 -0
- xinference/model/embedding/model_spec_modelscope.json +9 -1
- xinference/model/image/stable_diffusion/core.py +12 -0
- xinference/model/llm/llm_family.json +12 -24
- xinference/model/llm/llm_family_modelscope.json +2 -10
- xinference/model/llm/utils.py +14 -3
- xinference/model/llm/vllm/core.py +22 -6
- xinference/model/llm/vllm/utils.py +42 -0
- xinference/model/rerank/core.py +19 -0
- xinference/model/rerank/model_spec.json +8 -0
- xinference/model/rerank/model_spec_modelscope.json +8 -0
- xinference/model/utils.py +0 -25
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/{main.29578905.js → main.e51a356d.js} +3 -3
- xinference/web/ui/build/static/js/main.e51a356d.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4385c1095eefbff0a8ec3b2964ba6e5a66a05ab31be721483ca2f43e2a91f6ff.json +1 -0
- {xinference-0.15.2.dist-info → xinference-0.15.3.dist-info}/METADATA +4 -3
- {xinference-0.15.2.dist-info → xinference-0.15.3.dist-info}/RECORD +26 -25
- xinference/web/ui/build/static/js/main.29578905.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/68bede6d95bb5ef0b35bbb3ec5b8c937eaf6862c6cdbddb5ef222a7776aaf336.json +0 -1
- /xinference/web/ui/build/static/js/{main.29578905.js.LICENSE.txt → main.e51a356d.js.LICENSE.txt} +0 -0
- {xinference-0.15.2.dist-info → xinference-0.15.3.dist-info}/LICENSE +0 -0
- {xinference-0.15.2.dist-info → xinference-0.15.3.dist-info}/WHEEL +0 -0
- {xinference-0.15.2.dist-info → xinference-0.15.3.dist-info}/entry_points.txt +0 -0
- {xinference-0.15.2.dist-info → xinference-0.15.3.dist-info}/top_level.txt +0 -0
xinference/_version.py
CHANGED
@@ -8,11 +8,11 @@ import json

 version_json = '''
 {
-    "date": "2024-09-
+    "date": "2024-09-30T20:17:26+0800",
     "dirty": false,
     "error": null,
-    "full-revisionid": "
-    "version": "0.15.2"
+    "full-revisionid": "00a9ee15279a60a6d75393c4720d8da5cbbf5796",
+    "version": "0.15.3"
 }
 ''' # END VERSION_JSON

xinference/model/embedding/core.py
CHANGED

@@ -141,7 +141,15 @@ class EmbeddingModel:

     def load(self):
         try:
+            import sentence_transformers
             from sentence_transformers import SentenceTransformer
+
+            if sentence_transformers.__version__ < "3.1.0":
+                raise ValueError(
+                    "The sentence_transformers version must be greater than 3.1.0. "
+                    "Please upgrade your version via `pip install -U sentence_transformers` or refer to "
+                    "https://github.com/UKPLab/sentence-transformers"
+                )
         except ImportError:
             error_message = "Failed to import module 'SentenceTransformer'"
             installation_guide = [
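Note: the gate above compares version strings lexicographically, so a hypothetical "3.10.0" would sort before "3.2.0" and wrongly trip the check (the vLLM version gates further down use the same pattern). A more robust comparison is sketched below with the packaging library; its availability is an assumption here, though it is typically present wherever transformers is installed:

from packaging import version

import sentence_transformers

# Semantic version comparison instead of a lexicographic string comparison
if version.parse(sentence_transformers.__version__) < version.parse("3.1.0"):
    raise ValueError("sentence-transformers >= 3.1.0 is required")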
@@ -173,9 +181,6 @@ class EmbeddingModel:
             )
             torch_dtype = torch.float32

-        from ..utils import patch_trust_remote_code
-
-        patch_trust_remote_code()
         if (
             "gte" in self._model_spec.model_name.lower()
             and "qwen2" in self._model_spec.model_name.lower()
@@ -191,7 +196,10 @@ class EmbeddingModel:
         else:
             model_kwargs = {"torch_dtype": torch_dtype} if torch_dtype else None
         self._model = SentenceTransformer(
-            self._model_path,
+            self._model_path,
+            device=self._device,
+            model_kwargs=model_kwargs,
+            trust_remote_code=True,
         )

     def create_embedding(self, sentences: Union[str, List[str]], **kwargs):
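Passing trust_remote_code=True at each load replaces the global patch_trust_remote_code monkey-patch that this release deletes from xinference/model/utils.py (see below). A standalone sketch of the equivalent call; the path, device and dtype are placeholders, not values from this diff:

import torch
from sentence_transformers import SentenceTransformer

model = SentenceTransformer(
    "/path/to/downloaded/model",                  # placeholder local path
    device="cuda",                                # whatever self._device resolved to
    model_kwargs={"torch_dtype": torch.float16},  # e.g. fp16 on GPU
    trust_remote_code=True,                       # scoped opt-in instead of a global patch
)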
@@ -213,6 +221,7 @@ class EmbeddingModel:
         convert_to_tensor: bool = False,
         device: str = None,
         normalize_embeddings: bool = False,
+        **kwargs,
     ):
         """
         Computes sentence embeddings
@@ -317,7 +326,7 @@ class EmbeddingModel:
                     all_token_nums += features["attention_mask"].sum().item()

                 with torch.no_grad():
-                    out_features = model.forward(features)
+                    out_features = model.forward(features, **kwargs)

                 if output_value == "token_embeddings":
                     embeddings = []
xinference/model/embedding/model_spec.json
CHANGED

@@ -238,5 +238,12 @@
     "language": ["zh", "en"],
     "model_id": "Alibaba-NLP/gte-Qwen2-7B-instruct",
     "model_revision": "e26182b2122f4435e8b3ebecbf363990f409b45b"
+  },
+  {
+    "model_name": "jina-embeddings-v3",
+    "dimensions": 1024,
+    "max_tokens": 8192,
+    "language": ["zh", "en"],
+    "model_id": "jinaai/jina-embeddings-v3"
   }
 ]
xinference/model/embedding/model_spec_modelscope.json
CHANGED

@@ -233,12 +233,20 @@
     "model_id": "AI-ModelScope/m3e-large",
     "model_hub": "modelscope"
   },
-  {
+  {
     "model_name": "gte-Qwen2",
     "dimensions": 4096,
     "max_tokens": 32000,
     "language": ["zh", "en"],
     "model_id": "iic/gte_Qwen2-7B-instruct",
     "model_hub": "modelscope"
+  },
+  {
+    "model_name": "jina-embeddings-v3",
+    "dimensions": 1024,
+    "max_tokens": 8192,
+    "language": ["zh", "en"],
+    "model_id": "jinaai/jina-embeddings-v3",
+    "model_hub": "modelscope"
   }
 ]
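With these spec entries in place, the new embedding model can be launched like any other. A hedged example using the xinference client API (the server address is a placeholder):

from xinference.client import Client

client = Client("http://localhost:9997")  # placeholder address
uid = client.launch_model(model_name="jina-embeddings-v3", model_type="embedding")
model = client.get_model(uid)
print(model.create_embedding("hello")["data"][0]["embedding"][:8])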
xinference/model/image/stable_diffusion/core.py
CHANGED

@@ -193,6 +193,18 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
             self._model_path,
             **self._kwargs,
         )
+        if self._kwargs.get("deepcache", True):
+            # NOTE: DeepCache should be loaded first before cpu_offloading
+            try:
+                from DeepCache import DeepCacheSDHelper
+
+                helper = DeepCacheSDHelper(pipe=self._model)
+                helper.set_params(cache_interval=3, cache_branch_id=0)
+                helper.enable()
+            except ImportError:
+                logger.debug("deepcache is not installed")
+                pass
+
         if self._kwargs.get("cpu_offload", False):
             logger.debug("CPU offloading model")
             self._model.enable_model_cpu_offload()
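Because the deepcache kwarg defaults to True in the branch above, DeepCache is enabled whenever the DeepCache package is installed. Turning it off per launch is presumably a matter of forwarding the kwarg through launch_model; a sketch reusing the client from the embedding example (the model name is illustrative):

uid = client.launch_model(
    model_name="sd3-medium",  # hypothetical image model name
    model_type="image",
    deepcache=False,          # skip the DeepCacheSDHelper branch
)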
xinference/model/llm/llm_family.json
CHANGED

@@ -6483,8 +6483,7 @@
         "8-bit",
         "none"
       ],
-      "model_id": "OpenGVLab/InternVL2-1B",
-      "model_revision": "a9fc14aea824b6ea1d44f8778cad6b35512c4ce1"
+      "model_id": "OpenGVLab/InternVL2-1B"
     },
     {
       "model_format": "pytorch",

@@ -6494,8 +6493,7 @@
         "8-bit",
         "none"
       ],
-      "model_id": "OpenGVLab/InternVL2-2B",
-      "model_revision": "422ad7c6335917bfb514958233955512338485a6"
+      "model_id": "OpenGVLab/InternVL2-2B"
     },
     {
       "model_format": "awq",

@@ -6503,8 +6501,7 @@
       "quantizations": [
         "Int4"
       ],
-      "model_id": "OpenGVLab/InternVL2-2B-AWQ",
-      "model_revision": "701bc3fc098a8a3b686b3b4135cfb77202be89e0"
+      "model_id": "OpenGVLab/InternVL2-2B-AWQ"
     },
     {
       "model_format": "pytorch",

@@ -6514,8 +6511,7 @@
         "8-bit",
         "none"
       ],
-      "model_id": "OpenGVLab/InternVL2-4B",
-      "model_revision": "b50544dafada6c41e80bfde2f57cc9b0140fc21c"
+      "model_id": "OpenGVLab/InternVL2-4B"
     },
     {
       "model_format": "pytorch",

@@ -6525,8 +6521,7 @@
         "8-bit",
         "none"
       ],
-      "model_id": "OpenGVLab/InternVL2-8B",
-      "model_revision": "3bfd3664dea4f3da628785f5125d30f889701253"
+      "model_id": "OpenGVLab/InternVL2-8B"
     },
     {
       "model_format": "awq",

@@ -6534,8 +6529,7 @@
       "quantizations": [
         "Int4"
       ],
-      "model_id": "OpenGVLab/InternVL2-8B-AWQ",
-      "model_revision": "9f1a4756b7ae18eb26d8a22b618dfc283e8193b3"
+      "model_id": "OpenGVLab/InternVL2-8B-AWQ"
     },
     {
       "model_format": "pytorch",

@@ -6545,8 +6539,7 @@
         "8-bit",
         "none"
       ],
-      "model_id": "OpenGVLab/InternVL2-26B",
-      "model_revision": "b9f3c7e6d575b0115e076a3ffc46fd20b7586899"
+      "model_id": "OpenGVLab/InternVL2-26B"
     },
     {
       "model_format": "awq",

@@ -6554,8 +6547,7 @@
       "quantizations": [
         "Int4"
       ],
-      "model_id": "OpenGVLab/InternVL2-26B-AWQ",
-      "model_revision": "469e0019ffd251e22ff6501a5c2321964e86ef0d"
+      "model_id": "OpenGVLab/InternVL2-26B-AWQ"
     },
     {
       "model_format": "pytorch",

@@ -6565,8 +6557,7 @@
         "8-bit",
         "none"
       ],
-      "model_id": "OpenGVLab/InternVL2-40B",
-      "model_revision": "725a12063bb855c966e30a0617d0ccd9e870d772"
+      "model_id": "OpenGVLab/InternVL2-40B"
     },
     {
       "model_format": "awq",

@@ -6574,8 +6565,7 @@
       "quantizations": [
         "Int4"
       ],
-      "model_id": "OpenGVLab/InternVL2-40B-AWQ",
-      "model_revision": "d92e140f6dfe8ea9679924c6a31898f42c4e1846"
+      "model_id": "OpenGVLab/InternVL2-40B-AWQ"
     },
     {
       "model_format": "pytorch",

@@ -6585,8 +6575,7 @@
         "8-bit",
         "none"
       ],
-      "model_id": "OpenGVLab/InternVL2-Llama3-76B",
-      "model_revision": "cf7914905f78e9e3560ddbd6f5dfc39becac494f"
+      "model_id": "OpenGVLab/InternVL2-Llama3-76B"
     },
     {
       "model_format": "awq",

@@ -6594,8 +6583,7 @@
       "quantizations": [
         "Int4"
       ],
-      "model_id": "OpenGVLab/InternVL2-Llama3-76B-AWQ",
-      "model_revision": "1bc796bf80f2ebc7d6a14c15f55217a4600d50a4"
+      "model_id": "OpenGVLab/InternVL2-Llama3-76B-AWQ"
     }
   ],
   "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
xinference/model/llm/llm_family_modelscope.json
CHANGED

@@ -4334,16 +4334,8 @@
       }
     ],
     "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
-    "stop_token_ids": [
-      151643,
-      151644,
-      151645
-    ],
-    "stop": [
-      "<|endoftext|>",
-      "<|im_start|>",
-      "<|im_end|>"
-    ]
+    "stop_token_ids": [],
+    "stop": []
   },
   {
     "version": 1,
xinference/model/llm/utils.py
CHANGED

@@ -159,14 +159,25 @@ class ChatModelMixin:
             for image_url in image_urls:
                 fut = executor.submit(_decode_image, image_url)
                 image_futures.append(fut)
-            images = [fut.result() for fut in image_futures]
+            images.extend([fut.result() for fut in image_futures])
             if len(image_futures) == 0:
                 ret += role + "\n" + text + intra_message_sep + "\n"
             else:
+                placeholders = "\n".join(
+                    f"Image-{i+1}: <image>\n"
+                    for i in range(
+                        len(images) - len(image_futures), len(images)
+                    )
+                )
                 ret += (
-                    role
+                    role
+                    + "\n"
+                    + f"{placeholders}\n{text}"
+                    + intra_message_sep
+                    + "\n"
                 )
-
+                if len(images) == 1:
+                    ret = ret.replace("Image-1: <image>\n", "<image>\n")
             return ret, images
         else:
            raise ValueError(f"Invalid model family: {model_family}")
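The range in the placeholder loop starts at len(images) - len(image_futures) so that images decoded in earlier turns keep their numbering. A small self-contained illustration of the resulting prompt fragment (the role and text are made up, and the real code also appends intra_message_sep):

images = ["img_a", "img_b"]   # decoded PIL images in the real code
image_futures = images        # pretend both were decoded in this turn
placeholders = "\n".join(
    f"Image-{i+1}: <image>\n"
    for i in range(len(images) - len(image_futures), len(images))
)
print("USER" + "\n" + f"{placeholders}\nWhat differs between them?")
# USER
# Image-1: <image>
#
# Image-2: <image>
#
# What differs between them?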
xinference/model/llm/vllm/core.py
CHANGED

@@ -13,6 +13,7 @@
 # limitations under the License.

 import asyncio
+import json
 import logging
 import multiprocessing
 import os
@@ -47,6 +48,7 @@ from ..utils import (
     ChatModelMixin,
     generate_completion_chunk,
 )
+from .utils import vllm_check

 logger = logging.getLogger(__name__)

@@ -65,6 +67,7 @@ class VLLMModelConfig(TypedDict, total=False):
     max_num_seqs: int
     quantization: Optional[str]
     max_model_len: Optional[int]
+    limit_mm_per_prompt: Optional[Dict[str, int]]


 class VLLMGenerateConfig(TypedDict, total=False):
@@ -90,9 +93,7 @@ try:
 except ImportError:
     VLLM_INSTALLED = False

-VLLM_SUPPORTED_VISION_MODEL_LIST: List[str] = [
-    "internvl2",
-]
+VLLM_SUPPORTED_VISION_MODEL_LIST: List[str] = []
 VLLM_SUPPORTED_MODELS = [
     "llama-2",
     "llama-3",
@@ -171,6 +172,9 @@ if VLLM_INSTALLED and vllm.__version__ > "0.5.3":
     VLLM_SUPPORTED_MODELS.append("llama-3.1")
     VLLM_SUPPORTED_CHAT_MODELS.append("llama-3.1-instruct")

+if VLLM_INSTALLED and vllm.__version__ >= "0.6.1":
+    VLLM_SUPPORTED_VISION_MODEL_LIST.append("internvl2")
+

 class VLLMModel(LLM):
     def __init__(
@@ -304,7 +308,12 @@ class VLLMModel(LLM):
         model_config.setdefault("gpu_memory_utilization", 0.90)
         model_config.setdefault("max_num_seqs", 256)
         model_config.setdefault("quantization", None)
-        model_config.setdefault("max_model_len",
+        model_config.setdefault("max_model_len", None)
+        model_config["limit_mm_per_prompt"] = (
+            json.loads(model_config.get("limit_mm_per_prompt"))  # type: ignore
+            if model_config.get("limit_mm_per_prompt")
+            else None
+        )

         return model_config

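Since the sanitizer decodes the value with json.loads, limit_mm_per_prompt is expected to arrive as a JSON string. A hedged launch example (the model name and image count are illustrative):

from xinference.client import Client

client = Client("http://localhost:9997")  # placeholder address
uid = client.launch_model(
    model_name="internvl2",
    model_type="LLM",
    model_engine="vllm",
    limit_mm_per_prompt='{"image": 2}',   # decoded into {"image": 2}
)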
@@ -434,6 +443,7 @@ class VLLMModel(LLM):
             usage=usage,
         )

+    @vllm_check
     async def async_generate(
         self,
         prompt: Union[str, Dict[str, Any]],
@@ -665,6 +675,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
             yield self._to_chat_completion_chunk(chunk)
             i += 1

+    @vllm_check
     async def async_chat(
         self,
         messages: List[Dict],
@@ -741,13 +752,13 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
         )
         return generate_config

+    @vllm_check
     async def async_chat(
         self,
         messages: List[Dict],
         generate_config: Optional[Dict] = None,
         request_id: Optional[str] = None,
     ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]:
-        # only support single image, waiting vllm support multi images
         model_family = self.model_family.model_family or self.model_family.model_name
         prompt, images = self.get_specific_prompt(model_family, messages)

@@ -755,11 +766,16 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
             inputs = {
                 "prompt": prompt,
             }
-        else:
+        elif len(images) == 1:
             inputs = {
                 "prompt": prompt,
                 "multi_modal_data": {"image": images[-1]},  # type: ignore
             }
+        else:
+            inputs = {
+                "prompt": prompt,
+                "multi_modal_data": {"image": images},  # type: ignore
+            }
         generate_config = self._sanitize_chat_config(generate_config)

         stream = generate_config.get("stream", None)
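With the len(images) > 1 branch in place (and the old "only support single image" comment gone), a vision chat can now carry several images in one request. A sketch of the OpenAI-style payload, assuming the messages-based chat API of this release and reusing the client and uid from the previous sketch (URLs are placeholders):

model = client.get_model(uid)
model.chat(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Compare these two pictures."},
                {"type": "image_url", "image_url": {"url": "https://example.com/a.png"}},
                {"type": "image_url", "image_url": {"url": "https://example.com/b.png"}},
            ],
        }
    ]
)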
xinference/model/llm/vllm/utils.py
ADDED

@@ -0,0 +1,42 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import functools
+import logging
+import os
+
+logger = logging.getLogger(__name__)
+
+
+def vllm_check(fn):
+    try:
+        from vllm.engine.async_llm_engine import AsyncEngineDeadError
+    except:
+        return fn
+
+    @functools.wraps(fn)
+    async def _async_wrapper(self, *args, **kwargs):
+        logger.info("vllm_check")
+        try:
+            return await fn(self, *args, **kwargs)
+        except AsyncEngineDeadError:
+            logger.info("Detecting vLLM is not health, prepare to quit the process")
+            try:
+                self.stop()
+            except:
+                # ignore error when stop
+                pass
+            # Just kill the process and let xinference auto-recover the model
+            os._exit(1)
+
+    return _async_wrapper
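vllm_check is a fail-fast guard for the async entry points decorated above: on AsyncEngineDeadError it stops the model and then calls os._exit(1), which skips normal interpreter cleanup (a dead engine could hang an orderly shutdown) and lets xinference auto-recover the replica. A minimal usage sketch with a stand-in class:

# assuming: from xinference.model.llm.vllm.utils import vllm_check
class FakeVLLMModel:
    def stop(self):
        print("releasing engine resources")

    @vllm_check
    async def async_generate(self, prompt: str):
        # would raise AsyncEngineDeadError here if the vLLM engine had died
        return f"completion for {prompt!r}"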
xinference/model/rerank/core.py
CHANGED

@@ -13,6 +13,7 @@
 # limitations under the License.

 import gc
+import importlib
 import logging
 import os
 import threading
@@ -178,9 +179,27 @@ class RerankModel:
         return rerank_type

     def load(self):
+        flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
+        if (
+            self._auto_detect_type(self._model_path) != "normal"
+            and flash_attn_installed
+        ):
+            logger.warning(
+                "flash_attn can only support fp16 and bf16, "
+                "will force set `use_fp16` to True"
+            )
+            self._use_fp16 = True
         if self._model_spec.type == "normal":
             try:
+                import sentence_transformers
                 from sentence_transformers.cross_encoder import CrossEncoder
+
+                if sentence_transformers.__version__ < "3.1.0":
+                    raise ValueError(
+                        "The sentence_transformers version must be greater than 3.1.0. "
+                        "Please upgrade your version via `pip install -U sentence_transformers` or refer to "
+                        "https://github.com/UKPLab/sentence-transformers"
+                    )
             except ImportError:
                 error_message = "Failed to import module 'sentence-transformers'"
                 installation_guide = [
xinference/model/rerank/model_spec.json
CHANGED

@@ -54,5 +54,13 @@
     "max_tokens": 1024,
     "model_id": "jinaai/jina-reranker-v2-base-multilingual",
     "model_revision": "298e48cada4a9318650d7fbd795f63827f884087"
+  },
+  {
+    "model_name": "minicpm-reranker",
+    "type": "normal",
+    "language": ["en", "zh"],
+    "max_tokens": 1024,
+    "model_id": "openbmb/MiniCPM-Reranker",
+    "model_revision": "5d2fd7345b6444c89d4c0fa59c92272888f3f2d0"
   }
 ]
xinference/model/rerank/model_spec_modelscope.json
CHANGED

@@ -49,5 +49,13 @@
     "max_tokens": 2048,
     "model_id": "mirror013/bge-reranker-v2-minicpm-layerwise",
     "model_hub": "modelscope"
+  },
+  {
+    "model_name": "minicpm-reranker",
+    "type": "normal",
+    "language": ["en", "zh"],
+    "max_tokens": 1024,
+    "model_id": "OpenBMB/MiniCPM-Reranker",
+    "model_hub": "modelscope"
   }
 ]
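Note the differing hub capitalization of the new entry: openbmb/MiniCPM-Reranker on Hugging Face versus OpenBMB/MiniCPM-Reranker on ModelScope. Once registered, the model is exercised like any other reranker; a hedged sketch reusing the client from above:

uid = client.launch_model(model_name="minicpm-reranker", model_type="rerank")
reranker = client.get_model(uid)
print(
    reranker.rerank(
        documents=[
            "MiniCPM is a family of small language models.",
            "Paris is the capital of France.",
        ],
        query="Which passage talks about small language models?",
    )
)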
xinference/model/utils.py
CHANGED

@@ -300,31 +300,6 @@ def cache(model_spec: CacheableModelSpec, model_description_type: type):
     return cache_dir


-def patch_trust_remote_code():
-    """sentence-transformers calls transformers without the trust_remote_code=True, some embedding
-    models will fail to load, e.g. jina-embeddings-v2-base-en
-
-    :return:
-    """
-    try:
-        from transformers.dynamic_module_utils import resolve_trust_remote_code
-    except ImportError:
-        logger.error("Patch transformers trust_remote_code failed.")
-    else:
-
-        def _patched_resolve_trust_remote_code(*args, **kwargs):
-            logger.info("Patched resolve_trust_remote_code: %s %s", args, kwargs)
-            return True
-
-        if (
-            resolve_trust_remote_code.__code__
-            != _patched_resolve_trust_remote_code.__code__
-        ):
-            resolve_trust_remote_code.__code__ = (
-                _patched_resolve_trust_remote_code.__code__
-            )
-
-
 def select_device(device):
     try:
         import torch  # noqa: F401
xinference/web/ui/build/asset-manifest.json
CHANGED

@@ -1,14 +1,14 @@
 {
   "files": {
     "main.css": "./static/css/main.5061c4c3.css",
-    "main.js": "./static/js/main.29578905.js",
+    "main.js": "./static/js/main.e51a356d.js",
     "static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
     "index.html": "./index.html",
     "main.5061c4c3.css.map": "./static/css/main.5061c4c3.css.map",
-    "main.29578905.js.map": "./static/js/main.29578905.js.map"
+    "main.e51a356d.js.map": "./static/js/main.e51a356d.js.map"
   },
   "entrypoints": [
     "static/css/main.5061c4c3.css",
-    "static/js/main.29578905.js"
+    "static/js/main.e51a356d.js"
   ]
 }
xinference/web/ui/build/index.html
CHANGED

@@ -1 +1 @@
-<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.29578905.js"></script><link href="./static/css/main.5061c4c3.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
+<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.e51a356d.js"></script><link href="./static/css/main.5061c4c3.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>