PyPI - xinference - Versions diffs - 0.10.0__py3-none-any.whl → 0.10.1__py3-none-any.whl - Mend

xinference 0.10.0__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of xinference might be problematic. Click here for more details.

Files changed (76) hide show

xinference/_version.py CHANGED Viewed

@@ -8,11 +8,11 @@ import json
 version_json = '''
 {
- "date": "2024-03-29T12:46:14+0800",
+ "date": "2024-04-11T15:35:46+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "2857ec497afbd2a6895d3658384ff3b4022b2840",
- "version": "0.10.0"
+ "full-revisionid": "e3a947ebddfc53b5e8ec723c1f632c2b895edef1",
+ "version": "0.10.1"
 }
 '''  # END VERSION_JSON

xinference/api/restful_api.py CHANGED Viewed

@@ -1007,8 +1007,16 @@ class RESTfulAPI:
                 raise HTTPException(status_code=500, detail=str(e))
     async def create_embedding(self, request: Request) -> Response:
-        body = CreateEmbeddingRequest.parse_obj(await request.json())
+        payload = await request.json()
+        body = CreateEmbeddingRequest.parse_obj(payload)
         model_uid = body.model
+        exclude = {
+            "model",
+            "input",
+            "user",
+            "encoding_format",
+        }
+        kwargs = {key: value for key, value in payload.items() if key not in exclude}
         try:
             model = await (await self._get_supervisor_ref()).get_model(model_uid)
@@ -1022,7 +1030,7 @@ class RESTfulAPI:
             raise HTTPException(status_code=500, detail=str(e))
         try:
-            embedding = await model.create_embedding(body.input)
+            embedding = await model.create_embedding(body.input, **kwargs)
             return Response(embedding, media_type="application/json")
         except RuntimeError as re:
             logger.error(re, exc_info=True)
@@ -1035,8 +1043,15 @@ class RESTfulAPI:
             raise HTTPException(status_code=500, detail=str(e))
     async def rerank(self, request: Request) -> Response:
-        body = RerankRequest.parse_obj(await request.json())
+        payload = await request.json()
+        body = RerankRequest.parse_obj(payload)
         model_uid = body.model
+        kwargs = {
+            key: value
+            for key, value in payload.items()
+            if key not in RerankRequest.__annotations__.keys()
+        }
         try:
             model = await (await self._get_supervisor_ref()).get_model(model_uid)
         except ValueError as ve:
@@ -1055,6 +1070,7 @@ class RESTfulAPI:
                 top_n=body.top_n,
                 max_chunks_per_doc=body.max_chunks_per_doc,
                 return_documents=body.return_documents,
+                **kwargs,
             )
             return Response(scores, media_type="application/json")
         except RuntimeError as re:
@@ -1345,9 +1361,12 @@ class RESTfulAPI:
                     detail=f"Only {function_call_models} support tool messages",
                 )
         if body.tools and body.stream:
-            raise HTTPException(
-                status_code=400, detail="Tool calls does not support stream"
-            )
+            is_vllm = await model.is_vllm_backend()
+            if not is_vllm or model_family not in ["qwen-chat", "qwen1.5-chat"]:
+                raise HTTPException(
+                    status_code=400,
+                    detail="Streaming support for tool calls is available only when using vLLM backend and Qwen models.",
+                )
         if body.stream:

xinference/client/oscar/actor_client.py CHANGED Viewed

@@ -111,7 +111,7 @@ class ClientIteratorWrapper(AsyncIterator):
 class EmbeddingModelHandle(ModelHandle):
-    def create_embedding(self, input: Union[str, List[str]]) -> bytes:
+    def create_embedding(self, input: Union[str, List[str]], **kwargs) -> bytes:
         """
         Creates an embedding vector representing the input text.
@@ -128,7 +128,7 @@ class EmbeddingModelHandle(ModelHandle):
             machine learning models and algorithms.
         """
-        coro = self._model_ref.create_embedding(input)
+        coro = self._model_ref.create_embedding(input, **kwargs)
         return orjson.loads(self._isolation.call(coro))
@@ -140,6 +140,7 @@ class RerankModelHandle(ModelHandle):
         top_n: Optional[int],
         max_chunks_per_doc: Optional[int],
         return_documents: Optional[bool],
+        **kwargs,
     ):
         """
         Returns an ordered list of documents ordered by their relevance to the provided query.
@@ -163,7 +164,7 @@ class RerankModelHandle(ModelHandle):
         """
         coro = self._model_ref.rerank(
-            documents, query, top_n, max_chunks_per_doc, return_documents
+            documents, query, top_n, max_chunks_per_doc, return_documents, **kwargs
         )
         results = orjson.loads(self._isolation.call(coro))
         for r in results["results"]:

xinference/client/restful/restful_client.py CHANGED Viewed

@@ -80,7 +80,7 @@ class RESTfulModelHandle:
 class RESTfulEmbeddingModelHandle(RESTfulModelHandle):
-    def create_embedding(self, input: Union[str, List[str]]) -> "Embedding":
+    def create_embedding(self, input: Union[str, List[str]], **kwargs) -> "Embedding":
         """
         Create an Embedding from user input via RESTful APIs.
@@ -102,7 +102,11 @@ class RESTfulEmbeddingModelHandle(RESTfulModelHandle):
         """
         url = f"{self._base_url}/v1/embeddings"
-        request_body = {"model": self._model_uid, "input": input}
+        request_body = {
+            "model": self._model_uid,
+            "input": input,
+        }
+        request_body.update(kwargs)
         response = requests.post(url, json=request_body, headers=self.auth_headers)
         if response.status_code != 200:
             raise RuntimeError(
@@ -121,6 +125,7 @@ class RESTfulRerankModelHandle(RESTfulModelHandle):
         top_n: Optional[int] = None,
         max_chunks_per_doc: Optional[int] = None,
         return_documents: Optional[bool] = None,
+        **kwargs,
     ):
         """
         Returns an ordered list of documents ordered by their relevance to the provided query.
@@ -156,6 +161,7 @@ class RESTfulRerankModelHandle(RESTfulModelHandle):
             "max_chunks_per_doc": max_chunks_per_doc,
             "return_documents": return_documents,
         }
+        request_body.update(kwargs)
         response = requests.post(url, json=request_body, headers=self.auth_headers)
         if response.status_code != 200:
             raise RuntimeError(

xinference/core/supervisor.py CHANGED Viewed

@@ -870,6 +870,12 @@ class SupervisorActor(xo.StatelessActor):
                             address,
                             dead_models,
                         )
+                        for replica_model_uid in dead_models:
+                            model_uid, _, _ = parse_replica_model_uid(replica_model_uid)
+                            self._model_uid_to_replica_info.pop(model_uid, None)
+                            self._replica_model_uid_to_worker.pop(
+                                replica_model_uid, None
+                            )
                         dead_nodes.append(address)
                     elif (
                         status.failure_remaining_count
@@ -979,6 +985,16 @@ class SupervisorActor(xo.StatelessActor):
     @log_async(logger=logger)
     async def remove_worker(self, worker_address: str):
+        uids_to_remove = []
+        for model_uid in self._replica_model_uid_to_worker:
+            if self._replica_model_uid_to_worker[model_uid].address == worker_address:
+                uids_to_remove.append(model_uid)
+        for replica_model_uid in uids_to_remove:
+            model_uid, _, _ = parse_replica_model_uid(replica_model_uid)
+            self._model_uid_to_replica_info.pop(model_uid, None)
+            self._replica_model_uid_to_worker.pop(replica_model_uid, None)
         if worker_address in self._worker_address_to_worker:
             del self._worker_address_to_worker[worker_address]
             logger.debug("Worker %s has been removed successfully", worker_address)

xinference/model/embedding/core.py CHANGED Viewed

@@ -136,7 +136,7 @@ class EmbeddingModel:
     def create_embedding(self, sentences: Union[str, List[str]], **kwargs):
         from sentence_transformers import SentenceTransformer
-        normalize_embeddings = kwargs.pop("normalize_embeddings", True)
+        kwargs.setdefault("normalize_embeddings", True)
         # copied from sentence-transformers, and modify it to return tokens num
         @no_type_check
@@ -272,7 +272,6 @@ class EmbeddingModel:
             self._model,
             sentences,
             convert_to_numpy=False,
-            normalize_embeddings=normalize_embeddings,
             **kwargs,
         )
         if isinstance(sentences, str):

xinference/model/llm/__init__.py CHANGED Viewed

@@ -49,7 +49,6 @@ from .llm_family import (
 def _install():
     from .ggml.chatglm import ChatglmCppChatModel
-    from .ggml.ctransformers import CtransformersModel
     from .ggml.llamacpp import LlamaCppChatModel, LlamaCppModel
     from .pytorch.baichuan import BaichuanPytorchChatModel
     from .pytorch.chatglm import ChatglmPytorchChatModel
@@ -77,11 +76,6 @@ def _install():
             ChatglmCppChatModel,
         ]
     )
-    LLM_CLASSES.extend(
-        [
-            CtransformersModel,
-        ]
-    )
     LLM_CLASSES.extend([SGLANGModel, SGLANGChatModel])
     LLM_CLASSES.extend([VLLMModel, VLLMChatModel])
     LLM_CLASSES.extend(

xinference/model/llm/ggml/llamacpp.py CHANGED Viewed

@@ -30,7 +30,6 @@ from ....types import (
 from ..core import LLM
 from ..llm_family import LLMFamilyV1, LLMSpecV1
 from ..utils import ChatModelMixin
-from .ctransformers import CTRANSFORMERS_SUPPORTED_MODEL
 logger = logging.getLogger(__name__)
@@ -182,11 +181,7 @@ class LlamaCppModel(LLM):
     ) -> bool:
         if llm_spec.model_format not in ["ggmlv3", "ggufv2"]:
             return False
-        if (
-            "chatglm" in llm_family.model_name
-            or "qwen" in llm_family.model_name
-            or llm_family.model_name in CTRANSFORMERS_SUPPORTED_MODEL
-        ):
+        if "chatglm" in llm_family.model_name or "qwen" in llm_family.model_name:
             return False
         if "generate" not in llm_family.model_ability:
             return False
@@ -250,10 +245,7 @@ class LlamaCppChatModel(LlamaCppModel, ChatModelMixin):
     ) -> bool:
         if llm_spec.model_format not in ["ggmlv3", "ggufv2"]:
             return False
-        if (
-            "chatglm" in llm_family.model_name
-            or llm_family.model_name in CTRANSFORMERS_SUPPORTED_MODEL
-        ):
+        if "chatglm" in llm_family.model_name:
             return False
         if "chat" not in llm_family.model_ability:
             return False

xinference/model/llm/llm_family.json CHANGED Viewed

@@ -913,6 +913,38 @@
         "model_id": "meta-llama/Llama-2-7b-chat-hf",
         "model_revision": "08751db2aca9bf2f7f80d2e516117a53d7450235"
       },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Llama-2-7B-Chat-GPTQ"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 70,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Llama-2-70B-Chat-GPTQ"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 70,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Llama-2-70B-Chat-AWQ"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Llama-2-7B-Chat-AWQ"
+      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 13,
@@ -924,6 +956,22 @@
         "model_id": "meta-llama/Llama-2-13b-chat-hf",
         "model_revision": "0ba94ac9b9e1d5a0037780667e8b219adde1908c"
       },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 13,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Llama-2-13B-chat-GPTQ"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 13,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Llama-2-13B-chat-AWQ"
+      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 70,
@@ -1045,6 +1093,22 @@
         "model_id": "TheBloke/Llama-2-7B-GGML",
         "model_file_name_template": "llama-2-7b.ggmlv3.{quantization}.bin"
       },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Llama-2-7B-GPTQ"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Llama-2-7B-AWQ"
+      },
       {
         "model_format": "ggmlv3",
         "model_size_in_billions": 13,
@@ -1111,6 +1175,22 @@
         "model_id": "meta-llama/Llama-2-13b-hf",
         "model_revision": "db6b8eb1feabb38985fdf785a89895959e944936"
       },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 13,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Llama-2-13B-GPTQ"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 13,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Llama-2-13B-AWQ"
+      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 70,
@@ -1121,6 +1201,22 @@
         ],
         "model_id": "meta-llama/Llama-2-70b-hf",
         "model_revision": "cc8aa03a000ff08b4d5c5b39673321a2a396c396"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 70,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Llama-2-70B-GPTQ"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 70,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Llama-2-70B-AWQ"
       }
     ]
   },
@@ -1509,6 +1605,16 @@
         ],
         "model_id": "Qwen/Qwen1.5-14B-Chat"
       },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen1.5-32B-Chat"
+      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 72,
@@ -1564,6 +1670,14 @@
         ],
         "model_id": "Qwen/Qwen1.5-14B-Chat-GPTQ-{quantization}"
       },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "Qwen/Qwen1.5-32B-Chat-GPTQ-{quantization}"
+      },
       {
         "model_format": "gptq",
         "model_size_in_billions": 72,
@@ -1613,6 +1727,14 @@
         ],
         "model_id": "Qwen/Qwen1.5-14B-Chat-AWQ"
       },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "Qwen/Qwen1.5-32B-Chat-AWQ"
+      },
       {
         "model_format": "awq",
         "model_size_in_billions": 72,
@@ -1701,6 +1823,22 @@
         "model_id": "Qwen/Qwen1.5-14B-Chat-GGUF",
         "model_file_name_template": "qwen1_5-14b-chat-{quantization}.gguf"
       },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "q2_k",
+          "q3_k_m",
+          "q4_0",
+          "q4_k_m",
+          "q5_0",
+          "q5_k_m",
+          "q6_k",
+          "q8_0"
+        ],
+        "model_id": "Qwen/Qwen1.5-32B-Chat-GGUF",
+        "model_file_name_template": "qwen1_5-32b-chat-{quantization}.gguf"
+      },
       {
         "model_format": "ggufv2",
         "model_size_in_billions": 72,
@@ -1740,6 +1878,58 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "qwen1.5-moe-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Qwen1.5-MoE is a transformer-based MoE decoder-only language model pretrained on a large amount of data.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "2_7",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen1.5-MoE-A2.7B-Chat"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": "2_7",
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "QWEN",
+      "system_prompt": "You are a helpful assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n",
+      "stop_token_ids": [
+        151643,
+        151644,
+        151645
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|im_start|>",
+        "<|im_end|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 8192,
@@ -1780,13 +1970,13 @@
     "model_description": "GPT-2 is a Transformer-based LLM that is trained on WebTest, a 40 GB dataset of Reddit posts with 3+ upvotes.",
     "model_specs": [
       {
-        "model_format": "ggmlv3",
-        "model_size_in_billions": 1,
+        "model_format": "pytorch",
+        "model_size_in_billions": "1_5",
         "quantizations": [
           "none"
         ],
-        "model_id": "marella/gpt-2-ggml",
-        "model_file_name_template": "ggml-model.bin"
+        "model_id": "openai-community/gpt2",
+        "model_revision": "607a30d783dfa663caf39e06633721c8d4cfcd7e"
       }
     ]
   },
@@ -2569,6 +2759,22 @@
         "model_id": "mistralai/Mistral-7B-Instruct-v0.1",
         "model_revision": "54766df6d50e4d3d7ccd66758e5341ba105a6d36"
       },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Mistral-7B-Instruct-v0.1-AWQ"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ"
+      },
       {
         "model_format": "ggufv2",
         "model_size_in_billions": 7,
@@ -2630,6 +2836,22 @@
         "model_id": "mistralai/Mistral-7B-Instruct-v0.2",
         "model_revision": "b70aa86578567ba3301b21c8a27bea4e8f6d6d61"
       },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Mistral-7B-Instruct-v0.2-GPTQ"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Mistral-7B-Instruct-v0.2-AWQ"
+      },
       {
         "model_format": "ggufv2",
         "model_size_in_billions": 7,
@@ -2790,6 +3012,14 @@
         "model_id": "mistralai/Mixtral-8x7B-v0.1",
         "model_revision": "58301445dc1378584211722b7ebf8743ec4e192b"
       },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": "46_7",
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Mixtral-8x7B-v0.1-GPTQ"
+      },
       {
         "model_format": "ggufv2",
         "model_size_in_billions": "46_7",
@@ -2839,10 +3069,17 @@
         "model_format": "awq",
         "model_size_in_billions": "46_7",
         "quantizations": [
-          "4-bit"
+          "Int4"
+        ],
+        "model_id": "TheBloke/Mixtral-8x7B-Instruct-v0.1-AWQ"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": "46_7",
+        "quantizations": [
+          "Int4"
         ],
-        "model_id": "TheBloke/Mixtral-8x7B-Instruct-v0.1-AWQ",
-        "model_revision": "9afb6f0a7d7fe9ecebdda1baa4ff4e13e73e97d7"
+        "model_id": "TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ"
       },
       {
         "model_format": "ggufv2",

xinference/model/llm/llm_family.py CHANGED Viewed

@@ -199,6 +199,21 @@ class CustomLLMFamilyV1(LLMFamilyV1):
                 )
             llm_spec.prompt_style = BUILTIN_LLM_PROMPT_STYLE[prompt_style_name]
+        # check model ability, registering LLM only provides generate and chat
+        # but for vision models, we add back the abilities so that
+        # gradio chat interface can be generated properly
+        if (
+            llm_spec.model_family != "other"
+            and llm_spec.model_family
+            in {
+                family.model_name
+                for family in BUILTIN_LLM_FAMILIES
+                if "vision" in family.model_ability
+            }
+            and "vision" not in llm_spec.model_ability
+        ):
+            llm_spec.model_ability.append("vision")
         return llm_spec

xinference 0.10.0__py3-none-any.whl → 0.10.1__py3-none-any.whl

Potentially problematic release.

xinference 0.10.0__py3-none-any.whl → 0.10.1__py3-none-any.whl