PyPI - xinference - Versions diffs - 0.6.4__py3-none-any.whl → 0.7.0__py3-none-any.whl - Mend

xinference 0.6.4__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of xinference might be problematic. Click here for more details.

Files changed (258) hide show

xinference/deploy/supervisor.py CHANGED Viewed

@@ -22,6 +22,10 @@ from typing import Dict, Optional
 import xoscar as xo
 from xoscar.utils import get_next_port
+from ..constants import (
+    XINFERENCE_HEALTH_CHECK_ATTEMPTS,
+    XINFERENCE_HEALTH_CHECK_INTERVAL,
+)
 from ..core.supervisor import SupervisorActor
 from .utils import health_check
@@ -66,11 +70,20 @@ def run_in_subprocess(
     return p
-def main(host: str, port: int, logging_conf: Optional[Dict] = None):
-    supervisor_address = f"{host}:{get_next_port()}"
+def main(
+    host: str,
+    port: int,
+    supervisor_port: Optional[int],
+    logging_conf: Optional[Dict] = None,
+):
+    supervisor_address = f"{host}:{supervisor_port or get_next_port()}"
     local_cluster = run_in_subprocess(supervisor_address, logging_conf)
-    if not health_check(address=supervisor_address, max_attempts=3, sleep_interval=1):
+    if not health_check(
+        address=supervisor_address,
+        max_attempts=XINFERENCE_HEALTH_CHECK_ATTEMPTS,
+        sleep_interval=XINFERENCE_HEALTH_CHECK_INTERVAL,
+    ):
         raise RuntimeError("Supervisor is not available after multiple attempts")
     try:

xinference/deploy/utils.py CHANGED Viewed

@@ -110,6 +110,7 @@ async def create_worker_actor_pool(
     return await xo.create_actor_pool(
         address=address,
         n_process=0,
+        auto_recover="process",
         subprocess_start_method=subprocess_start_method,
         logging_conf={"dict": logging_conf},
     )

xinference/deploy/worker.py CHANGED Viewed

@@ -18,10 +18,10 @@ import os
 from typing import Any, Optional
 import xoscar as xo
-from xorbits._mars.resource import cuda_count
 from xoscar import MainActorPoolType
 from ..core.worker import WorkerActor
+from ..utils import cuda_count
 logger = logging.getLogger(__name__)

xinference/model/embedding/__init__.py CHANGED Viewed

@@ -33,5 +33,15 @@ MODELSCOPE_EMBEDDING_MODELS = dict(
         codecs.open(_model_spec_modelscope_json, "r", encoding="utf-8")
     )
 )
+from ...constants import XINFERENCE_MODEL_DIR
+user_defined_llm_dir = os.path.join(XINFERENCE_MODEL_DIR, "embedding")
+if os.path.isdir(user_defined_llm_dir):
+    for f in os.listdir(user_defined_llm_dir):
+        with codecs.open(os.path.join(user_defined_llm_dir, f), encoding="utf-8") as fd:
+            user_defined_llm_family = CustomEmbeddingModelSpec.parse_obj(json.load(fd))
+            register_embedding(user_defined_llm_family, persist=False)
 del _model_spec_json
 del _model_spec_modelscope_json

xinference/model/embedding/core.py CHANGED Viewed

@@ -220,6 +220,9 @@ class EmbeddingModel:
             ]
             raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+        from ..utils import patch_trust_remote_code
+        patch_trust_remote_code()
         self._model = SentenceTransformer(self._model_path, device=self._device)
     def create_embedding(self, sentences: Union[str, List[str]], **kwargs):

xinference/model/embedding/custom.py CHANGED Viewed

@@ -49,10 +49,6 @@ def register_embedding(model_spec: CustomEmbeddingModelSpec, persist: bool):
             f" or a digit, and can only contain letters, digits, underscores, or dashes."
         )
-    model_uri = model_spec.model_uri
-    if model_uri and not is_valid_model_uri(model_uri):
-        raise ValueError(f"Invalid model URI {model_uri}.")
     with UD_EMBEDDING_LOCK:
         for model_name in (
             list(BUILTIN_EMBEDDING_MODELS.keys())
@@ -67,6 +63,11 @@ def register_embedding(model_spec: CustomEmbeddingModelSpec, persist: bool):
         UD_EMBEDDINGS.append(model_spec)
     if persist:
+        # We only validate model URL when persist is True.
+        model_uri = model_spec.model_uri
+        if model_uri and not is_valid_model_uri(model_uri):
+            raise ValueError(f"Invalid model URI {model_uri}.")
         persist_path = os.path.join(
             XINFERENCE_MODEL_DIR, "embedding", f"{model_spec.model_name}.json"
         )

xinference/model/embedding/model_spec.json CHANGED Viewed

@@ -126,5 +126,21 @@
     "language": ["en"],
     "model_id": "BAAI/bge-large-en-v1.5",
     "model_revision": "5888da4a3a013e65d33dd6f612ecd4625eb87a7d"
+  },
+  {
+    "model_name": "jina-embeddings-v2-small-en",
+    "dimensions": 512,
+    "max_tokens": 8192,
+    "language": ["en"],
+    "model_id": "jinaai/jina-embeddings-v2-small-en",
+    "model_revision": "b811f03af3d4d7ea72a7c25c802b21fc675a5d99"
+  },
+  {
+    "model_name": "jina-embeddings-v2-base-en",
+    "dimensions": 512,
+    "max_tokens": 8192,
+    "language": ["en"],
+    "model_id": "jinaai/jina-embeddings-v2-base-en",
+    "model_revision": "7302ac470bed880590f9344bfeee32ff8722d0e5"
   }
 ]

xinference/model/embedding/model_spec_modelscope.json CHANGED Viewed

@@ -126,5 +126,21 @@
     "language": ["en"],
     "model_id": "Xorbits/bge-large-en-v1.5",
     "model_revision": "v0.0.1"
+  },
+    {
+    "model_name": "jina-embeddings-v2-small-en",
+    "dimensions": 512,
+    "max_tokens": 8192,
+    "language": ["en"],
+    "model_id": "Xorbits/jina-embeddings-v2-small-en",
+    "model_revision": "v0.0.1"
+  },
+  {
+    "model_name": "jina-embeddings-v2-base-en",
+    "dimensions": 512,
+    "max_tokens": 8192,
+    "language": ["en"],
+    "model_id": "Xorbits/jina-embeddings-v2-base-en",
+    "model_revision": "v0.0.1"
   }
 ]

xinference/model/llm/__init__.py CHANGED Viewed

@@ -19,6 +19,7 @@ import os
 from .core import LLM
 from .llm_family import (
     BUILTIN_LLM_FAMILIES,
+    BUILTIN_LLM_PROMPT_STYLE,
     BUILTIN_MODELSCOPE_LLM_FAMILIES,
     LLM_CLASSES,
     GgmlLLMSpecV1,
@@ -89,13 +90,32 @@ def _install():
         os.path.dirname(os.path.abspath(__file__)), "llm_family.json"
     )
     for json_obj in json.load(codecs.open(json_path, "r", encoding="utf-8")):
-        BUILTIN_LLM_FAMILIES.append(LLMFamilyV1.parse_obj(json_obj))
+        model_spec = LLMFamilyV1.parse_obj(json_obj)
+        BUILTIN_LLM_FAMILIES.append(model_spec)
+        # register prompt style
+        if "chat" in model_spec.model_ability and isinstance(
+            model_spec.prompt_style, PromptStyleV1
+        ):
+            # note that the key is the model name,
+            # since there are multiple representations of the same prompt style name in json.
+            BUILTIN_LLM_PROMPT_STYLE[model_spec.model_name] = model_spec.prompt_style
     modelscope_json_path = os.path.join(
         os.path.dirname(os.path.abspath(__file__)), "llm_family_modelscope.json"
     )
     for json_obj in json.load(codecs.open(modelscope_json_path, "r", encoding="utf-8")):
-        BUILTIN_MODELSCOPE_LLM_FAMILIES.append(LLMFamilyV1.parse_obj(json_obj))
+        model_spec = LLMFamilyV1.parse_obj(json_obj)
+        BUILTIN_MODELSCOPE_LLM_FAMILIES.append(model_spec)
+        # register prompt style, in case that we have something missed
+        # if duplicated with huggingface json, keep it as the huggingface style
+        if (
+            "chat" in model_spec.model_ability
+            and isinstance(model_spec.prompt_style, PromptStyleV1)
+            and model_spec.model_name not in BUILTIN_LLM_PROMPT_STYLE
+        ):
+            BUILTIN_LLM_PROMPT_STYLE[model_spec.model_name] = model_spec.prompt_style
     from ...constants import XINFERENCE_MODEL_DIR

xinference/model/llm/core.py CHANGED Viewed

@@ -61,13 +61,13 @@ class LLM(abc.ABC):
     @staticmethod
     def _has_cuda_device():
-        from xorbits._mars.resource import cuda_count
+        from ...utils import cuda_count
         return cuda_count() > 0
     @staticmethod
     def _get_cuda_count():
-        from xorbits._mars.resource import cuda_count
+        from ...utils import cuda_count
         cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES", None)
         if cuda_visible_devices is None:

xinference/model/llm/ggml/chatglm.py CHANGED Viewed

@@ -11,13 +11,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import json
 import logging
 import os
 import time
 import uuid
 from pathlib import Path
-from typing import TYPE_CHECKING, Iterator, List, Optional, Union
+from typing import TYPE_CHECKING, Any, Iterator, List, Optional, Union
 from ....types import (
     ChatCompletion,
@@ -107,7 +107,7 @@ class ChatglmCppChatModel(LLM):
     @staticmethod
     def _convert_raw_text_chunks_to_chat(
-        tokens: Iterator[str], model_name: str
+        tokens: Iterator[Any], model_name: str
     ) -> Iterator[ChatCompletionChunk]:
         yield {
             "id": "chat" + f"cmpl-{str(uuid.uuid4())}",
@@ -124,7 +124,7 @@ class ChatglmCppChatModel(LLM):
                 }
             ],
         }
-        for token in enumerate(tokens):
+        for token in tokens:
             yield {
                 "id": "chat" + f"cmpl-{str(uuid.uuid4())}",
                 "model": model_name,
@@ -134,30 +134,30 @@ class ChatglmCppChatModel(LLM):
                     {
                         "index": 0,
                         "delta": {
-                            "content": token[1],
+                            "content": token
+                            if isinstance(token, str)
+                            else token.content,
                         },
                         "finish_reason": None,
                     }
                 ],
             }
-    @staticmethod
+    @classmethod
     def _convert_raw_text_completion_to_chat(
-        text: str, model_name: str
+        cls, text: Any, model_name: str
     ) -> ChatCompletion:
+        _id = str(uuid.uuid4())
         return {
-            "id": "chat" + f"cmpl-{str(uuid.uuid4())}",
+            "id": "chat" + f"cmpl-{_id}",
             "model": model_name,
             "object": "chat.completion",
             "created": int(time.time()),
             "choices": [
                 {
                     "index": 0,
-                    "message": {
-                        "role": "assistant",
-                        "content": text,
-                    },
-                    "finish_reason": None,
+                    "message": cls._message_to_json_string(_id, text),
+                    "finish_reason": cls._finish_reason_from_msg(text),
                 }
             ],
             "usage": {
@@ -167,6 +167,66 @@ class ChatglmCppChatModel(LLM):
             },
         }
+    @staticmethod
+    def _finish_reason_from_msg(msg):
+        if isinstance(msg, str):
+            return None
+        else:
+            return "tool_calls" if msg.tool_calls else "stop"
+    @staticmethod
+    def _eval_arguments(arguments):
+        def tool_call(**kwargs):
+            return kwargs
+        try:
+            return json.dumps(eval(arguments, dict(tool_call=tool_call)))
+        except Exception:
+            return f"Invalid arguments {arguments}"
+    @classmethod
+    def _message_to_json_string(cls, _id, msg) -> ChatCompletionMessage:
+        if isinstance(msg, str):
+            return {
+                "role": "assistant",
+                "content": msg,
+            }
+        else:
+            return {
+                "role": msg.role,
+                "content": msg.content,
+                "tool_calls": [
+                    {
+                        "id": f"call_{_id}",
+                        "type": tc.type,
+                        "function": {
+                            "name": tc.function.name,
+                            "arguments": cls._eval_arguments(tc.function.arguments),
+                        },
+                    }
+                    for tc in msg.tool_calls
+                ],
+            }
+    @staticmethod
+    def _handle_tools(generate_config) -> Optional[ChatCompletionMessage]:
+        """Convert openai tools to ChatGLM tools."""
+        if generate_config is None:
+            return None
+        tools = generate_config.pop("tools", None)
+        if tools is None:
+            return None
+        chatglm_tools = []
+        for elem in tools:
+            if elem.get("type") != "function" or "function" not in elem:
+                raise ValueError("ChatGLM tools only support function type.")
+            chatglm_tools.append(elem["function"])
+        return {
+            "role": "system",
+            "content": f"Answer the following questions as best as you can. You have access to the following tools:\n"
+            f"{json.dumps(chatglm_tools, indent=4, ensure_ascii=False)}",
+        }
     def chat(
         self,
         prompt: str,
@@ -174,11 +234,15 @@ class ChatglmCppChatModel(LLM):
         generate_config: Optional[ChatglmCppGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         if chat_history is not None:
-            chat_history_list = [message["content"] for message in chat_history]
+            chat_history_list = chat_history
         else:
             chat_history_list = []
-        chat_history_list.append(prompt)
+        tool_message = self._handle_tools(generate_config)
+        if tool_message is not None:
+            chat_history_list.insert(0, tool_message)
+        chat_history_list.append({"role": "user", "content": prompt})
         logger.debug("Full conversation history:\n%s", str(chat_history_list))
         generate_config = self._sanitize_generate_config(generate_config)

xinference/model/llm/ggml/llamacpp.py CHANGED Viewed

@@ -68,7 +68,7 @@ class LlamaCppModel(LLM):
         self._llm = None
     def _can_apply_metal(self):
-        return self.quantization in ["q4_0", "q4_1"]
+        return self.quantization.lower() in ["q4_0", "q4_1", "q4_k_s", "q4_k_m"]
     def _can_apply_cublas(self):
         # TODO: figure out the quantizations supported.
@@ -189,7 +189,7 @@ class LlamaCppModel(LLM):
         try:
             self._llm = Llama(
                 model_path=model_path,
-                verbose=False,
+                verbose=True,
                 **self._llamacpp_model_config,
             )
         except AssertionError:

xinference/model/llm/llm_family.json CHANGED Viewed

@@ -512,7 +512,7 @@
           "none"
         ],
         "model_id": "THUDM/chatglm3-6b",
-        "model_revision": "fc3235f807ef5527af598c05f04f2ffd17f48bab"
+        "model_revision": "e46a14881eae613281abbd266ee918e93a56018f"
       }
     ],
     "prompt_style": {
@@ -1136,6 +1136,17 @@
         "model_id": "Qwen/Qwen-14B-Chat",
         "model_revision": "fab8385c8f7e7980ef61944729fe134ccbbca263"
       },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 72,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen-72B-Chat",
+        "model_revision": "2cd9f76279337941ec1a4abeec6f8eb3c38d0f55"
+      },
       {
         "model_format": "gptq",
         "model_size_in_billions": 7,
@@ -1153,6 +1164,15 @@
           "Int8"
         ],
         "model_id": "Qwen/Qwen-14B-Chat-{quantization}"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 72,
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "Qwen/Qwen-72B-Chat-{quantization}"
       }
     ],
     "prompt_style": {
@@ -1164,7 +1184,14 @@
       ],
       "intra_message_sep": "\n",
       "stop_token_ids": [
-        151643
+        151643,
+        151644,
+        151645
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|im_start|>",
+        "<|im_end|>"
       ]
     }
   },
@@ -2077,7 +2104,7 @@
     "model_ability": [
       "generate"
     ],
-    "model_description": "The Yi series models are large language models trained from scratch by developers at 01.AI. The first public release contains two bilingual (English/Chinese) base models with the parameter sizes of 6B and 34B. Both of them are trained with 4K sequence length and can be extended to 32K during inference time.",
+    "model_description": "The Yi series models are large language models trained from scratch by developers at 01.AI.",
     "model_specs": [
       {
         "model_format": "ggufv2",
@@ -2134,7 +2161,7 @@
     "model_ability": [
       "generate"
     ],
-    "model_description": "The Yi series models are large language models trained from scratch by developers at 01.AI. The first public release contains two bilingual (English/Chinese) base models with the parameter sizes of 6B and 34B. Both of them are trained with 4K sequence length and can be extended to 32K during inference time.",
+    "model_description": "The Yi series models are large language models trained from scratch by developers at 01.AI.",
     "model_specs": [
       {
         "model_format": "pytorch",
@@ -2160,6 +2187,74 @@
       }
     ]
   },
+  {
+    "version": 1,
+    "context_length": 204800,
+    "model_name": "Yi-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "The Yi series models are large language models trained from scratch by developers at 01.AI.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 34,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "01-ai/Yi-34B-Chat",
+        "model_revision": "a99ec35331cbfc9da596af7d4538fe2efecff03c"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 34,
+        "quantizations": [
+          "Q2_K",
+          "Q3_K_L",
+          "Q3_K_M",
+          "Q3_K_S",
+          "Q4_0",
+          "Q4_K_M",
+          "Q4_K_S",
+          "Q5_0",
+          "Q5_K_M",
+          "Q5_K_S",
+          "Q6_K",
+          "Q8_0"
+        ],
+        "model_id": "TheBloke/Yi-34B-Chat-GGUF",
+        "model_file_name_template": "yi-34b-chat.{quantization}.gguf"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "CHATML",
+      "system_prompt": "",
+      "roles": [
+        "<|im_start|>user",
+        "<|im_start|>assistant"
+      ],
+      "intra_message_sep": "<|im_end|>",
+      "inter_message_sep": "",
+      "stop_token_ids": [
+        2,
+        6,
+        7,
+        8
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|im_start|>",
+        "<|im_end|>",
+        "<|im_sep|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 2048,

xinference/model/llm/llm_family.py CHANGED Viewed

@@ -17,9 +17,13 @@ import os
 import platform
 import shutil
 from threading import Lock
-from typing import List, Optional, Tuple, Type, Union
+from typing import Any, Dict, List, Optional, Tuple, Type, Union
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, Protocol, ValidationError
+from pydantic.error_wrappers import ErrorWrapper
+from pydantic.parse import load_str_bytes
+from pydantic.types import StrBytes
+from pydantic.utils import ROOT_KEY
 from typing_extensions import Annotated, Literal
 from ...constants import XINFERENCE_CACHE_DIR, XINFERENCE_MODEL_DIR
@@ -36,6 +40,7 @@ from . import LLM
 logger = logging.getLogger(__name__)
 DEFAULT_CONTEXT_LENGTH = 2048
+BUILTIN_LLM_PROMPT_STYLE: Dict[str, "PromptStyleV1"] = {}
 class GgmlLLMSpecV1(BaseModel):
@@ -80,12 +85,52 @@ class LLMFamilyV1(BaseModel):
     prompt_style: Optional["PromptStyleV1"]
+class CustomLLMFamilyV1(LLMFamilyV1):
+    prompt_style: Optional[Union["PromptStyleV1", str]]  # type: ignore
+    @classmethod
+    def parse_raw(
+        cls: Any,
+        b: StrBytes,
+        *,
+        content_type: Optional[str] = None,
+        encoding: str = "utf8",
+        proto: Protocol = None,
+        allow_pickle: bool = False,
+    ) -> LLMFamilyV1:
+        # See source code of BaseModel.parse_raw
+        try:
+            obj = load_str_bytes(
+                b,
+                proto=proto,
+                content_type=content_type,
+                encoding=encoding,
+                allow_pickle=allow_pickle,
+                json_loads=cls.__config__.json_loads,
+            )
+        except (ValueError, TypeError, UnicodeDecodeError) as e:
+            raise ValidationError([ErrorWrapper(e, loc=ROOT_KEY)], cls)
+        llm_spec = cls.parse_obj(obj)
+        # handle prompt style when user choose existing style
+        if llm_spec.prompt_style is not None and isinstance(llm_spec.prompt_style, str):
+            prompt_style_name = llm_spec.prompt_style
+            if prompt_style_name not in BUILTIN_LLM_PROMPT_STYLE:
+                raise ValueError(
+                    f"Xinference does not support the prompt style name: {prompt_style_name}"
+                )
+            llm_spec.prompt_style = BUILTIN_LLM_PROMPT_STYLE[prompt_style_name]
+        return llm_spec
 LLMSpecV1 = Annotated[
     Union[GgmlLLMSpecV1, PytorchLLMSpecV1],
     Field(discriminator="model_format"),
 ]
 LLMFamilyV1.update_forward_refs()
+CustomLLMFamilyV1.update_forward_refs()
 LLM_CLASSES: List[Type[LLM]] = []
@@ -580,7 +625,7 @@ def _is_linux():
 def _has_cuda_device():
     # `cuda_count` method already contains the logic for the
     # number of GPUs specified by `CUDA_VISIBLE_DEVICES`.
-    from xorbits._mars.resource import cuda_count
+    from ...utils import cuda_count
     return cuda_count() > 0
@@ -677,11 +722,6 @@ def register_llm(llm_family: LLMFamilyV1, persist: bool):
             f" or a digit, and can only contain letters, digits, underscores, or dashes."
         )
-    for spec in llm_family.model_specs:
-        model_uri = spec.model_uri
-        if model_uri and not is_valid_model_uri(model_uri):
-            raise ValueError(f"Invalid model URI {model_uri}.")
     with UD_LLM_FAMILIES_LOCK:
         for family in BUILTIN_LLM_FAMILIES + UD_LLM_FAMILIES:
             if llm_family.model_name == family.model_name:
@@ -692,6 +732,12 @@ def register_llm(llm_family: LLMFamilyV1, persist: bool):
         UD_LLM_FAMILIES.append(llm_family)
     if persist:
+        # We only validate model URL when persist is True.
+        for spec in llm_family.model_specs:
+            model_uri = spec.model_uri
+            if model_uri and not is_valid_model_uri(model_uri):
+                raise ValueError(f"Invalid model URI {model_uri}.")
         persist_path = os.path.join(
             XINFERENCE_MODEL_DIR, "llm", f"{llm_family.model_name}.json"
         )

xinference 0.6.4__py3-none-any.whl → 0.7.0__py3-none-any.whl

Potentially problematic release.

xinference 0.6.4__py3-none-any.whl → 0.7.0__py3-none-any.whl