xinference-0.8.1-py3-none-any.whl → xinference-0.8.3-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release.
Files changed (95)
  1. xinference/_version.py +3 -3
  2. xinference/api/oauth2/auth_service.py +132 -0
  3. xinference/api/restful_api.py +282 -78
  4. xinference/client/handlers.py +3 -0
  5. xinference/client/restful/restful_client.py +108 -75
  6. xinference/constants.py +14 -4
  7. xinference/core/cache_tracker.py +102 -0
  8. xinference/core/chat_interface.py +10 -4
  9. xinference/core/event.py +56 -0
  10. xinference/core/model.py +44 -0
  11. xinference/core/resource.py +19 -12
  12. xinference/core/status_guard.py +4 -0
  13. xinference/core/supervisor.py +278 -87
  14. xinference/core/utils.py +68 -3
  15. xinference/core/worker.py +98 -8
  16. xinference/deploy/cmdline.py +6 -3
  17. xinference/deploy/local.py +2 -2
  18. xinference/deploy/supervisor.py +2 -2
  19. xinference/model/audio/__init__.py +27 -0
  20. xinference/model/audio/core.py +161 -0
  21. xinference/model/audio/model_spec.json +79 -0
  22. xinference/model/audio/utils.py +18 -0
  23. xinference/model/audio/whisper.py +132 -0
  24. xinference/model/core.py +18 -13
  25. xinference/model/embedding/__init__.py +27 -2
  26. xinference/model/embedding/core.py +43 -3
  27. xinference/model/embedding/model_spec.json +24 -0
  28. xinference/model/embedding/model_spec_modelscope.json +24 -0
  29. xinference/model/embedding/utils.py +18 -0
  30. xinference/model/image/__init__.py +12 -1
  31. xinference/model/image/core.py +63 -9
  32. xinference/model/image/utils.py +26 -0
  33. xinference/model/llm/__init__.py +20 -1
  34. xinference/model/llm/core.py +43 -2
  35. xinference/model/llm/ggml/chatglm.py +15 -6
  36. xinference/model/llm/llm_family.json +197 -6
  37. xinference/model/llm/llm_family.py +9 -7
  38. xinference/model/llm/llm_family_modelscope.json +189 -4
  39. xinference/model/llm/pytorch/chatglm.py +3 -3
  40. xinference/model/llm/pytorch/core.py +4 -2
  41. xinference/model/{multimodal → llm/pytorch}/qwen_vl.py +10 -8
  42. xinference/model/llm/pytorch/utils.py +21 -9
  43. xinference/model/llm/pytorch/yi_vl.py +246 -0
  44. xinference/model/llm/utils.py +57 -4
  45. xinference/model/llm/vllm/core.py +5 -4
  46. xinference/model/rerank/__init__.py +25 -2
  47. xinference/model/rerank/core.py +51 -9
  48. xinference/model/rerank/model_spec.json +6 -0
  49. xinference/model/rerank/model_spec_modelscope.json +7 -0
  50. xinference/{api/oauth2/common.py → model/rerank/utils.py} +6 -2
  51. xinference/model/utils.py +5 -3
  52. xinference/thirdparty/__init__.py +0 -0
  53. xinference/thirdparty/llava/__init__.py +1 -0
  54. xinference/thirdparty/llava/conversation.py +205 -0
  55. xinference/thirdparty/llava/mm_utils.py +122 -0
  56. xinference/thirdparty/llava/model/__init__.py +1 -0
  57. xinference/thirdparty/llava/model/clip_encoder/__init__.py +0 -0
  58. xinference/thirdparty/llava/model/clip_encoder/builder.py +11 -0
  59. xinference/thirdparty/llava/model/clip_encoder/clip_encoder.py +86 -0
  60. xinference/thirdparty/llava/model/constants.py +6 -0
  61. xinference/thirdparty/llava/model/llava_arch.py +385 -0
  62. xinference/thirdparty/llava/model/llava_llama.py +163 -0
  63. xinference/thirdparty/llava/model/multimodal_projector/__init__.py +0 -0
  64. xinference/thirdparty/llava/model/multimodal_projector/builder.py +64 -0
  65. xinference/types.py +1 -1
  66. xinference/web/ui/build/asset-manifest.json +3 -3
  67. xinference/web/ui/build/index.html +1 -1
  68. xinference/web/ui/build/static/js/main.15822aeb.js +3 -0
  69. xinference/web/ui/build/static/js/main.15822aeb.js.map +1 -0
  70. xinference/web/ui/node_modules/.cache/babel-loader/139e5e4adf436923107d2b02994c7ff6dba2aac1989e9b6638984f0dfe782c4a.json +1 -0
  71. xinference/web/ui/node_modules/.cache/babel-loader/52aa27272b4b9968f62666262b47661cb1992336a2aff3b13994cc36877b3ec3.json +1 -0
  72. xinference/web/ui/node_modules/.cache/babel-loader/64accc515dc6cd584a2873796cd7da6f93de57f7e465eb5423cca9a2f3fe3eff.json +1 -0
  73. xinference/web/ui/node_modules/.cache/babel-loader/65ca3ba225b8c8dac907210545b51f2fcdb2591f0feeb7195f1c037f2bc956a0.json +1 -0
  74. xinference/web/ui/node_modules/.cache/babel-loader/b80db1012318b97c329c4e3e72454f7512fb107e57c444b437dbe4ba1a3faa5a.json +1 -0
  75. {xinference-0.8.1.dist-info → xinference-0.8.3.dist-info}/METADATA +33 -23
  76. {xinference-0.8.1.dist-info → xinference-0.8.3.dist-info}/RECORD +81 -64
  77. xinference/api/oauth2/core.py +0 -93
  78. xinference/model/multimodal/__init__.py +0 -52
  79. xinference/model/multimodal/core.py +0 -467
  80. xinference/model/multimodal/model_spec.json +0 -43
  81. xinference/model/multimodal/model_spec_modelscope.json +0 -45
  82. xinference/web/ui/build/static/js/main.b83095c2.js +0 -3
  83. xinference/web/ui/build/static/js/main.b83095c2.js.map +0 -1
  84. xinference/web/ui/node_modules/.cache/babel-loader/101923c539819f26ad11fbcbd6f6e56436b285efbb090dcc7dd648c6e924c4a8.json +0 -1
  85. xinference/web/ui/node_modules/.cache/babel-loader/4942da6bc03bf7373af068e22f916341aabc5b5df855d73c1d348c696724ce37.json +0 -1
  86. xinference/web/ui/node_modules/.cache/babel-loader/52a6136cb2dbbf9c51d461724d9b283ebe74a73fb19d5df7ba8e13c42bd7174d.json +0 -1
  87. xinference/web/ui/node_modules/.cache/babel-loader/71493aadd34d568fbe605cacaba220aa69bd09273251ee4ba27930f8d01fccd8.json +0 -1
  88. xinference/web/ui/node_modules/.cache/babel-loader/8b071db2a5a9ef68dc14d5f606540bd23d9785e365a11997c510656764d2dccf.json +0 -1
  89. xinference/web/ui/node_modules/.cache/babel-loader/a4d72d3b806ba061919115f0c513738726872e3c79cf258f007519d3f91d1a16.json +0 -1
  90. xinference/web/ui/node_modules/.cache/babel-loader/f037ffef5992af0892d6d991053c1dace364cd39a3f11f1a41f92776e8a59459.json +0 -1
  91. /xinference/web/ui/build/static/js/{main.b83095c2.js.LICENSE.txt → main.15822aeb.js.LICENSE.txt} +0 -0
  92. {xinference-0.8.1.dist-info → xinference-0.8.3.dist-info}/LICENSE +0 -0
  93. {xinference-0.8.1.dist-info → xinference-0.8.3.dist-info}/WHEEL +0 -0
  94. {xinference-0.8.1.dist-info → xinference-0.8.3.dist-info}/entry_points.txt +0 -0
  95. {xinference-0.8.1.dist-info → xinference-0.8.3.dist-info}/top_level.txt +0 -0

xinference/model/llm/core.py
@@ -17,7 +17,8 @@ import logging
 import os
 import platform
 from abc import abstractmethod
-from typing import TYPE_CHECKING, List, Optional, Tuple, Union
+from collections import defaultdict
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
 
 from ...core.utils import parse_replica_model_uid
 from ..core import ModelDescription
@@ -28,6 +29,15 @@ if TYPE_CHECKING:
 logger = logging.getLogger(__name__)
 
 
+LLM_MODEL_DESCRIPTIONS: Dict[str, List[Dict]] = defaultdict(list)
+
+
+def get_llm_model_descriptions():
+    import copy
+
+    return copy.deepcopy(LLM_MODEL_DESCRIPTIONS)
+
+
 class LLM(abc.ABC):
     def __init__(
         self,
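
The new module-level registry pairs a defaultdict(list) with an accessor that returns a deep copy, so callers can inspect registered model descriptions without mutating shared state. A minimal sketch of the pattern (the register/snapshot names are illustrative, not from the diff):

import copy
from collections import defaultdict
from typing import Dict, List

DESCRIPTIONS: Dict[str, List[Dict]] = defaultdict(list)

def register(name: str, info: Dict) -> None:
    # Writers append to the shared registry in place.
    DESCRIPTIONS[name].append(info)

def snapshot() -> Dict[str, List[Dict]]:
    # Readers get a deep copy; mutating it cannot corrupt the registry.
    return copy.deepcopy(DESCRIPTIONS)

register("my-llm", {"model_format": "pytorch", "quantization": "none"})
view = snapshot()
view["my-llm"].clear()          # only the copy changes
assert DESCRIPTIONS["my-llm"]   # the registry is intact
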
@@ -107,8 +117,9 @@ class LLMDescription(ModelDescription):
         llm_family: "LLMFamilyV1",
         llm_spec: "LLMSpecV1",
         quantization: Optional[str],
+        model_path: Optional[str] = None,
     ):
-        super().__init__(address, devices)
+        super().__init__(address, devices, model_path=model_path)
         self._llm_family = llm_family
         self._llm_spec = llm_spec
         self._quantization = quantization
@@ -124,12 +135,42 @@ class LLMDescription(ModelDescription):
             "model_description": self._llm_family.model_description,
             "model_format": self._llm_spec.model_format,
             "model_size_in_billions": self._llm_spec.model_size_in_billions,
+            "model_family": self._llm_family.model_family
+            or self._llm_family.model_name,
             "quantization": self._quantization,
             "model_hub": self._llm_spec.model_hub,
             "revision": self._llm_spec.model_revision,
             "context_length": self._llm_family.context_length,
         }
 
+    def to_version_info(self):
+        from .utils import get_file_location, get_model_version
+
+        model_file_location, cache_status = get_file_location(
+            self._llm_family, self._llm_spec, self._quantization
+        )
+
+        return {
+            "model_version": get_model_version(
+                self._llm_family, self._llm_spec, self._quantization
+            ),
+            "model_file_location": model_file_location,
+            "cache_status": cache_status,
+            "quantization": self._quantization,
+            "model_format": self._llm_spec.model_format,
+            "model_size_in_billions": self._llm_spec.model_size_in_billions,
+        }
+
+
+def generate_llm_description(llm_family: "LLMFamilyV1") -> Dict[str, List[Dict]]:
+    res = defaultdict(list)
+    for spec in llm_family.model_specs:
+        for q in spec.quantizations:
+            res[llm_family.model_name].append(
+                LLMDescription(None, None, llm_family, spec, q).to_version_info()
+            )
+    return res
+
 
 def create_llm_model_instance(
     subpool_addr: str,
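
generate_llm_description walks every (spec, quantization) pair in a family and collects the to_version_info() dicts under the family's model name; note it builds LLMDescription(None, None, ...), i.e. without an address or devices, since only version metadata is needed. Assuming a hypothetical family with a single 7-billion ggufv2 spec quantized as q4_0, the output would be shaped roughly like this (the paths and the exact version-string format are illustrative):

expected = {
    "my-llm": [
        {
            "model_version": "my-llm--7B--ggufv2--q4_0",
            "model_file_location": "/home/user/.xinference/cache/my-llm-ggufv2-7b",
            "cache_status": False,
            "quantization": "q4_0",
            "model_format": "ggufv2",
            "model_size_in_billions": 7,
        }
    ]
}
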

xinference/model/llm/ggml/chatglm.py
@@ -230,20 +230,28 @@ class ChatglmCppChatModel(LLM):
             ),
         }
 
+    @staticmethod
+    def _to_chatglm_chat_messages(history_list: List[Any]):
+        from chatglm_cpp import ChatMessage
+
+        return [ChatMessage(role=v["role"], content=v["content"]) for v in history_list]
+
     def chat(
         self,
         prompt: str,
+        system_prompt: Optional[str] = None,
         chat_history: Optional[List[ChatCompletionMessage]] = None,
         generate_config: Optional[ChatglmCppGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+        chat_history_list = []
+        if system_prompt is not None:
+            chat_history_list.append({"role": "system", "content": system_prompt})
         if chat_history is not None:
-            chat_history_list = chat_history
-        else:
-            chat_history_list = []
+            chat_history_list.extend(chat_history)  # type: ignore
 
         tool_message = self._handle_tools(generate_config)
         if tool_message is not None:
-            chat_history_list.insert(0, tool_message)
+            chat_history_list.insert(0, tool_message)  # type: ignore
 
         # We drop the message which contains tool calls to walkaround the issue:
         # https://github.com/li-plus/chatglm.cpp/issues/231
@@ -276,17 +284,18 @@ class ChatglmCppChatModel(LLM):
         params = {k: v for k, v in params.items() if v is not None}
 
         assert self._llm is not None
+        chat_history_messages = self._to_chatglm_chat_messages(chat_history_list)
 
         if generate_config["stream"]:
             it = self._llm.chat(
-                chat_history_list,
+                chat_history_messages,
                 **params,
             )
             assert not isinstance(it, str)
             return self._convert_raw_text_chunks_to_chat(it, self.model_uid)
         else:
             c = self._llm.chat(
-                chat_history_list,
+                chat_history_messages,
                 **params,
             )
             assert not isinstance(c, Iterator)
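
Recent chatglm-cpp releases expect ChatMessage objects rather than the plain role/content dicts xinference tracks internally, which is the gap _to_chatglm_chat_messages bridges. A short sketch of the conversion (assuming chatglm-cpp is installed; ChatMessage(role=..., content=...) is the constructor used in the hunk above):

from chatglm_cpp import ChatMessage

history = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello"},
    {"role": "assistant", "content": "Hi, how can I help?"},
]

# One ChatMessage per history entry, converted at the last moment.
messages = [ChatMessage(role=m["role"], content=m["content"]) for m in history]

Deferring the conversion until just before self._llm.chat(...) lets the system prompt and the tool message be spliced into the history with ordinary list operations.
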

xinference/model/llm/llm_family.json
@@ -2361,6 +2361,15 @@
             "model_id": "mistralai/Mixtral-8x7B-Instruct-v0.1",
             "model_revision": "125c431e2ff41a156b9f9076f744d2f35dd6e67a"
         },
+        {
+            "model_format": "awq",
+            "model_size_in_billions": "46_7",
+            "quantizations": [
+                "4-bit"
+            ],
+            "model_id": "TheBloke/Mixtral-8x7B-Instruct-v0.1-AWQ",
+            "model_revision": "9afb6f0a7d7fe9ecebdda1baa4ff4e13e73e97d7"
+        },
         {
             "model_format": "ggufv2",
             "model_size_in_billions": "46_7",
@@ -3184,7 +3193,7 @@
                 "none"
             ],
             "model_id": "internlm/internlm2-chat-7b",
-            "model_revision": "5797f79825bab7013932d57e2babaac1b8de6b4f"
+            "model_revision": "2292b86b21cb856642782cebed0a453997453b1f"
         },
         {
             "model_format": "pytorch",
@@ -3193,22 +3202,204 @@
                 "none"
             ],
             "model_id": "internlm/internlm2-chat-20b",
-            "model_revision": "3ccaf3ae82d5d01c0a95eecf40ee550f9c543635"
+            "model_revision": "b666125047cd98c5a7c85ca28720b44a06aed124"
         }
     ],
     "prompt_style": {
         "style_name": "INTERNLM2",
         "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).",
         "roles": [
-            "[UNUSED_TOKEN_146]user",
-            "[UNUSED_TOKEN_146]assistant"
+            "<|im_start|>user",
+            "<|im_start|>assistant"
         ],
-        "intra_message_sep": "[UNUSED_TOKEN_145]",
+        "intra_message_sep": "<|im_end|>",
         "stop_token_ids": [
             92542
         ],
         "stop": [
-            "[UNUSED_TOKEN_145]"
+            "<|im_end|>"
+        ]
+    }
+},
+{
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "qwen-vl-chat",
+    "model_lang": [
+        "en",
+        "zh"
+    ],
+    "model_ability": [
+        "chat",
+        "vision"
+    ],
+    "model_description": "Qwen-VL-Chat supports more flexible interaction, such as multiple image inputs, multi-round question answering, and creative capabilities.",
+    "model_specs": [
+        {
+            "model_format": "pytorch",
+            "model_size_in_billions": 7,
+            "quantizations": [
+                "none"
+            ],
+            "model_id": "Qwen/Qwen-VL-Chat",
+            "model_revision": "6665c780ade5ff3f08853b4262dcb9c8f9598d42"
+        },
+        {
+            "model_format": "gptq",
+            "model_size_in_billions": 7,
+            "quantizations": [
+                "Int4"
+            ],
+            "model_id": "Qwen/Qwen-VL-Chat-{quantization}",
+            "model_revision": "5d3a5aa033ed2c502300d426c81cc5b13bcd1409"
+        }
+    ],
+    "prompt_style": {
+        "style_name": "QWEN",
+        "system_prompt": "You are a helpful assistant.",
+        "roles": [
+            "user",
+            "assistant"
+        ]
+    }
+},
+{
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "orion-chat",
+    "model_lang": [
+        "en",
+        "zh"
+    ],
+    "model_ability": [
+        "chat"
+    ],
+    "model_description": "Orion-14B series models are open-source multilingual large language models trained from scratch by OrionStarAI.",
+    "model_specs": [
+        {
+            "model_format": "pytorch",
+            "model_size_in_billions": 14,
+            "quantizations": [
+                "none",
+                "4-bit",
+                "8-bit"
+            ],
+            "model_id": "OrionStarAI/Orion-14B-Chat",
+            "model_revision": "ea6fb9b7e1917f3693935accbeb0bfecfd6552a7"
+        },
+        {
+            "model_format": "awq",
+            "model_size_in_billions": 14,
+            "quantizations": [
+                "Int4"
+            ],
+            "model_id": "OrionStarAI/Orion-14B-Chat-{quantization}"
+        }
+    ],
+    "prompt_style": {
+        "style_name": "orion",
+        "roles": [
+            "Human",
+            "assistant"
+        ],
+        "stop": [
+            "<s>",
+            "</s>",
+            "<unk>"
+        ]
+    }
+},
+{
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "orion-chat-rag",
+    "model_lang": [
+        "en",
+        "zh"
+    ],
+    "model_ability": [
+        "chat"
+    ],
+    "model_description": "Orion-14B series models are open-source multilingual large language models trained from scratch by OrionStarAI.",
+    "model_specs": [
+        {
+            "model_format": "pytorch",
+            "model_size_in_billions": 14,
+            "quantizations": [
+                "none",
+                "4-bit",
+                "8-bit"
+            ],
+            "model_id": "OrionStarAI/Orion-14B-Chat-RAG",
+            "model_revision": "eba2e20808407fb431a76b90d5d506e04a0325f2"
+        }
+    ],
+    "prompt_style": {
+        "style_name": "orion",
+        "roles": [
+            "Human",
+            "assistant"
+        ],
+        "stop": [
+            "<s>",
+            "</s>",
+            "<unk>"
+        ]
+    }
+},
+{
+    "version": 1,
+    "context_length": 204800,
+    "model_name": "yi-vl-chat",
+    "model_lang": [
+        "en",
+        "zh"
+    ],
+    "model_ability": [
+        "chat",
+        "vision"
+    ],
+    "model_description": "Yi Vision Language (Yi-VL) model is the open-source, multimodal version of the Yi Large Language Model (LLM) series, enabling content comprehension, recognition, and multi-round conversations about images.",
+    "model_specs": [
+        {
+            "model_format": "pytorch",
+            "model_size_in_billions": 6,
+            "quantizations": [
+                "none"
+            ],
+            "model_id": "01-ai/Yi-VL-6B",
+            "model_revision": "897c938da1ec860330e2ba2d425ab3004495ba38"
+        },
+        {
+            "model_format": "pytorch",
+            "model_size_in_billions": 34,
+            "quantizations": [
+                "none"
+            ],
+            "model_id": "01-ai/Yi-VL-34B",
+            "model_revision": "ea29a9a430f27893e780366dae81d4ca5ebab561"
+        }
+    ],
+    "prompt_style": {
+        "style_name": "CHATML",
+        "system_prompt": "",
+        "roles": [
+            "<|im_start|>user",
+            "<|im_start|>assistant"
+        ],
+        "intra_message_sep": "<|im_end|>",
+        "inter_message_sep": "",
+        "stop_token_ids": [
+            2,
+            6,
+            7,
+            8
+        ],
+        "stop": [
+            "<|endoftext|>",
+            "<|im_start|>",
+            "<|im_end|>",
+            "<|im_sep|>"
         ]
     }
 }
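
The INTERNLM2 entries replace the placeholder [UNUSED_TOKEN_146]/[UNUSED_TOKEN_145] names with the ChatML-style <|im_start|>/<|im_end|> tokens they alias in the tokenizer; the stop token id 92542 is unchanged. As a rough sketch of how roles, intra_message_sep, and system_prompt combine into a prompt under a ChatML-like style (this renderer is illustrative; xinference's real one lives in its prompt-style utilities):

from typing import Dict, List

def render_chatml(system_prompt: str, roles: List[str], sep: str,
                  history: List[Dict[str, str]], prompt: str) -> str:
    # roles[0] is the user tag, roles[1] the assistant tag,
    # e.g. "<|im_start|>user" and "<|im_start|>assistant".
    parts = []
    if system_prompt:
        parts.append(f"<|im_start|>system\n{system_prompt}{sep}\n")
    for m in history:
        tag = roles[0] if m["role"] == "user" else roles[1]
        parts.append(f"{tag}\n{m['content']}{sep}\n")
    # The trailing assistant tag invites the model to generate the reply.
    parts.append(f"{roles[0]}\n{prompt}{sep}\n{roles[1]}\n")
    return "".join(parts)

print(render_chatml(
    "You are InternLM...",
    ["<|im_start|>user", "<|im_start|>assistant"],
    "<|im_end|>",
    [],
    "Hello!",
))
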

xinference/model/llm/llm_family.py
@@ -70,7 +70,7 @@ class GgmlLLMSpecV1(BaseModel):
 
 
 class PytorchLLMSpecV1(BaseModel):
-    model_format: Literal["pytorch", "gptq"]
+    model_format: Literal["pytorch", "gptq", "awq"]
     # Must in order that `str` first, then `int`
     model_size_in_billions: Union[str, int]
     quantizations: List[str]
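
With "awq" added to the Literal, pydantic now accepts AWQ specs at registration time while still rejecting unknown formats. A minimal sketch of that validation behavior (pydantic v1 semantics assumed; the class name here is made up):

from typing import List, Literal, Union
from pydantic import BaseModel, ValidationError

class SpecSketch(BaseModel):
    model_format: Literal["pytorch", "gptq", "awq"]
    model_size_in_billions: Union[str, int]
    quantizations: List[str]

SpecSketch(model_format="awq", model_size_in_billions="46_7", quantizations=["Int4"])  # ok

try:
    SpecSketch(model_format="exl2", model_size_in_billions=7, quantizations=[])
except ValidationError as exc:
    print(exc)  # unexpected value; permitted: 'pytorch', 'gptq', 'awq'
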
@@ -106,7 +106,7 @@ class LLMFamilyV1(BaseModel):
     context_length: Optional[int] = DEFAULT_CONTEXT_LENGTH
     model_name: str
     model_lang: List[str]
-    model_ability: List[Literal["embed", "generate", "chat", "tools"]]
+    model_ability: List[Literal["embed", "generate", "chat", "tools", "vision"]]
     model_description: Optional[str]
     # reason for not required str here: legacy registration
     model_family: Optional[str]
@@ -212,6 +212,8 @@ UD_LLM_FAMILIES: List["LLMFamilyV1"] = []
 
 UD_LLM_FAMILIES_LOCK = Lock()
 
+LLM_LAUNCH_VERSIONS: Dict[str, List[str]] = {}
+
 
 def download_from_self_hosted_storage() -> bool:
     from ...constants import XINFERENCE_ENV_MODEL_SRC
@@ -449,7 +451,7 @@ def _get_meta_path(
             return os.path.join(cache_dir, "__valid_download")
         else:
             return os.path.join(cache_dir, f"__valid_download_{model_hub}")
-    elif model_format in ["ggmlv3", "ggufv2", "gptq"]:
+    elif model_format in ["ggmlv3", "ggufv2", "gptq", "awq"]:
         assert quantization is not None
         if model_hub == "huggingface":
             return os.path.join(cache_dir, f"__valid_download_{quantization}")
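
Because these formats cache one artifact per quantization, the download marker file is also written per quantization; that is how _skip_download below can tell a cached 4-bit build apart from a missing one. A sketch reproducing only the branch visible in this hunk (the cache path is illustrative):

import os

def meta_path_sketch(cache_dir: str, model_format: str,
                     model_hub: str, quantization: str) -> str:
    # Mirrors the huggingface branch shown above; other hubs are elided here.
    assert model_format in ["ggmlv3", "ggufv2", "gptq", "awq"]
    assert quantization is not None
    if model_hub == "huggingface":
        return os.path.join(cache_dir, f"__valid_download_{quantization}")
    raise NotImplementedError("non-huggingface hubs omitted from this sketch")

print(meta_path_sketch("/cache/mixtral-instruct-awq", "awq", "huggingface", "4-bit"))
# -> /cache/mixtral-instruct-awq/__valid_download_4-bit
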
@@ -487,7 +489,7 @@ def _skip_download(
             logger.warning(f"Cache {cache_dir} exists, but it was from {hub}")
             return True
         return False
-    elif model_format in ["ggmlv3", "ggufv2", "gptq"]:
+    elif model_format in ["ggmlv3", "ggufv2", "gptq", "awq"]:
         assert quantization is not None
         return os.path.exists(
             _get_meta_path(cache_dir, model_format, model_hub, quantization)
@@ -535,7 +537,7 @@ def cache_from_modelscope(
     ):
         return cache_dir
 
-    if llm_spec.model_format in ["pytorch", "gptq"]:
+    if llm_spec.model_format in ["pytorch", "gptq", "awq"]:
         download_dir = retry_download(
             snapshot_download,
             llm_family.model_name,
@@ -596,7 +598,7 @@ def cache_from_huggingface(
     ):
         return cache_dir
 
-    if llm_spec.model_format in ["pytorch", "gptq"]:
+    if llm_spec.model_format in ["pytorch", "gptq", "awq"]:
         assert isinstance(llm_spec, PytorchLLMSpecV1)
         retry_download(
             huggingface_hub.snapshot_download,
@@ -677,7 +679,7 @@ def get_cache_status(
         ]
         return any(revisions)
     # just check meta file for ggml and gptq model
-    elif llm_spec.model_format in ["ggmlv3", "ggufv2", "gptq"]:
+    elif llm_spec.model_format in ["ggmlv3", "ggufv2", "gptq", "awq"]:
         ret = []
         for q in llm_spec.quantizations:
             assert q is not None

xinference/model/llm/llm_family_modelscope.json
@@ -1817,15 +1817,200 @@
         "style_name": "INTERNLM2",
         "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).",
         "roles": [
-            "[UNUSED_TOKEN_146]user",
-            "[UNUSED_TOKEN_146]assistant"
+            "<|im_start|>user",
+            "<|im_start|>assistant"
         ],
-        "intra_message_sep": "[UNUSED_TOKEN_145]",
+        "intra_message_sep": "<|im_end|>",
         "stop_token_ids": [
             92542
         ],
         "stop": [
-            "[UNUSED_TOKEN_145]"
+            "<|im_end|>"
+        ]
+    }
+},
+{
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "qwen-vl-chat",
+    "model_lang": [
+        "en",
+        "zh"
+    ],
+    "model_ability": [
+        "chat",
+        "vision"
+    ],
+    "model_description": "Qwen-VL-Chat supports more flexible interaction, such as multiple image inputs, multi-round question answering, and creative capabilities.",
+    "model_specs": [
+        {
+            "model_format": "pytorch",
+            "model_size_in_billions": 7,
+            "quantizations": [
+                "none"
+            ],
+            "model_hub": "modelscope",
+            "model_id": "Qwen/Qwen-VL-Chat",
+            "model_revision": "master"
+        },
+        {
+            "model_format": "gptq",
+            "model_size_in_billions": 7,
+            "quantizations": [
+                "Int4"
+            ],
+            "model_hub": "modelscope",
+            "model_id": "Qwen/Qwen-VL-Chat-{quantization}",
+            "model_revision": "master"
+        }
+    ],
+    "prompt_style": {
+        "style_name": "QWEN",
+        "system_prompt": "You are a helpful assistant.",
+        "roles": [
+            "user",
+            "assistant"
+        ]
+    }
+},
+{
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "orion-chat",
+    "model_lang": [
+        "en",
+        "zh"
+    ],
+    "model_ability": [
+        "chat"
+    ],
+    "model_description": "Orion-14B series models are open-source multilingual large language models trained from scratch by OrionStarAI.",
+    "model_specs": [
+        {
+            "model_format": "pytorch",
+            "model_size_in_billions": 14,
+            "quantizations": [
+                "none",
+                "4-bit",
+                "8-bit"
+            ],
+            "model_id": "OrionStarAI/Orion-14B-Chat",
+            "model_hub": "modelscope"
+        },
+        {
+            "model_format": "awq",
+            "model_size_in_billions": 14,
+            "quantizations": [
+                "Int4"
+            ],
+            "model_hub": "modelscope",
+            "model_id": "OrionStarAI/Orion-14B-Chat-{quantization}"
+        }
+    ],
+    "prompt_style": {
+        "style_name": "orion",
+        "roles": [
+            "Human",
+            "assistant"
+        ],
+        "stop": [
+            "<s>",
+            "</s>",
+            "<unk>"
+        ]
+    }
+},
+{
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "orion-chat-rag",
+    "model_lang": [
+        "en",
+        "zh"
+    ],
+    "model_ability": [
+        "chat"
+    ],
+    "model_description": "Orion-14B series models are open-source multilingual large language models trained from scratch by OrionStarAI.",
+    "model_specs": [
+        {
+            "model_format": "pytorch",
+            "model_size_in_billions": 14,
+            "quantizations": [
+                "none",
+                "4-bit",
+                "8-bit"
+            ],
+            "model_hub": "modelscope",
+            "model_id": "OrionStarAI/Orion-14B-Chat-RAG"
+        }
+    ],
+    "prompt_style": {
+        "style_name": "orion",
+        "roles": [
+            "Human",
+            "assistant"
+        ],
+        "stop": [
+            "<s>",
+            "</s>",
+            "<unk>"
+        ]
+    }
+},
+{
+    "version": 1,
+    "context_length": 204800,
+    "model_name": "yi-vl-chat",
+    "model_lang": [
+        "en",
+        "zh"
+    ],
+    "model_ability": [
+        "chat",
+        "vision"
+    ],
+    "model_description": "Yi Vision Language (Yi-VL) model is the open-source, multimodal version of the Yi Large Language Model (LLM) series, enabling content comprehension, recognition, and multi-round conversations about images.",
+    "model_specs": [
+        {
+            "model_format": "pytorch",
+            "model_size_in_billions": 6,
+            "quantizations": [
+                "none"
+            ],
+            "model_hub": "modelscope",
+            "model_id": "01ai/Yi-VL-6B"
+        },
+        {
+            "model_format": "pytorch",
+            "model_size_in_billions": 34,
+            "quantizations": [
+                "none"
+            ],
+            "model_hub": "modelscope",
+            "model_id": "01ai/Yi-VL-34B"
+        }
+    ],
+    "prompt_style": {
+        "style_name": "CHATML",
+        "system_prompt": "",
+        "roles": [
+            "<|im_start|>user",
+            "<|im_start|>assistant"
+        ],
+        "intra_message_sep": "<|im_end|>",
+        "inter_message_sep": "",
+        "stop_token_ids": [
+            2,
+            6,
+            7,
+            8
+        ],
+        "stop": [
+            "<|endoftext|>",
+            "<|im_start|>",
+            "<|im_end|>",
+            "<|im_sep|>"
         ]
     }
 }

xinference/model/llm/pytorch/chatglm.py
@@ -120,9 +120,9 @@ class ChatglmPytorchChatModel(PytorchChatModel):
         top_p = generate_config.get("top_p")
         if top_p is not None:
             kwargs["top_p"] = float(top_p)
-        max_length = generate_config.get("max_tokens")
-        if max_length is not None:
-            kwargs["max_length"] = int(max_length)
+        max_new_tokens = generate_config.get("max_tokens")
+        if max_new_tokens is not None:
+            kwargs["max_new_tokens"] = int(max_new_tokens)
         # Tool calls only works for non stream, so we call chat directly.
         if prompt == SPECIAL_TOOL_PROMPT and chat_history:
             tool_message = chat_history.pop()
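
The rename matters because in Hugging Face generate() semantics max_length bounds prompt plus completion, while max_new_tokens bounds the completion alone, so a long chat history could previously consume most or all of the budget. A toy illustration (token counts are made up):

prompt_tokens = 900
budget = 1024

# Old mapping: max_tokens -> max_length, which also counts the prompt.
completion_budget_old = max(0, budget - prompt_tokens)  # only 124 tokens left

# New mapping: max_tokens -> max_new_tokens, all of it for the completion.
completion_budget_new = budget                          # full 1024 tokens

print(completion_budget_old, completion_budget_new)
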

xinference/model/llm/pytorch/core.py
@@ -190,7 +190,7 @@ class PytorchModel(LLM):
     def match(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if llm_spec.model_format not in ["pytorch", "gptq"]:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
             return False
         model_family = llm_family.model_family or llm_family.model_name
         if model_family in [
@@ -408,7 +408,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
     def match(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if llm_spec.model_format not in ["pytorch", "gptq"]:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
             return False
         if llm_family.model_name in [
             "baichuan-chat",
@@ -422,6 +422,8 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
             "llama-2",
             "llama-2-chat",
             "internlm2-chat",
+            "qwen-vl-chat",
+            "yi-vl-chat",
         ]:
             return False
         if "chat" not in llm_family.model_ability: