PyPI - xinference - Versions diffs - 0.7.5__py3-none-any.whl → 0.8.1__py3-none-any.whl - Mend

xinference 0.7.5__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of xinference might be problematic. Click here for more details.

Files changed (120)

xinference/model/embedding/model_spec_modelscope.json CHANGED Viewed

@@ -5,7 +5,8 @@
     "max_tokens": 512,
     "language": ["en"],
     "model_id": "Xorbits/bge-large-en",
-    "model_revision": "v0.0.1"
+    "model_revision": "v0.0.1",
+    "model_hub": "modelscope"
   },
   {
     "model_name": "bge-base-en",
@@ -13,7 +14,8 @@
     "max_tokens": 512,
     "language": ["en"],
     "model_id": "Xorbits/bge-base-en",
-    "model_revision": "v0.0.1"
+    "model_revision": "v0.0.1",
+    "model_hub": "modelscope"
   },
   {
     "model_name": "gte-large",
@@ -21,7 +23,8 @@
     "max_tokens": 512,
     "language": ["en"],
     "model_id": "Xorbits/gte-large",
-    "model_revision": "v0.0.1"
+    "model_revision": "v0.0.1",
+    "model_hub": "modelscope"
   },
   {
     "model_name": "gte-base",
@@ -29,7 +32,8 @@
     "max_tokens": 512,
     "language": ["en"],
     "model_id": "Xorbits/gte-base",
-    "model_revision": "v0.0.1"
+    "model_revision": "v0.0.1",
+    "model_hub": "modelscope"
   },
   {
     "model_name": "e5-large-v2",
@@ -37,7 +41,8 @@
     "max_tokens": 512,
     "language": ["en"],
     "model_id": "Xorbits/e5-large-v2",
-    "model_revision": "v0.0.1"
+    "model_revision": "v0.0.1",
+    "model_hub": "modelscope"
   },
   {
     "model_name": "bge-large-zh",
@@ -45,7 +50,8 @@
     "max_tokens": 512,
     "language": ["zh"],
     "model_id": "Xorbits/bge-large-zh",
-    "model_revision": "v0.0.1"
+    "model_revision": "v0.0.1",
+    "model_hub": "modelscope"
   },
   {
     "model_name": "bge-large-zh-noinstruct",
@@ -53,7 +59,8 @@
     "max_tokens": 512,
     "language": ["zh"],
     "model_id": "Xorbits/bge-large-zh-noinstruct",
-    "model_revision": "v0.0.1"
+    "model_revision": "v0.0.1",
+    "model_hub": "modelscope"
   },
   {
     "model_name": "bge-base-zh",
@@ -61,7 +68,8 @@
     "max_tokens": 512,
     "language": ["zh"],
     "model_id": "Xorbits/bge-base-zh",
-    "model_revision": "v0.0.2"
+    "model_revision": "v0.0.2",
+    "model_hub": "modelscope"
   },
   {
     "model_name": "multilingual-e5-large",
@@ -69,7 +77,8 @@
     "max_tokens": 514,
     "language": ["zh"],
     "model_id": "Xorbits/multilingual-e5-large",
-    "model_revision": "v0.0.1"
+    "model_revision": "v0.0.1",
+    "model_hub": "modelscope"
   },
   {
     "model_name": "bge-small-zh",
@@ -77,7 +86,8 @@
     "max_tokens": 512,
     "language": ["zh"],
     "model_id": "Xorbits/bge-small-zh",
-    "model_revision": "v0.0.1"
+    "model_revision": "v0.0.1",
+    "model_hub": "modelscope"
   },
   {
     "model_name": "bge-small-zh-v1.5",
@@ -85,7 +95,8 @@
     "max_tokens": 512,
     "language": ["zh"],
     "model_id": "Xorbits/bge-small-zh-v1.5",
-    "model_revision": "v0.0.2"
+    "model_revision": "v0.0.2",
+    "model_hub": "modelscope"
   },
   {
     "model_name": "bge-base-zh-v1.5",
@@ -93,7 +104,8 @@
     "max_tokens": 512,
     "language": ["zh"],
     "model_id": "Xorbits/bge-base-zh-v1.5",
-    "model_revision": "v0.0.1"
+    "model_revision": "v0.0.1",
+    "model_hub": "modelscope"
   },
   {
     "model_name": "bge-large-zh-v1.5",
@@ -101,7 +113,8 @@
     "max_tokens": 512,
     "language": ["zh"],
     "model_id": "Xorbits/bge-large-zh-v1.5",
-    "model_revision": "v0.0.1"
+    "model_revision": "v0.0.1",
+    "model_hub": "modelscope"
   },
   {
     "model_name": "bge-small-en-v1.5",
@@ -109,7 +122,8 @@
     "max_tokens": 512,
     "language": ["en"],
     "model_id": "Xorbits/bge-small-en-v1.5",
-    "model_revision": "v0.0.2"
+    "model_revision": "v0.0.2",
+    "model_hub": "modelscope"
   },
   {
     "model_name": "bge-base-en-v1.5",
@@ -117,7 +131,8 @@
     "max_tokens": 512,
     "language": ["en"],
     "model_id": "Xorbits/bge-base-en-v1.5",
-    "model_revision": "v0.0.1"
+    "model_revision": "v0.0.1",
+    "model_hub": "modelscope"
   },
   {
     "model_name": "bge-large-en-v1.5",
@@ -125,7 +140,8 @@
     "max_tokens": 512,
     "language": ["en"],
     "model_id": "Xorbits/bge-large-en-v1.5",
-    "model_revision": "v0.0.1"
+    "model_revision": "v0.0.1",
+    "model_hub": "modelscope"
   },
     {
     "model_name": "jina-embeddings-v2-small-en",
@@ -133,7 +149,8 @@
     "max_tokens": 8192,
     "language": ["en"],
     "model_id": "Xorbits/jina-embeddings-v2-small-en",
-    "model_revision": "v0.0.1"
+    "model_revision": "v0.0.1",
+    "model_hub": "modelscope"
   },
   {
     "model_name": "jina-embeddings-v2-base-en",
@@ -141,6 +158,31 @@
     "max_tokens": 8192,
     "language": ["en"],
     "model_id": "Xorbits/jina-embeddings-v2-base-en",
-    "model_revision": "v0.0.1"
+    "model_revision": "v0.0.1",
+    "model_hub": "modelscope"
+  },
+  {
+    "model_name": "text2vec-large-chinese",
+    "dimensions": 1024,
+    "max_tokens": 256,
+    "language": ["zh"],
+    "model_id": "Jerry0/text2vec-large-chinese",
+    "model_hub": "modelscope"
+  },
+  {
+    "model_name": "text2vec-base-chinese",
+    "dimensions": 768,
+    "max_tokens": 128,
+    "language": ["zh"],
+    "model_id": "Jerry0/text2vec-base-chinese",
+    "model_hub": "modelscope"
+  },
+  {
+    "model_name": "text2vec-base-chinese-paraphrase",
+    "dimensions": 768,
+    "max_tokens": 256,
+    "language": ["zh"],
+    "model_id": "mwei23/text2vec-base-chinese-paraphrase",
+    "model_hub": "modelscope"
   }
 ]

xinference/model/image/stable_diffusion/core.py CHANGED Viewed

@@ -15,6 +15,7 @@
 import base64
 import logging
 import os
+import re
 import time
 import uuid
 from concurrent.futures import ThreadPoolExecutor
@@ -101,7 +102,7 @@ class DiffusionModel:
             def _gen_base64_image(_img):
                 buffered = BytesIO()
                 _img.save(buffered, format="jpeg")
-                return base64.b64encode(buffered.getvalue())
+                return base64.b64encode(buffered.getvalue()).decode()
             with ThreadPoolExecutor() as executor:
                 results = list(map(partial(executor.submit, _gen_base64_image), images))
@@ -120,7 +121,7 @@ class DiffusionModel:
     ):
         # References:
         # https://huggingface.co/docs/diffusers/main/en/api/pipelines/controlnet_sdxl
-        width, height = map(int, size.split("*"))
+        width, height = map(int, re.split(r"[^\d]+", size))
         return self._call_model(
             prompt=prompt,
             height=height,
@@ -140,7 +141,7 @@ class DiffusionModel:
         response_format: str = "url",
         **kwargs,
     ):
-        width, height = map(int, size.split("*"))
+        width, height = map(int, re.split(r"[^\d]+", size))
         return self._call_model(
             image=image,
             prompt=prompt,

xinference/model/llm/__init__.py CHANGED Viewed

@@ -21,6 +21,7 @@ from .llm_family import (
     BUILTIN_LLM_FAMILIES,
     BUILTIN_LLM_MODEL_CHAT_FAMILIES,
     BUILTIN_LLM_MODEL_GENERATE_FAMILIES,
+    BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES,
     BUILTIN_LLM_PROMPT_STYLE,
     BUILTIN_MODELSCOPE_LLM_FAMILIES,
     LLM_CLASSES,
@@ -47,6 +48,7 @@ def _install():
     from .pytorch.chatglm import ChatglmPytorchChatModel
     from .pytorch.core import PytorchChatModel, PytorchModel
     from .pytorch.falcon import FalconPytorchChatModel, FalconPytorchModel
+    from .pytorch.internlm2 import Internlm2PytorchChatModel
     from .pytorch.llama_2 import LlamaPytorchChatModel, LlamaPytorchModel
     from .pytorch.vicuna import VicunaPytorchChatModel
     from .vllm.core import VLLMChatModel, VLLMModel
@@ -79,6 +81,7 @@ def _install():
             LlamaPytorchChatModel,
             PytorchChatModel,
             FalconPytorchModel,
+            Internlm2PytorchChatModel,
             PytorchModel,
         ]
     )
@@ -102,6 +105,8 @@ def _install():
             BUILTIN_LLM_MODEL_CHAT_FAMILIES.add(model_spec.model_name)
         else:
             BUILTIN_LLM_MODEL_GENERATE_FAMILIES.add(model_spec.model_name)
+        if "tool_call" in model_spec.model_ability:
+            BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES.add(model_spec.model_name)
     modelscope_json_path = os.path.join(
         os.path.dirname(os.path.abspath(__file__)), "llm_family_modelscope.json"
@@ -123,6 +128,8 @@ def _install():
             BUILTIN_LLM_MODEL_CHAT_FAMILIES.add(model_spec.model_name)
         else:
             BUILTIN_LLM_MODEL_GENERATE_FAMILIES.add(model_spec.model_name)
+        if "tool_call" in model_spec.model_ability:
+            BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES.add(model_spec.model_name)
     from ...constants import XINFERENCE_MODEL_DIR

xinference/model/llm/ggml/llamacpp.py CHANGED Viewed

@@ -306,7 +306,8 @@ class LlamaCppChatModel(LlamaCppModel, ChatModelMixin):
         generate_config = self._sanitize_generate_config(generate_config)
         # TODO(codingl2k1): qwen hacky to set stop for function call.
-        if tools and self.model_family.model_name == "qwen-chat":
+        model_family = self.model_family.model_family or self.model_family.model_name
+        if tools and "qwen-chat" == model_family:
             stop = generate_config.get("stop")
             if isinstance(stop, str):
                 generate_config["stop"] = [stop, "Observation:"]
@@ -326,6 +327,6 @@ class LlamaCppChatModel(LlamaCppModel, ChatModelMixin):
             assert not isinstance(c, Iterator)
             if tools:
                 return self._tool_calls_completion(
-                    self.model_family.model_name, self.model_uid, c, tools
+                    self.model_family, self.model_uid, c, tools
                 )
             return self._to_chat_completion(c)

xinference/model/llm/llm_family.json CHANGED Viewed

@@ -535,7 +535,8 @@
       "zh"
     ],
     "model_ability": [
-      "chat"
+      "chat",
+      "tools"
     ],
     "model_description": "ChatGLM3 is the third generation of ChatGLM, still open-source and trained on Chinese and English data.",
     "model_specs": [
@@ -609,6 +610,15 @@
       "roles": [
         "user",
         "assistant"
+      ],
+      "stop_token_ids": [
+        64795,
+        64797,
+        2
+      ],
+      "stop":[
+        "<|user|>",
+        "<|observation|>"
       ]
     }
   },
@@ -1139,14 +1149,15 @@
   },
   {
     "version": 1,
-    "context_length": 2048,
+    "context_length": 32768,
     "model_name": "qwen-chat",
     "model_lang": [
       "en",
       "zh"
     ],
     "model_ability": [
-      "chat"
+      "chat",
+      "tools"
     ],
     "model_description": "Qwen-chat is a fine-tuned version of the Qwen LLM trained with alignment techniques, specializing in chatting.",
     "model_specs": [
@@ -1172,6 +1183,8 @@
         "model_format": "pytorch",
         "model_size_in_billions": "1_8",
         "quantizations": [
+          "4-bit",
+          "8-bit",
           "none"
         ],
         "model_id": "Qwen/Qwen-1_8B-Chat",
@@ -1181,6 +1194,8 @@
         "model_format": "pytorch",
         "model_size_in_billions": 7,
         "quantizations": [
+          "4-bit",
+          "8-bit",
           "none"
         ],
         "model_id": "Qwen/Qwen-7B-Chat",
@@ -1190,6 +1205,8 @@
         "model_format": "pytorch",
         "model_size_in_billions": 14,
         "quantizations": [
+          "4-bit",
+          "8-bit",
           "none"
         ],
         "model_id": "Qwen/Qwen-14B-Chat",
@@ -1199,6 +1216,8 @@
         "model_format": "pytorch",
         "model_size_in_billions": 72,
         "quantizations": [
+          "4-bit",
+          "8-bit",
           "none"
         ],
         "model_id": "Qwen/Qwen-72B-Chat",
@@ -1213,6 +1232,15 @@
         ],
         "model_id": "Qwen/Qwen-7B-Chat-{quantization}"
       },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": "1_8",
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "Qwen/Qwen-1_8B-Chat-{quantization}"
+      },
       {
         "model_format": "gptq",
         "model_size_in_billions": 14,
@@ -2468,6 +2496,14 @@
     ],
     "model_description": "The Yi series models are large language models trained from scratch by developers at 01.AI.",
     "model_specs": [
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 34,
+        "quantizations": [
+          "8bits"
+        ],
+        "model_id": "01-ai/Yi-34B-Chat-{quantization}"
+      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 34,
@@ -3127,5 +3163,53 @@
         "model_revision": "70d1740208c8ba39f9ba250b22117ec25311ab33"
       }
     ]
+  },
+  {
+    "version": 1,
+    "context_length": 204800,
+    "model_name": "internlm2-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "The second generation of the InternLM model, InternLM2.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "internlm/internlm2-chat-7b",
+        "model_revision": "5797f79825bab7013932d57e2babaac1b8de6b4f"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 20,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "internlm/internlm2-chat-20b",
+        "model_revision": "3ccaf3ae82d5d01c0a95eecf40ee550f9c543635"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "INTERNLM2",
+      "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).",
+      "roles": [
+        "[UNUSED_TOKEN_146]user",
+        "[UNUSED_TOKEN_146]assistant"
+      ],
+      "intra_message_sep": "[UNUSED_TOKEN_145]",
+      "stop_token_ids": [
+        92542
+      ],
+      "stop": [
+        "[UNUSED_TOKEN_145]"
+      ]
+    }
   }
 ]

xinference/model/llm/llm_family.py CHANGED Viewed

@@ -43,6 +43,7 @@ DEFAULT_CONTEXT_LENGTH = 2048
 BUILTIN_LLM_PROMPT_STYLE: Dict[str, "PromptStyleV1"] = {}
 BUILTIN_LLM_MODEL_CHAT_FAMILIES: Set[str] = set()
 BUILTIN_LLM_MODEL_GENERATE_FAMILIES: Set[str] = set()
+BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES: Set[str] = set()
 class GgmlLLMSpecV1(BaseModel):
@@ -50,7 +51,7 @@ class GgmlLLMSpecV1(BaseModel):
     # Must in order that `str` first, then `int`
     model_size_in_billions: Union[str, int]
     quantizations: List[str]
-    model_id: str
+    model_id: Optional[str]
     model_file_name_template: str
     model_hub: str = "huggingface"
     model_uri: Optional[str]
@@ -73,7 +74,7 @@ class PytorchLLMSpecV1(BaseModel):
     # Must in order that `str` first, then `int`
     model_size_in_billions: Union[str, int]
     quantizations: List[str]
-    model_id: str
+    model_id: Optional[str]
     model_hub: str = "huggingface"
     model_uri: Optional[str]
     model_revision: Optional[str]
@@ -105,7 +106,7 @@ class LLMFamilyV1(BaseModel):
     context_length: Optional[int] = DEFAULT_CONTEXT_LENGTH
     model_name: str
     model_lang: List[str]
-    model_ability: List[Literal["embed", "generate", "chat"]]
+    model_ability: List[Literal["embed", "generate", "chat", "tools"]]
     model_description: Optional[str]
     # reason for not required str here: legacy registration
     model_family: Optional[str]
@@ -155,6 +156,15 @@ class CustomLLMFamilyV1(LLMFamilyV1):
                 f"`model_family` for chat model must be `other` or one of the following values: \n"
                 f"{', '.join(list(BUILTIN_LLM_MODEL_CHAT_FAMILIES))}"
             )
+        if (
+            llm_spec.model_family != "other"
+            and "tool_call" in llm_spec.model_ability
+            and llm_spec.model_family not in BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES
+        ):
+            raise ValueError(
+                f"`model_family` for tool call model must be `other` or one of the following values: \n"
+                f"{', '.join(list(BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES))}"
+            )
         if (
             llm_spec.model_family != "other"
             and "chat" not in llm_spec.model_ability
@@ -413,7 +423,7 @@ def _get_cache_dir(
     # quantization a dedicated cache dir.
     quant_suffix = ""
     for q in llm_spec.quantizations:
-        if q in llm_spec.model_id:
+        if llm_spec.model_id and q in llm_spec.model_id:
             quant_suffix = q
             break
     cache_dir_name = (
@@ -726,7 +736,7 @@ def match_llm(
     def _apply_format_to_model_id(spec: LLMSpecV1, q: str) -> LLMSpecV1:
         # Different quantized versions of some models use different model ids,
         # Here we check the `{}` in the model id to format the id.
-        if "{" in spec.model_id:
+        if spec.model_id and "{" in spec.model_id:
             spec.model_id = spec.model_id.format(quantization=q)
         return spec

xinference/model/llm/llm_family_modelscope.json CHANGED Viewed

@@ -297,7 +297,8 @@
       "zh"
     ],
     "model_ability": [
-      "chat"
+      "chat",
+      "tools"
     ],
     "model_description": "ChatGLM3 is the third generation of ChatGLM, still open-source and trained on Chinese and English data.",
     "model_specs": [
@@ -375,6 +376,15 @@
       "roles": [
         "user",
         "assistant"
+      ],
+      "stop_token_ids": [
+        64795,
+        64797,
+        2
+      ],
+      "stop":[
+        "<|user|>",
+        "<|observation|>"
       ]
     }
   },
@@ -1108,6 +1118,15 @@
     ],
     "model_description": "The Yi series models are large language models trained from scratch by developers at 01.AI.",
     "model_specs": [
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 34,
+        "quantizations": [
+          "8bits"
+        ],
+        "model_id": "01ai/Yi-34B-Chat-{quantization}",
+        "model_revision": "master"
+      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 34,
@@ -1452,14 +1471,15 @@
   },
   {
     "version": 1,
-    "context_length": 2048,
+    "context_length": 32768,
     "model_name": "qwen-chat",
     "model_lang": [
       "en",
       "zh"
     ],
     "model_ability": [
-      "chat"
+      "chat",
+      "tools"
     ],
     "model_description": "Qwen-chat is a fine-tuned version of the Qwen LLM trained with alignment techniques, specializing in chatting.",
     "model_specs": [
@@ -1489,6 +1509,8 @@
         "model_format": "pytorch",
         "model_size_in_billions": "1_8",
         "quantizations": [
+          "4-bit",
+          "8-bit",
           "none"
         ],
         "model_hub": "modelscope",
@@ -1499,6 +1521,8 @@
         "model_format": "pytorch",
         "model_size_in_billions": 7,
         "quantizations": [
+          "4-bit",
+          "8-bit",
           "none"
         ],
         "model_hub": "modelscope",
@@ -1509,6 +1533,8 @@
         "model_format": "pytorch",
         "model_size_in_billions": 72,
         "quantizations": [
+          "4-bit",
+          "8-bit",
           "none"
         ],
         "model_hub": "modelscope",
@@ -1519,12 +1545,25 @@
         "model_format": "pytorch",
         "model_size_in_billions": 14,
         "quantizations": [
+          "4-bit",
+          "8-bit",
           "none"
         ],
         "model_id": "qwen/Qwen-14B-Chat",
         "model_hub": "modelscope",
         "model_revision": "v1.0.7"
       },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": "1_8",
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "qwen/Qwen-1_8B-Chat-{quantization}",
+        "model_hub": "modelscope",
+        "model_revision": "master"
+      },
       {
         "model_format": "gptq",
         "model_size_in_billions": 7,
@@ -1739,5 +1778,55 @@
         "model_revision": "master"
       }
     ]
+  },
+  {
+    "version": 1,
+    "context_length": 204800,
+    "model_name": "internlm2-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "The second generation of the InternLM model, InternLM2.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "Shanghai_AI_Laboratory/internlm2-chat-7b",
+        "model_hub": "modelscope",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 20,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "Shanghai_AI_Laboratory/internlm2-chat-20b",
+        "model_hub": "modelscope",
+        "model_revision": "master"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "INTERNLM2",
+      "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).",
+      "roles": [
+        "[UNUSED_TOKEN_146]user",
+        "[UNUSED_TOKEN_146]assistant"
+      ],
+      "intra_message_sep": "[UNUSED_TOKEN_145]",
+      "stop_token_ids": [
+        92542
+      ],
+      "stop": [
+        "[UNUSED_TOKEN_145]"
+      ]
+    }
   }
 ]

xinference 0.7.5__py3-none-any.whl → 0.8.1__py3-none-any.whl

Potentially problematic release.

xinference 0.7.5__py3-none-any.whl → 0.8.1__py3-none-any.whl