PyPI - xinference - Versions diffs - 0.11.3__py3-none-any.whl → 0.12.0__py3-none-any.whl - Mend

xinference 0.11.3__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of xinference might be problematic. Click here for more details.

Files changed (30)

xinference/_version.py +3 -3
xinference/api/restful_api.py +69 -0
xinference/client/restful/restful_client.py +70 -0
xinference/constants.py +4 -0
xinference/core/model.py +141 -12
xinference/core/scheduler.py +428 -0
xinference/core/supervisor.py +26 -0
xinference/isolation.py +9 -2
xinference/model/audio/chattts.py +84 -0
xinference/model/audio/core.py +10 -3
xinference/model/audio/model_spec.json +20 -0
xinference/model/llm/__init__.py +4 -0
xinference/model/llm/llm_family.json +507 -1
xinference/model/llm/llm_family_modelscope.json +409 -2
xinference/model/llm/pytorch/chatglm.py +2 -1
xinference/model/llm/pytorch/cogvlm2.py +76 -17
xinference/model/llm/pytorch/core.py +91 -6
xinference/model/llm/pytorch/glm4v.py +258 -0
xinference/model/llm/pytorch/minicpmv25.py +232 -0
xinference/model/llm/pytorch/utils.py +386 -2
xinference/model/llm/vllm/core.py +6 -0
xinference/thirdparty/ChatTTS/__init__.py +1 -0
xinference/thirdparty/ChatTTS/core.py +200 -0
xinference/types.py +3 -0
{xinference-0.11.3.dist-info → xinference-0.12.0.dist-info}/METADATA +26 -9
{xinference-0.11.3.dist-info → xinference-0.12.0.dist-info}/RECORD +30 -24
{xinference-0.11.3.dist-info → xinference-0.12.0.dist-info}/LICENSE +0 -0
{xinference-0.11.3.dist-info → xinference-0.12.0.dist-info}/WHEEL +0 -0
{xinference-0.11.3.dist-info → xinference-0.12.0.dist-info}/entry_points.txt +0 -0
{xinference-0.11.3.dist-info → xinference-0.12.0.dist-info}/top_level.txt +0 -0

xinference/model/audio/model_spec.json CHANGED Viewed

@@ -4,6 +4,7 @@
     "model_family": "whisper",
     "model_id": "openai/whisper-tiny",
     "model_revision": "167c219b21f11ef214220b8fdb7536b8a88c2475",
+    "ability": "audio-to-text",
     "multilingual": true
   },
   {
@@ -11,6 +12,7 @@
     "model_family": "whisper",
     "model_id": "openai/whisper-tiny.en",
     "model_revision": "87c7102498dcde7456f24cfd30239ca606ed9063",
+    "ability": "audio-to-text",
     "multilingual": false
   },
   {
@@ -18,6 +20,7 @@
     "model_family": "whisper",
     "model_id": "openai/whisper-base",
     "model_revision": "8c1db9b51951100007a96a525d83a8ec81b3c237",
+    "ability": "audio-to-text",
     "multilingual": true
   },
   {
@@ -25,6 +28,7 @@
     "model_family": "whisper",
     "model_id": "openai/whisper-base.en",
     "model_revision": "911407f4214e0e1d82085af863093ec0b66f9cd6",
+    "ability": "audio-to-text",
     "multilingual": false
   },
   {
@@ -32,6 +36,7 @@
     "model_family": "whisper",
     "model_id": "openai/whisper-small",
     "model_revision": "998cb1a777c20db53d6033a61b977ed4c3792cac",
+    "ability": "audio-to-text",
     "multilingual": true
   },
   {
@@ -39,6 +44,7 @@
     "model_family": "whisper",
     "model_id": "openai/whisper-small.en",
     "model_revision": "e8727524f962ee844a7319d92be39ac1bd25655a",
+    "ability": "audio-to-text",
     "multilingual": false
   },
   {
@@ -46,6 +52,7 @@
     "model_family": "whisper",
     "model_id": "openai/whisper-medium",
     "model_revision": "16688beb1294bedd0a6f5cd86fe7eec57bce41ed",
+    "ability": "audio-to-text",
     "multilingual": true
   },
   {
@@ -53,6 +60,7 @@
     "model_family": "whisper",
     "model_id": "openai/whisper-medium.en",
     "model_revision": "2e98eb6279edf5095af0c8dedb36bdec0acd172b",
+    "ability": "audio-to-text",
     "multilingual": false
   },
   {
@@ -60,6 +68,7 @@
     "model_family": "whisper",
     "model_id": "openai/whisper-large-v3",
     "model_revision": "6cdf07a7e3ec3806e5d55f787915b85d4cd020b1",
+    "ability": "audio-to-text",
     "multilingual": true
   },
   {
@@ -67,6 +76,7 @@
     "model_family": "whisper",
     "model_id": "BELLE-2/Belle-distilwhisper-large-v2-zh",
     "model_revision": "ed25d13498fa5bac758b2fc479435b698532dfe8",
+    "ability": "audio-to-text",
     "multilingual": false
   },
   {
@@ -74,6 +84,7 @@
     "model_family": "whisper",
     "model_id": "BELLE-2/Belle-whisper-large-v2-zh",
     "model_revision": "ec5bd5d78598545b7585814edde86dac2002b5b9",
+    "ability": "audio-to-text",
     "multilingual": false
   },
   {
@@ -81,6 +92,15 @@
     "model_family": "whisper",
     "model_id": "BELLE-2/Belle-whisper-large-v3-zh",
     "model_revision": "3bebc7247696b39f5ab9ed22db426943ac33f600",
+    "ability": "audio-to-text",
     "multilingual": false
+  },
+  {
+    "model_name": "ChatTTS",
+    "model_family": "ChatTTS",
+    "model_id": "2Noise/ChatTTS",
+    "model_revision": "ce5913842aebd78e4a01a02d47244b8d62ac4ee3",
+    "ability": "text-to-audio",
+    "multilingual": true
   }
 ]

xinference/model/llm/__init__.py CHANGED Viewed

@@ -117,9 +117,11 @@ def _install():
     from .pytorch.core import PytorchChatModel, PytorchModel
     from .pytorch.deepseek_vl import DeepSeekVLChatModel
     from .pytorch.falcon import FalconPytorchChatModel, FalconPytorchModel
+    from .pytorch.glm4v import Glm4VModel
     from .pytorch.intern_vl import InternVLChatModel
     from .pytorch.internlm2 import Internlm2PytorchChatModel
     from .pytorch.llama_2 import LlamaPytorchChatModel, LlamaPytorchModel
+    from .pytorch.minicpmv25 import MiniCPMV25Model
     from .pytorch.qwen_vl import QwenVLChatModel
     from .pytorch.vicuna import VicunaPytorchChatModel
     from .pytorch.yi_vl import YiVLChatModel
@@ -161,6 +163,8 @@ def _install():
             InternVLChatModel,
             PytorchModel,
             CogVLM2Model,
+            MiniCPMV25Model,
+            Glm4VModel,
         ]
     )
     if OmniLMMModel:  # type: ignore

xinference/model/llm/llm_family.json CHANGED Viewed

@@ -831,6 +831,139 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 131072,
+    "model_name": "glm4-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "tools"
+    ],
+    "model_description": "GLM4 is the open source version of the latest generation of pre-trained models in the GLM-4 series launched by Zhipu AI.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "THUDM/glm-4-9b-chat",
+        "model_revision": "b84dc74294ccd507a3d78bde8aebf628221af9bd"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "CHATGLM3",
+      "system_prompt": "",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "stop_token_ids": [
+        151329,
+        151336,
+        151338
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|user|>",
+        "<|observation|>"
+      ]
+    }
+  },
+  {
+    "version": 1,
+    "context_length": 1048576,
+    "model_name": "glm4-chat-1m",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "tools"
+    ],
+    "model_description": "GLM4 is the open source version of the latest generation of pre-trained models in the GLM-4 series launched by Zhipu AI.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "THUDM/glm-4-9b-chat-1m",
+        "model_revision": "715ddbe91082f976ff6a4ca06d59e5bbff6c3642"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "CHATGLM3",
+      "system_prompt": "",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "stop_token_ids": [
+        151329,
+        151336,
+        151338
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|user|>",
+        "<|observation|>"
+      ]
+    }
+  },
+  {
+    "version": 1,
+    "context_length": 8192,
+    "model_name": "glm-4v",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "vision"
+    ],
+    "model_description": "GLM4 is the open source version of the latest generation of pre-trained models in the GLM-4 series launched by Zhipu AI.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "THUDM/glm-4v-9b",
+        "model_revision": "e8b84fefc07e58a90c8489337675573fda95e289"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "CHATGLM3",
+      "system_prompt": "",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "stop_token_ids": [
+        151329,
+        151336,
+        151338
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|user|>",
+        "<|observation|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 2048,
@@ -2291,6 +2424,218 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "qwen2-instruct",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "tools"
+    ],
+    "model_description": "Qwen2 is the new series of Qwen large language models.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen2-0.5B-Instruct"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen2-1.5B-Instruct"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen2-7B-Instruct"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 72,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen2-72B-Instruct"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "Qwen/Qwen2-0.5B-Instruct-GPTQ-{quantization}"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "Qwen/Qwen2-1.5B-Instruct-GPTQ-{quantization}"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "Qwen/Qwen2-7B-Instruct-GPTQ-{quantization}"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 72,
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "Qwen/Qwen2-72B-Instruct-GPTQ-{quantization}"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "Qwen/Qwen2-0.5B-Instruct-AWQ"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "Qwen/Qwen2-1.5B-Instruct-AWQ"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "Qwen/Qwen2-7B-Instruct-AWQ"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 72,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "Qwen/Qwen2-72B-Instruct-AWQ"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "q2_k",
+          "q3_k_m",
+          "q4_0",
+          "q4_k_m",
+          "q5_0",
+          "q5_k_m",
+          "q6_k",
+          "q8_0",
+          "fp16"
+        ],
+        "model_id": "Qwen/Qwen2-0.5B-Instruct-GGUF",
+        "model_file_name_template": "qwen2-0_5b-instruct-{quantization}.gguf"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "QWEN",
+      "system_prompt": "You are a helpful assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n",
+      "stop_token_ids": [
+        151643,
+        151644,
+        151645
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|im_start|>",
+        "<|im_end|>"
+      ]
+    }
+  },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "qwen2-moe-instruct",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Qwen2 is the new series of Qwen large language models.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen2-57B-A14B-Instruct"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "QWEN",
+      "system_prompt": "You are a helpful assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n",
+      "stop_token_ids": [
+        151643,
+        151644,
+        151645
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|im_start|>",
+        "<|im_end|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 8192,
@@ -3251,6 +3596,125 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "mistral-instruct-v0.3",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "The Mistral-7B-Instruct-v0.3 Large Language Model (LLM) is an instruct fine-tuned version of the Mistral-7B-v0.3.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "mistralai/Mistral-7B-Instruct-v0.3",
+        "model_revision": "83e9aa141f2e28c82232fea5325f54edf17c43de"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "neuralmagic/Mistral-7B-Instruct-v0.3-GPTQ-4bit"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "solidrust/Mistral-7B-Instruct-v0.3-AWQ"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Q2_K",
+          "Q3_K_S",
+          "Q3_K_M",
+          "Q3_K_L",
+          "Q4_K_S",
+          "Q4_K_M",
+          "Q5_K_S",
+          "Q5_K_M",
+          "Q6_K",
+          "Q8_0",
+          "fp16"
+        ],
+        "model_id": "MaziyarPanahi/Mistral-7B-Instruct-v0.3-GGUF",
+        "model_file_name_template": "Mistral-7B-Instruct-v0.3.{quantization}.gguf"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "LLAMA2",
+      "system_prompt": "[INST] ",
+      "roles": [
+        "[INST]",
+        "[/INST]"
+      ],
+      "intra_message_sep": " ",
+      "inter_message_sep": "<s>",
+      "stop_token_ids": [
+        2
+      ],
+      "stop": [
+        "</s>"
+      ]
+    }
+  },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "codestral-v0.1",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "generate"
+    ],
+    "model_description": "Codestral-22B-v0.1 is trained on a diverse dataset of 80+ programming languages, including the most popular ones, such as Python, Java, C, C++, JavaScript, and Bash.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 22,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "mistralai/Mistral-7B-Instruct-v0.2",
+        "model_revision": "9552e7b1d9b2d5bbd87a5aa7221817285dbb6366"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 22,
+        "quantizations": [
+          "Q2_K",
+          "Q3_K_S",
+          "Q3_K_M",
+          "Q3_K_L",
+          "Q4_K_S",
+          "Q4_K_M",
+          "Q5_K_S",
+          "Q5_K_M",
+          "Q6_K",
+          "Q8_0"
+        ],
+        "model_id": "bartowski/Codestral-22B-v0.1-GGUF",
+        "model_file_name_template": "Codestral-22B-v0.1-{quantization}.gguf"
+      }
+    ]
+  },
   {
     "version": 1,
     "context_length": 8192,
@@ -5258,6 +5722,48 @@
       ]
     }
   },
+  {
+    "version":1,
+    "context_length":2048,
+    "model_name":"MiniCPM-Llama3-V-2_5",
+    "model_lang":[
+      "en",
+      "zh"
+    ],
+    "model_ability":[
+      "chat",
+      "vision"
+    ],
+    "model_description":"MiniCPM-Llama3-V 2.5 is the latest model in the MiniCPM-V series. The model is built on SigLip-400M and Llama3-8B-Instruct with a total of 8B parameters.",
+    "model_specs":[
+      {
+        "model_format":"pytorch",
+        "model_size_in_billions":8,
+        "quantizations":[
+          "none"
+        ],
+        "model_id":"openbmb/MiniCPM-Llama3-V-2_5",
+        "model_revision":"285a637ba8a30a0660dfcccad16f9a864f75abfd"
+      },
+      {
+        "model_format":"pytorch",
+        "model_size_in_billions":8,
+        "quantizations":[
+          "int4"
+        ],
+        "model_id":"openbmb/MiniCPM-Llama3-V-2_5-{quantization}",
+        "model_revision":"f92aff28552de35de3be204e8fe292dd4824e544"
+      }
+    ],
+    "prompt_style":{
+      "style_name":"OmniLMM",
+      "system_prompt":"The role of first msg should be user",
+      "roles":[
+        "user",
+        "assistant"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 4096,
@@ -6277,7 +6783,7 @@
         "quantizations": [
           "int4"
         ],
-        "model_id": "THUDM/cogvlm2-llama3-chinese-chat-19B-{quantizations}",
+        "model_id": "THUDM/cogvlm2-llama3-chinese-chat-19B-{quantization}",
         "model_revision": "7863e362174f4718c2fe9cba4befd0b580a3194f"
       }
     ],

xinference 0.11.3__py3-none-any.whl → 0.12.0__py3-none-any.whl

Potentially problematic release.

xinference 0.11.3__py3-none-any.whl → 0.12.0__py3-none-any.whl