xinference 0.11.2.post1__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (36)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +83 -8
  3. xinference/client/restful/restful_client.py +70 -0
  4. xinference/constants.py +8 -0
  5. xinference/core/__init__.py +0 -2
  6. xinference/core/cache_tracker.py +22 -1
  7. xinference/core/chat_interface.py +71 -10
  8. xinference/core/model.py +141 -12
  9. xinference/core/scheduler.py +428 -0
  10. xinference/core/supervisor.py +31 -3
  11. xinference/core/worker.py +8 -3
  12. xinference/isolation.py +9 -2
  13. xinference/model/audio/chattts.py +84 -0
  14. xinference/model/audio/core.py +10 -3
  15. xinference/model/audio/model_spec.json +20 -0
  16. xinference/model/llm/__init__.py +6 -0
  17. xinference/model/llm/llm_family.json +1063 -260
  18. xinference/model/llm/llm_family_modelscope.json +686 -13
  19. xinference/model/llm/pytorch/baichuan.py +2 -1
  20. xinference/model/llm/pytorch/chatglm.py +2 -1
  21. xinference/model/llm/pytorch/cogvlm2.py +316 -0
  22. xinference/model/llm/pytorch/core.py +92 -6
  23. xinference/model/llm/pytorch/glm4v.py +258 -0
  24. xinference/model/llm/pytorch/intern_vl.py +5 -10
  25. xinference/model/llm/pytorch/minicpmv25.py +232 -0
  26. xinference/model/llm/pytorch/utils.py +386 -2
  27. xinference/model/llm/vllm/core.py +7 -1
  28. xinference/thirdparty/ChatTTS/__init__.py +1 -0
  29. xinference/thirdparty/ChatTTS/core.py +200 -0
  30. xinference/types.py +3 -0
  31. {xinference-0.11.2.post1.dist-info → xinference-0.12.0.dist-info}/METADATA +28 -11
  32. {xinference-0.11.2.post1.dist-info → xinference-0.12.0.dist-info}/RECORD +36 -29
  33. {xinference-0.11.2.post1.dist-info → xinference-0.12.0.dist-info}/LICENSE +0 -0
  34. {xinference-0.11.2.post1.dist-info → xinference-0.12.0.dist-info}/WHEEL +0 -0
  35. {xinference-0.11.2.post1.dist-info → xinference-0.12.0.dist-info}/entry_points.txt +0 -0
  36. {xinference-0.11.2.post1.dist-info → xinference-0.12.0.dist-info}/top_level.txt +0 -0
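Beyond the JSON registry churn below, the notable additions in this file list are a new `xinference/core/scheduler.py` (+428 lines), new vision model implementations (`cogvlm2.py`, `glm4v.py`, `minicpmv25.py`), and a vendored `ChatTTS` under `xinference/thirdparty`. For anyone checking which side of this diff they are running, a minimal sketch using only the standard library (nothing from xinference itself is assumed):

    # Confirm the installed wheel version, stdlib only.
    from importlib.metadata import version

    print(version("xinference"))  # "0.11.2.post1" before upgrading, "0.12.0" after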
@@ -831,6 +831,139 @@
  ]
  }
  },
+ {
+ "version": 1,
+ "context_length": 131072,
+ "model_name": "glm4-chat",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "chat",
+ "tools"
+ ],
+ "model_description": "GLM4 is the open source version of the latest generation of pre-trained models in the GLM-4 series launched by Zhipu AI.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 9,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "THUDM/glm-4-9b-chat",
+ "model_revision": "b84dc74294ccd507a3d78bde8aebf628221af9bd"
+ }
+ ],
+ "prompt_style": {
+ "style_name": "CHATGLM3",
+ "system_prompt": "",
+ "roles": [
+ "user",
+ "assistant"
+ ],
+ "stop_token_ids": [
+ 151329,
+ 151336,
+ 151338
+ ],
+ "stop": [
+ "<|endoftext|>",
+ "<|user|>",
+ "<|observation|>"
+ ]
+ }
+ },
+ {
+ "version": 1,
+ "context_length": 1048576,
+ "model_name": "glm4-chat-1m",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "chat",
+ "tools"
+ ],
+ "model_description": "GLM4 is the open source version of the latest generation of pre-trained models in the GLM-4 series launched by Zhipu AI.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 9,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "THUDM/glm-4-9b-chat-1m",
+ "model_revision": "715ddbe91082f976ff6a4ca06d59e5bbff6c3642"
+ }
+ ],
+ "prompt_style": {
+ "style_name": "CHATGLM3",
+ "system_prompt": "",
+ "roles": [
+ "user",
+ "assistant"
+ ],
+ "stop_token_ids": [
+ 151329,
+ 151336,
+ 151338
+ ],
+ "stop": [
+ "<|endoftext|>",
+ "<|user|>",
+ "<|observation|>"
+ ]
+ }
+ },
+ {
+ "version": 1,
+ "context_length": 8192,
+ "model_name": "glm-4v",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "chat",
+ "vision"
+ ],
+ "model_description": "GLM4 is the open source version of the latest generation of pre-trained models in the GLM-4 series launched by Zhipu AI.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 9,
+ "quantizations": [
+ "none"
+ ],
+ "model_id": "THUDM/glm-4v-9b",
+ "model_revision": "e8b84fefc07e58a90c8489337675573fda95e289"
+ }
+ ],
+ "prompt_style": {
+ "style_name": "CHATGLM3",
+ "system_prompt": "",
+ "roles": [
+ "user",
+ "assistant"
+ ],
+ "stop_token_ids": [
+ 151329,
+ 151336,
+ 151338
+ ],
+ "stop": [
+ "<|endoftext|>",
+ "<|user|>",
+ "<|observation|>"
+ ]
+ }
+ },
  {
  "version": 1,
  "context_length": 2048,
@@ -2293,176 +2426,388 @@
  },
  {
  "version": 1,
- "context_length": 8192,
- "model_name": "starcoder",
- "model_lang": [
- "en"
- ],
- "model_ability": [
- "generate"
- ],
- "model_description": "Starcoder is an open-source Transformer based LLM that is trained on permissively licensed data from GitHub.",
- "model_specs": [
- {
- "model_format": "ggmlv3",
- "model_size_in_billions": 16,
- "quantizations": [
- "q4_0",
- "q4_1",
- "q5_0",
- "q5_1",
- "q8_0"
- ],
- "model_id": "TheBloke/starcoder-GGML",
- "model_file_name_template": "starcoder.ggmlv3.{quantization}.bin"
- }
- ]
- },
- {
- "version": 1,
- "context_length": 1024,
- "model_name": "gpt-2",
- "model_lang": [
- "en"
- ],
- "model_ability": [
- "generate"
- ],
- "model_description": "GPT-2 is a Transformer-based LLM that is trained on WebTest, a 40 GB dataset of Reddit posts with 3+ upvotes.",
- "model_specs": [
- {
- "model_format": "pytorch",
- "model_size_in_billions": "1_5",
- "quantizations": [
- "none"
- ],
- "model_id": "openai-community/gpt2",
- "model_revision": "607a30d783dfa663caf39e06633721c8d4cfcd7e"
- }
- ]
- },
- {
- "version": 1,
- "context_length": 8192,
- "model_name": "internlm-7b",
+ "context_length": 32768,
+ "model_name": "qwen2-instruct",
  "model_lang": [
  "en",
  "zh"
  ],
  "model_ability": [
- "generate"
+ "chat",
+ "tools"
  ],
- "model_description": "InternLM is a Transformer-based LLM that is trained on both Chinese and English data, focusing on practical scenarios.",
+ "model_description": "Qwen2 is the new series of Qwen large language models",
  "model_specs": [
  {
  "model_format": "pytorch",
- "model_size_in_billions": 7,
+ "model_size_in_billions": "0_5",
  "quantizations": [
  "4-bit",
  "8-bit",
  "none"
  ],
- "model_id": "internlm/internlm-7b",
- "model_revision": "592b0efc83be3eb1cba8990c4caf41ce604b958c"
- }
- ]
- },
- {
- "version": 1,
- "context_length": 4096,
- "model_name": "internlm-chat-7b",
- "model_lang": [
- "en",
- "zh"
- ],
- "model_ability": [
- "chat"
- ],
- "model_description": "Internlm-chat is a fine-tuned version of the Internlm LLM, specializing in chatting.",
- "model_specs": [
+ "model_id": "Qwen/Qwen2-0.5B-Instruct"
+ },
  {
  "model_format": "pytorch",
- "model_size_in_billions": 7,
+ "model_size_in_billions": "1_5",
  "quantizations": [
  "4-bit",
  "8-bit",
  "none"
  ],
- "model_id": "internlm/internlm-chat-7b",
- "model_revision": "d4fa2dbcbd2fa4edfa6735aa2ba0f0577fed6a62"
- }
- ],
- "prompt_style": {
- "style_name": "INTERNLM",
- "system_prompt": "",
- "roles": [
- "<|User|>",
- "<|Bot|>"
- ],
- "intra_message_sep": "<eoh>\n",
- "inter_message_sep": "<eoa>\n",
- "stop_token_ids": [
- 1,
- 103028
- ],
- "stop": [
- "<eoa>"
- ]
- }
- },
- {
- "version": 1,
- "context_length": 16384,
- "model_name": "internlm-20b",
- "model_lang": [
- "en",
- "zh"
- ],
- "model_ability": [
- "generate"
- ],
- "model_description": "Pre-trained on over 2.3T Tokens containing high-quality English, Chinese, and code data.",
- "model_specs": [
+ "model_id": "Qwen/Qwen2-1.5B-Instruct"
+ },
  {
  "model_format": "pytorch",
- "model_size_in_billions": 20,
+ "model_size_in_billions": 7,
  "quantizations": [
  "4-bit",
  "8-bit",
  "none"
  ],
- "model_id": "internlm/internlm-20b",
- "model_revision": "c56a72957239b490ea206ea857e86611b3f65f3a"
- }
- ]
- },
- {
- "version": 1,
- "context_length": 16384,
- "model_name": "internlm-chat-20b",
- "model_lang": [
- "en",
- "zh"
- ],
- "model_ability": [
- "chat"
- ],
- "model_description": "Pre-trained on over 2.3T Tokens containing high-quality English, Chinese, and code data. The Chat version has undergone SFT and RLHF training.",
- "model_specs": [
+ "model_id": "Qwen/Qwen2-7B-Instruct"
+ },
  {
  "model_format": "pytorch",
- "model_size_in_billions": 20,
+ "model_size_in_billions": 72,
  "quantizations": [
  "4-bit",
  "8-bit",
  "none"
  ],
- "model_id": "internlm/internlm-chat-20b",
- "model_revision": "c67e80e42c4950ebae18a955c9fe138c5ceb5b10"
- }
- ],
- "prompt_style": {
- "style_name": "INTERNLM",
+ "model_id": "Qwen/Qwen2-72B-Instruct"
+ },
+ {
+ "model_format": "gptq",
+ "model_size_in_billions": "0_5",
+ "quantizations": [
+ "Int4",
+ "Int8"
+ ],
+ "model_id": "Qwen/Qwen2-0.5B-Instruct-GPTQ-{quantization}"
+ },
+ {
+ "model_format": "gptq",
+ "model_size_in_billions": "1_5",
+ "quantizations": [
+ "Int4",
+ "Int8"
+ ],
+ "model_id": "Qwen/Qwen2-1.5B-Instruct-GPTQ-{quantization}"
+ },
+ {
+ "model_format": "gptq",
+ "model_size_in_billions": 7,
+ "quantizations": [
+ "Int4",
+ "Int8"
+ ],
+ "model_id": "Qwen/Qwen2-7B-Instruct-GPTQ-{quantization}"
+ },
+ {
+ "model_format": "gptq",
+ "model_size_in_billions": 72,
+ "quantizations": [
+ "Int4",
+ "Int8"
+ ],
+ "model_id": "Qwen/Qwen2-72B-Instruct-GPTQ-{quantization}"
+ },
+ {
+ "model_format": "awq",
+ "model_size_in_billions": "0_5",
+ "quantizations": [
+ "Int4"
+ ],
+ "model_id": "Qwen/Qwen2-0.5B-Instruct-AWQ"
+ },
+ {
+ "model_format": "awq",
+ "model_size_in_billions": "1_5",
+ "quantizations": [
+ "Int4"
+ ],
+ "model_id": "Qwen/Qwen2-1.5B-Instruct-AWQ"
+ },
+ {
+ "model_format": "awq",
+ "model_size_in_billions": 7,
+ "quantizations": [
+ "Int4"
+ ],
+ "model_id": "Qwen/Qwen2-7B-Instruct-AWQ"
+ },
+ {
+ "model_format": "awq",
+ "model_size_in_billions": 72,
+ "quantizations": [
+ "Int4"
+ ],
+ "model_id": "Qwen/Qwen2-72B-Instruct-AWQ"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": "0_5",
+ "quantizations": [
+ "q2_k",
+ "q3_k_m",
+ "q4_0",
+ "q4_k_m",
+ "q5_0",
+ "q5_k_m",
+ "q6_k",
+ "q8_0",
+ "fp16"
+ ],
+ "model_id": "Qwen/Qwen2-0.5B-Instruct-GGUF",
+ "model_file_name_template": "qwen2-0_5b-instruct-{quantization}.gguf"
+ }
+ ],
+ "prompt_style": {
+ "style_name": "QWEN",
+ "system_prompt": "You are a helpful assistant.",
+ "roles": [
+ "user",
+ "assistant"
+ ],
+ "intra_message_sep": "\n",
+ "stop_token_ids": [
+ 151643,
+ 151644,
+ 151645
+ ],
+ "stop": [
+ "<|endoftext|>",
+ "<|im_start|>",
+ "<|im_end|>"
+ ]
+ }
+ },
+ {
+ "version": 1,
+ "context_length": 32768,
+ "model_name": "qwen2-moe-instruct",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "chat"
+ ],
+ "model_description": "Qwen2 is the new series of Qwen large language models. ",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 14,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "Qwen/Qwen2-57B-A14B-Instruct"
+ },
+ {
+ "model_format": "gptq",
+ "model_size_in_billions": 14,
+ "quantizations": [
+ "Int4"
+ ],
+ "model_id": "Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4"
+ }
+ ],
+ "prompt_style": {
+ "style_name": "QWEN",
+ "system_prompt": "You are a helpful assistant.",
+ "roles": [
+ "user",
+ "assistant"
+ ],
+ "intra_message_sep": "\n",
+ "stop_token_ids": [
+ 151643,
+ 151644,
+ 151645
+ ],
+ "stop": [
+ "<|endoftext|>",
+ "<|im_start|>",
+ "<|im_end|>"
+ ]
+ }
+ },
+ {
+ "version": 1,
+ "context_length": 8192,
+ "model_name": "starcoder",
+ "model_lang": [
+ "en"
+ ],
+ "model_ability": [
+ "generate"
+ ],
+ "model_description": "Starcoder is an open-source Transformer based LLM that is trained on permissively licensed data from GitHub.",
+ "model_specs": [
+ {
+ "model_format": "ggmlv3",
+ "model_size_in_billions": 16,
+ "quantizations": [
+ "q4_0",
+ "q4_1",
+ "q5_0",
+ "q5_1",
+ "q8_0"
+ ],
+ "model_id": "TheBloke/starcoder-GGML",
+ "model_file_name_template": "starcoder.ggmlv3.{quantization}.bin"
+ }
+ ]
+ },
+ {
+ "version": 1,
+ "context_length": 1024,
+ "model_name": "gpt-2",
+ "model_lang": [
+ "en"
+ ],
+ "model_ability": [
+ "generate"
+ ],
+ "model_description": "GPT-2 is a Transformer-based LLM that is trained on WebTest, a 40 GB dataset of Reddit posts with 3+ upvotes.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": "1_5",
+ "quantizations": [
+ "none"
+ ],
+ "model_id": "openai-community/gpt2",
+ "model_revision": "607a30d783dfa663caf39e06633721c8d4cfcd7e"
+ }
+ ]
+ },
+ {
+ "version": 1,
+ "context_length": 8192,
+ "model_name": "internlm-7b",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "generate"
+ ],
+ "model_description": "InternLM is a Transformer-based LLM that is trained on both Chinese and English data, focusing on practical scenarios.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 7,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "internlm/internlm-7b",
+ "model_revision": "592b0efc83be3eb1cba8990c4caf41ce604b958c"
+ }
+ ]
+ },
+ {
+ "version": 1,
+ "context_length": 4096,
+ "model_name": "internlm-chat-7b",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "chat"
+ ],
+ "model_description": "Internlm-chat is a fine-tuned version of the Internlm LLM, specializing in chatting.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 7,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "internlm/internlm-chat-7b",
+ "model_revision": "d4fa2dbcbd2fa4edfa6735aa2ba0f0577fed6a62"
+ }
+ ],
+ "prompt_style": {
+ "style_name": "INTERNLM",
+ "system_prompt": "",
+ "roles": [
+ "<|User|>",
+ "<|Bot|>"
+ ],
+ "intra_message_sep": "<eoh>\n",
+ "inter_message_sep": "<eoa>\n",
+ "stop_token_ids": [
+ 1,
+ 103028
+ ],
+ "stop": [
+ "<eoa>"
+ ]
+ }
+ },
+ {
+ "version": 1,
+ "context_length": 16384,
+ "model_name": "internlm-20b",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "generate"
+ ],
+ "model_description": "Pre-trained on over 2.3T Tokens containing high-quality English, Chinese, and code data.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 20,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "internlm/internlm-20b",
+ "model_revision": "c56a72957239b490ea206ea857e86611b3f65f3a"
+ }
+ ]
+ },
+ {
+ "version": 1,
+ "context_length": 16384,
+ "model_name": "internlm-chat-20b",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "chat"
+ ],
+ "model_description": "Pre-trained on over 2.3T Tokens containing high-quality English, Chinese, and code data. The Chat version has undergone SFT and RLHF training.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 20,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "internlm/internlm-chat-20b",
+ "model_revision": "c67e80e42c4950ebae18a955c9fe138c5ceb5b10"
+ }
+ ],
+ "prompt_style": {
+ "style_name": "INTERNLM",
  "system_prompt": "",
  "roles": [
  "<|User|>",
@@ -3211,7 +3556,84 @@
  "quantizations": [
  "Int4"
  ],
- "model_id": "TheBloke/Mistral-7B-Instruct-v0.2-AWQ"
+ "model_id": "TheBloke/Mistral-7B-Instruct-v0.2-AWQ"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": 7,
+ "quantizations": [
+ "Q2_K",
+ "Q3_K_S",
+ "Q3_K_M",
+ "Q3_K_L",
+ "Q4_0",
+ "Q4_K_S",
+ "Q4_K_M",
+ "Q5_0",
+ "Q5_K_S",
+ "Q5_K_M",
+ "Q6_K",
+ "Q8_0"
+ ],
+ "model_id": "TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
+ "model_file_name_template": "mistral-7b-instruct-v0.2.{quantization}.gguf"
+ }
+ ],
+ "prompt_style": {
+ "style_name": "LLAMA2",
+ "system_prompt": "[INST] ",
+ "roles": [
+ "[INST]",
+ "[/INST]"
+ ],
+ "intra_message_sep": " ",
+ "inter_message_sep": "<s>",
+ "stop_token_ids": [
+ 2
+ ],
+ "stop": [
+ "</s>"
+ ]
+ }
+ },
+ {
+ "version": 1,
+ "context_length": 32768,
+ "model_name": "mistral-instruct-v0.3",
+ "model_lang": [
+ "en"
+ ],
+ "model_ability": [
+ "chat"
+ ],
+ "model_description": "The Mistral-7B-Instruct-v0.2 Large Language Model (LLM) is an improved instruct fine-tuned version of Mistral-7B-Instruct-v0.1.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 7,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "mistralai/Mistral-7B-Instruct-v0.3",
+ "model_revision": "83e9aa141f2e28c82232fea5325f54edf17c43de"
+ },
+ {
+ "model_format": "gptq",
+ "model_size_in_billions": 7,
+ "quantizations": [
+ "Int4"
+ ],
+ "model_id": "neuralmagic/Mistral-7B-Instruct-v0.3-GPTQ-4bit"
+ },
+ {
+ "model_format": "awq",
+ "model_size_in_billions": 7,
+ "quantizations": [
+ "Int4"
+ ],
+ "model_id": "solidrust/Mistral-7B-Instruct-v0.3-AWQ"
  },
  {
  "model_format": "ggufv2",
@@ -3221,17 +3643,16 @@
  "Q3_K_S",
  "Q3_K_M",
  "Q3_K_L",
- "Q4_0",
  "Q4_K_S",
  "Q4_K_M",
- "Q5_0",
  "Q5_K_S",
  "Q5_K_M",
  "Q6_K",
- "Q8_0"
+ "Q8_0",
+ "fp16"
  ],
- "model_id": "TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
- "model_file_name_template": "mistral-7b-instruct-v0.2.{quantization}.gguf"
+ "model_id": "MaziyarPanahi/Mistral-7B-Instruct-v0.3-GGUF",
+ "model_file_name_template": "Mistral-7B-Instruct-v0.3.{quantization}.gguf"
  }
  ],
  "prompt_style": {
@@ -3251,6 +3672,49 @@
  ]
  }
  },
+ {
+ "version": 1,
+ "context_length": 32768,
+ "model_name": "codestral-v0.1",
+ "model_lang": [
+ "en"
+ ],
+ "model_ability": [
+ "generate"
+ ],
+ "model_description": "Codestrall-22B-v0.1 is trained on a diverse dataset of 80+ programming languages, including the most popular ones, such as Python, Java, C, C++, JavaScript, and Bash",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 22,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "mistralai/Mistral-7B-Instruct-v0.2",
+ "model_revision": "9552e7b1d9b2d5bbd87a5aa7221817285dbb6366"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": 22,
+ "quantizations": [
+ "Q2_K",
+ "Q3_K_S",
+ "Q3_K_M",
+ "Q3_K_L",
+ "Q4_K_S",
+ "Q4_K_M",
+ "Q5_K_S",
+ "Q5_K_M",
+ "Q6_K",
+ "Q8_0"
+ ],
+ "model_id": "bartowski/Codestral-22B-v0.1-GGUF",
+ "model_file_name_template": "Codestral-22B-v0.1-{quantization}.gguf"
+ }
+ ]
+ },
  {
  "version": 1,
  "context_length": 8192,
@@ -3740,39 +4204,253 @@
  "8-bit",
  "none"
  ],
- "model_id": "01-ai/Yi-6B-Chat",
- "model_revision": "1c20c960895e4c3877cf478bc2df074221b81d7b"
+ "model_id": "01-ai/Yi-6B-Chat",
+ "model_revision": "1c20c960895e4c3877cf478bc2df074221b81d7b"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 34,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "01-ai/Yi-34B-Chat",
+ "model_revision": "a99ec35331cbfc9da596af7d4538fe2efecff03c"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": 34,
+ "quantizations": [
+ "Q2_K",
+ "Q3_K_L",
+ "Q3_K_M",
+ "Q3_K_S",
+ "Q4_0",
+ "Q4_K_M",
+ "Q4_K_S",
+ "Q5_0",
+ "Q5_K_M",
+ "Q5_K_S",
+ "Q6_K",
+ "Q8_0"
+ ],
+ "model_id": "TheBloke/Yi-34B-Chat-GGUF",
+ "model_file_name_template": "yi-34b-chat.{quantization}.gguf"
+ }
+ ],
+ "prompt_style": {
+ "style_name": "CHATML",
+ "system_prompt": "",
+ "roles": [
+ "<|im_start|>user",
+ "<|im_start|>assistant"
+ ],
+ "intra_message_sep": "<|im_end|>",
+ "inter_message_sep": "",
+ "stop_token_ids": [
+ 2,
+ 6,
+ 7,
+ 8
+ ],
+ "stop": [
+ "<|endoftext|>",
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|im_sep|>"
+ ]
+ }
+ },
+ {
+ "version": 1,
+ "context_length": 4096,
+ "model_name": "Yi-1.5",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "generate"
+ ],
+ "model_description": "Yi-1.5 is an upgraded version of Yi. It is continuously pre-trained on Yi with a high-quality corpus of 500B tokens and fine-tuned on 3M diverse fine-tuning samples.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 6,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "01-ai/Yi-1.5-6B",
+ "model_revision": "741a657c42d2081f777ce4c6c5572090f8b8c886"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 9,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "01-ai/Yi-1.5-9B",
+ "model_revision": "9a6839c5b9db3dbb245fb98a072bfabc242621f2"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 34,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "01-ai/Yi-1.5-34B",
+ "model_revision": "4f83007957ec3eec76d87df19ad061eb0f57b5c5"
+ }
+ ]
+ },
+ {
+ "version": 1,
+ "context_length": 4096,
+ "model_name": "Yi-1.5-chat",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "chat"
+ ],
+ "model_description": "Yi-1.5 is an upgraded version of Yi. It is continuously pre-trained on Yi with a high-quality corpus of 500B tokens and fine-tuned on 3M diverse fine-tuning samples.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 6,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "01-ai/Yi-1.5-6B-Chat",
+ "model_revision": "d68dab90947a3c869e28c9cb2806996af99a6080"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 9,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "01-ai/Yi-1.5-9B-Chat",
+ "model_revision": "1dc6e2b8dcfc12b95bede8dec67e6b6332ac64c6"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 34,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "01-ai/Yi-1.5-34B-Chat",
+ "model_revision": "fa695ee438bfcd0ec2b378fa1c7e0dea1b40393e"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": 6,
+ "quantizations": [
+ "Q3_K_L",
+ "Q4_K_M",
+ "Q5_K_M",
+ "Q6_K",
+ "Q8_0",
+ "f32"
+ ],
+ "model_id": "lmstudio-community/Yi-1.5-6B-Chat-GGUF",
+ "model_file_name_template": "Yi-1.5-6B-Chat-{quantization}.gguf"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": 9,
+ "quantizations": [
+ "Q3_K_L",
+ "Q4_K_M",
+ "Q5_K_M",
+ "Q6_K",
+ "Q8_0",
+ "f32"
+ ],
+ "model_id": "lmstudio-community/Yi-1.5-9B-Chat-GGUF",
+ "model_file_name_template": "Yi-1.5-9B-Chat-{quantization}.gguf"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": 34,
+ "quantizations": [
+ "Q2_K",
+ "Q3_K_L",
+ "Q4_K_M",
+ "Q5_K_M",
+ "Q6_K",
+ "Q8_0"
+ ],
+ "model_id": "lmstudio-community/Yi-1.5-34B-Chat-GGUF",
+ "model_file_name_template": "Yi-1.5-34B-Chat-{quantization}.gguf"
+ },
+ {
+ "model_format": "gptq",
+ "model_size_in_billions": 6,
+ "quantizations": [
+ "Int4"
+ ],
+ "model_id": "modelscope/Yi-1.5-6B-Chat-GPTQ",
+ "model_revision": "2ad3a602e64d1c79e28e6e92beced2935047367c"
+ },
+ {
+ "model_format": "gptq",
+ "model_size_in_billions": 9,
+ "quantizations": [
+ "Int4"
+ ],
+ "model_id": "modelscope/Yi-1.5-9B-Chat-GPTQ",
+ "model_revision": "76f47d16982923f7b6674c4e23ddac7c3b1d2e03"
+ },
+ {
+ "model_format": "gptq",
+ "model_size_in_billions": 34,
+ "quantizations": [
+ "Int4"
+ ],
+ "model_id": "modelscope/Yi-1.5-34B-Chat-GPTQ",
+ "model_revision": "173fb4036265b2dac1d6296a8e2fd2f652c19968"
+ },
+ {
+ "model_format": "awq",
+ "model_size_in_billions": 6,
+ "quantizations": [
+ "Int4"
+ ],
+ "model_id": "modelscope/Yi-1.5-6B-Chat-AWQ",
+ "model_revision": "23bf37f1666874e15e239422de0d3948d8735fa9"
  },
  {
- "model_format": "pytorch",
- "model_size_in_billions": 34,
+ "model_format": "awq",
+ "model_size_in_billions": 9,
  "quantizations": [
- "4-bit",
- "8-bit",
- "none"
+ "Int4"
  ],
- "model_id": "01-ai/Yi-34B-Chat",
- "model_revision": "a99ec35331cbfc9da596af7d4538fe2efecff03c"
+ "model_id": "modelscope/Yi-1.5-9B-Chat-AWQ",
+ "model_revision": "2605f388332672789eae1f422644add2901b433f"
  },
  {
- "model_format": "ggufv2",
+ "model_format": "awq",
  "model_size_in_billions": 34,
  "quantizations": [
- "Q2_K",
- "Q3_K_L",
- "Q3_K_M",
- "Q3_K_S",
- "Q4_0",
- "Q4_K_M",
- "Q4_K_S",
- "Q5_0",
- "Q5_K_M",
- "Q5_K_S",
- "Q6_K",
- "Q8_0"
+ "Int4"
  ],
- "model_id": "TheBloke/Yi-34B-Chat-GGUF",
- "model_file_name_template": "yi-34b-chat.{quantization}.gguf"
+ "model_id": "modelscope/Yi-1.5-34B-Chat-AWQ",
+ "model_revision": "26234fea6ac49d456f32f8017289021fb1087a04"
  }
  ],
  "prompt_style": {
@@ -3800,28 +4478,17 @@
  },
  {
  "version": 1,
- "context_length": 4096,
- "model_name": "Yi-1.5",
+ "context_length": 16384,
+ "model_name": "Yi-1.5-chat-16k",
  "model_lang": [
  "en",
  "zh"
  ],
  "model_ability": [
- "generate"
+ "chat"
  ],
  "model_description": "Yi-1.5 is an upgraded version of Yi. It is continuously pre-trained on Yi with a high-quality corpus of 500B tokens and fine-tuned on 3M diverse fine-tuning samples.",
  "model_specs": [
- {
- "model_format": "pytorch",
- "model_size_in_billions": 6,
- "quantizations": [
- "4-bit",
- "8-bit",
- "none"
- ],
- "model_id": "01-ai/Yi-1.5-6B",
- "model_revision": "741a657c42d2081f777ce4c6c5572090f8b8c886"
- },
  {
  "model_format": "pytorch",
  "model_size_in_billions": 9,
@@ -3830,8 +4497,8 @@
  "8-bit",
  "none"
  ],
- "model_id": "01-ai/Yi-1.5-9B",
- "model_revision": "9a6839c5b9db3dbb245fb98a072bfabc242621f2"
+ "model_id": "01-ai/Yi-1.5-9B-Chat-16K",
+ "model_revision": "551220fb24d69b6bfec5defceeb160395ce5da8d"
  },
  {
  "model_format": "pytorch",
@@ -3841,56 +4508,48 @@
  "8-bit",
  "none"
  ],
- "model_id": "01-ai/Yi-1.5-34B",
- "model_revision": "4f83007957ec3eec76d87df19ad061eb0f57b5c5"
- }
- ]
- },
- {
- "version": 1,
- "context_length": 4096,
- "model_name": "Yi-1.5-chat",
- "model_lang": [
- "en",
- "zh"
- ],
- "model_ability": [
- "chat"
- ],
- "model_description": "Yi-1.5 is an upgraded version of Yi. It is continuously pre-trained on Yi with a high-quality corpus of 500B tokens and fine-tuned on 3M diverse fine-tuning samples.",
- "model_specs": [
- {
- "model_format": "pytorch",
- "model_size_in_billions": 6,
- "quantizations": [
- "4-bit",
- "8-bit",
- "none"
- ],
- "model_id": "01-ai/Yi-1.5-6B-Chat",
- "model_revision": "d68dab90947a3c869e28c9cb2806996af99a6080"
+ "model_id": "01-ai/Yi-1.5-34B-Chat-16K",
+ "model_revision": "dfdbc67be750972bfcc1ac7ffd7fe48689c856fd"
  },
  {
- "model_format": "pytorch",
+ "model_format": "ggufv2",
  "model_size_in_billions": 9,
  "quantizations": [
- "4-bit",
- "8-bit",
- "none"
+ "Q2_K",
+ "Q3_K_L",
+ "Q3_K_M",
+ "Q3_K_S",
+ "Q4_0",
+ "Q4_1",
+ "Q4_K_M",
+ "Q4_K_S",
+ "Q5_0",
+ "Q5_1",
+ "Q5_K_M",
+ "Q5_K_S",
+ "Q6_K",
+ "Q8_0"
  ],
- "model_id": "01-ai/Yi-1.5-9B-Chat",
- "model_revision": "1dc6e2b8dcfc12b95bede8dec67e6b6332ac64c6"
+ "model_id": "QuantFactory/Yi-1.5-9B-Chat-16K-GGUF",
+ "model_file_name_template": "Yi-1.5-9B-Chat-16K.{quantization}.gguf"
  },
  {
- "model_format": "pytorch",
+ "model_format": "ggufv2",
  "model_size_in_billions": 34,
  "quantizations": [
- "4-bit",
- "8-bit",
- "none"
+ "Q2_K",
+ "Q3_K_L",
+ "Q3_K_M",
+ "Q3_K_S",
+ "Q4_K_M",
+ "Q4_K_S",
+ "Q5_K_M",
+ "Q5_K_S",
+ "Q6_K",
+ "Q8_0"
  ],
- "model_id": "01-ai/Yi-1.5-34B-Chat",
- "model_revision": "fa695ee438bfcd0ec2b378fa1c7e0dea1b40393e"
+ "model_id": "bartowski/Yi-1.5-34B-Chat-16K-GGUF",
+ "model_file_name_template": "Yi-1.5-34B-Chat-16K-{quantization}.gguf"
  }
  ],
  "prompt_style": {
@@ -5063,6 +5722,48 @@
  ]
  }
  },
+ {
+ "version":1,
+ "context_length":2048,
+ "model_name":"MiniCPM-Llama3-V-2_5",
+ "model_lang":[
+ "en",
+ "zh"
+ ],
+ "model_ability":[
+ "chat",
+ "vision"
+ ],
+ "model_description":"MiniCPM-Llama3-V 2.5 is the latest model in the MiniCPM-V series. The model is built on SigLip-400M and Llama3-8B-Instruct with a total of 8B parameters.",
+ "model_specs":[
+ {
+ "model_format":"pytorch",
+ "model_size_in_billions":8,
+ "quantizations":[
+ "none"
+ ],
+ "model_id":"openbmb/MiniCPM-Llama3-V-2_5",
+ "model_revision":"285a637ba8a30a0660dfcccad16f9a864f75abfd"
+ },
+ {
+ "model_format":"pytorch",
+ "model_size_in_billions":8,
+ "quantizations":[
+ "int4"
+ ],
+ "model_id":"openbmb/MiniCPM-Llama3-V-2_5-{quantization}",
+ "model_revision":"f92aff28552de35de3be204e8fe292dd4824e544"
+ }
+ ],
+ "prompt_style":{
+ "style_name":"OmniLMM",
+ "system_prompt":"The role of first msg should be user",
+ "roles":[
+ "user",
+ "assistant"
+ ]
+ }
+ },
  {
  "version": 1,
  "context_length": 4096,
@@ -6009,23 +6710,32 @@
  ],
  "model_description": "InternVL 1.5 is an open-source multimodal large language model (MLLM) to bridge the capability gap between open-source and proprietary commercial models in multimodal understanding. ",
  "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 2,
+ "quantizations": [
+ "none"
+ ],
+ "model_id": "OpenGVLab/Mini-InternVL-Chat-2B-V1-5",
+ "model_revision": "ce3f67acff17281bacbf4b156f402a0580fb9605"
+ },
  {
- "model_format": "pytorch",
- "model_size_in_billions": 26,
- "quantizations": [
- "none"
- ],
- "model_id": "OpenGVLab/InternVL-Chat-V1-5",
- "model_revision": "e822119e5806946ce128043023a73d715ecabf8d"
+ "model_format": "pytorch",
+ "model_size_in_billions": 26,
+ "quantizations": [
+ "none"
+ ],
+ "model_id": "OpenGVLab/InternVL-Chat-V1-5",
+ "model_revision": "e822119e5806946ce128043023a73d715ecabf8d"
  },
  {
- "model_format": "pytorch",
- "model_size_in_billions": 26,
- "quantizations": [
- "Int8"
- ],
- "model_id": "OpenGVLab/InternVL-Chat-V1-5-{quantization}",
- "model_revision": "acaaed06937c603ab04f084216ecb0268160f538"
+ "model_format": "pytorch",
+ "model_size_in_billions": 26,
+ "quantizations": [
+ "Int8"
+ ],
+ "model_id": "OpenGVLab/InternVL-Chat-V1-5-{quantization}",
+ "model_revision": "acaaed06937c603ab04f084216ecb0268160f538"
  }
  ],
  "prompt_style": {
@@ -6043,11 +6753,11 @@
  "<|im_end|>"
  ]
  }
- },
+ },
  {
  "version": 1,
- "context_length": 32768,
- "model_name": "mini-internvl-chat",
+ "context_length": 8192,
+ "model_name": "cogvlm2",
  "model_lang": [
  "en",
  "zh"
6056
6766
  "chat",
6057
6767
  "vision"
6058
6768
  ],
6059
- "model_description": "InternVL 1.5 is an open-source multimodal large language model (MLLM) to bridge the capability gap between open-source and proprietary commercial models in multimodal understanding. ",
6769
+ "model_description": "CogVLM2 have achieved good results in many lists compared to the previous generation of CogVLM open source models. Its excellent performance can compete with some non-open source models.",
6060
6770
  "model_specs": [
6061
- {
6062
- "model_format": "pytorch",
6063
- "model_size_in_billions": 2,
6064
- "quantizations": [
6065
- "none"
6066
- ],
6067
- "model_id": "OpenGVLab/Mini-InternVL-Chat-2B-V1-5",
6068
- "model_revision": "ce3f67acff17281bacbf4b156f402a0580fb9605"
6069
- }
6771
+ {
6772
+ "model_format": "pytorch",
6773
+ "model_size_in_billions": 20,
6774
+ "quantizations": [
6775
+ "none"
6776
+ ],
6777
+ "model_id": "THUDM/cogvlm2-llama3-chinese-chat-19B",
6778
+ "model_revision": "d88b352bce5ee58a289b1ac8328553eb31efa2ef"
6779
+ },
6780
+ {
6781
+ "model_format": "pytorch",
6782
+ "model_size_in_billions": 20,
6783
+ "quantizations": [
6784
+ "int4"
6785
+ ],
6786
+ "model_id": "THUDM/cogvlm2-llama3-chinese-chat-19B-{quantization}",
6787
+ "model_revision": "7863e362174f4718c2fe9cba4befd0b580a3194f"
6788
+ }
6070
6789
  ],
6071
6790
  "prompt_style": {
6072
- "style_name": "INTERNLM2",
6073
- "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).",
6074
- "roles": [
6075
- "<|im_start|>user",
6076
- "<|im_start|>assistant"
6791
+ "style_name": "LLAMA3",
6792
+ "system_prompt": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.",
6793
+ "roles": [
6794
+ "user",
6795
+ "assistant"
6796
+ ],
6797
+ "intra_message_sep": "\n\n",
6798
+ "inter_message_sep": "<|eot_id|>",
6799
+ "stop_token_ids": [
6800
+ 128001,
6801
+ 128009
6802
+ ],
6803
+ "stop": [
6804
+ "<|end_of_text|>",
6805
+ "<|eot_id|>"
6806
+ ]
6807
+ }
6808
+ },
6809
+ {
6810
+ "version": 1,
6811
+ "context_length": 8192,
6812
+ "model_name": "telechat",
6813
+ "model_lang": [
6814
+ "en",
6815
+ "zh"
6816
+ ],
6817
+ "model_ability": [
6818
+ "chat"
6819
+ ],
6820
+ "model_description": "The TeleChat is a large language model developed and trained by China Telecom Artificial Intelligence Technology Co., LTD. The 7B model base is trained with 1.5 trillion Tokens and 3 trillion Tokens and Chinese high-quality corpus.",
6821
+ "model_specs": [
6822
+ {
6823
+ "model_format": "pytorch",
6824
+ "model_size_in_billions": 7,
6825
+ "quantizations": [
6826
+ "4-bit",
6827
+ "8-bit",
6828
+ "none"
6077
6829
  ],
6078
- "intra_message_sep": "<|im_end|>",
6079
- "stop_token_ids": [
6080
- 92542
6830
+ "model_id": "Tele-AI/telechat-7B"
6831
+ },
6832
+ {
6833
+ "model_format": "gptq",
6834
+ "model_size_in_billions": 7,
6835
+ "quantizations": [
6836
+ "int4",
6837
+ "int8"
6081
6838
  ],
6082
- "stop": [
6083
- "<|im_end|>"
6084
- ]
6839
+ "model_id": "Tele-AI/telechat-7B-{quantization}"
6840
+ },
6841
+ {
6842
+ "model_format": "pytorch",
6843
+ "model_size_in_billions": 12,
6844
+ "quantizations": [
6845
+ "4-bit",
6846
+ "8-bit",
6847
+ "none"
6848
+ ],
6849
+ "model_id": "Tele-AI/TeleChat-12B"
6850
+ },
6851
+ {
6852
+ "model_format": "gptq",
6853
+ "model_size_in_billions": 12,
6854
+ "quantizations": [
6855
+ "int4",
6856
+ "int8"
6857
+ ],
6858
+ "model_id": "Tele-AI/TeleChat-12B-{quantization}"
6859
+ },
6860
+ {
6861
+ "model_format": "pytorch",
6862
+ "model_size_in_billions": 52,
6863
+ "quantizations": [
6864
+ "4-bit",
6865
+ "8-bit",
6866
+ "none"
6867
+ ],
6868
+ "model_id": "Tele-AI/TeleChat-52B"
6869
+ }
6870
+ ],
6871
+ "prompt_style": {
6872
+ "style_name": "NO_COLON_TWO",
6873
+ "system_prompt": "You are a helpful assistant.",
6874
+ "roles": [
6875
+ "<_user>",
6876
+ "<_bot>"
6877
+ ],
6878
+ "intra_message_sep": "",
6879
+ "inter_message_sep": "",
6880
+ "stop": [
6881
+ "<_end>",
6882
+ "<_start>"
6883
+ ],
6884
+ "stop_token_ids": [
6885
+ 160133,
6886
+ 160132
6887
+ ]
6085
6888
  }
6086
- }
6889
+ }
6087
6890
  ]
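The final hunks rename the tail of mini-internvl-chat into the new cogvlm2 entry (backed by the new `cogvlm2.py` in the file list) and add telechat with pytorch and GPTQ specs at 7B, 12B, and 52B. Telechat's quantization labels are lowercase ("int4"/"int8"), unlike the Qwen2 and Yi specs above, and the value passed at launch has to match the registry exactly. A hedged closing sketch, under the same endpoint and engine-name assumptions as the earlier examples:

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")
    uid = client.launch_model(
        model_name="telechat",
        model_engine="transformers",  # assumption
        model_format="gptq",
        model_size_in_billions=12,
        quantization="int4",          # lowercase, exactly as this entry spells it
    )
    print(client.get_model(uid).chat("Hello"))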