xinference 0.11.3__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registries; it is provided for informational purposes only.

Potentially problematic release: the registry flags this version of xinference as possibly problematic.

@@ -522,6 +522,142 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 131072,
+    "model_name": "glm4-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "tools"
+    ],
+    "model_description": "GLM4 is the open source version of the latest generation of pre-trained models in the GLM-4 series launched by Zhipu AI.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "ZhipuAI/glm-4-9b-chat",
+        "model_revision": "master"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "CHATGLM3",
+      "system_prompt": "",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "stop_token_ids": [
+        151329,
+        151336,
+        151338
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|user|>",
+        "<|observation|>"
+      ]
+    }
+  },
+  {
+    "version": 1,
+    "context_length": 1048576,
+    "model_name": "glm4-chat-1m",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "tools"
+    ],
+    "model_description": "GLM4 is the open source version of the latest generation of pre-trained models in the GLM-4 series launched by Zhipu AI.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "ZhipuAI/glm-4-9b-chat-1m",
+        "model_revision": "master"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "CHATGLM3",
+      "system_prompt": "",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "stop_token_ids": [
+        151329,
+        151336,
+        151338
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|user|>",
+        "<|observation|>"
+      ]
+    }
+  },
+  {
+    "version": 1,
+    "context_length": 8192,
+    "model_name": "glm-4v",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "vision"
+    ],
+    "model_description": "GLM4 is the open source version of the latest generation of pre-trained models in the GLM-4 series launched by Zhipu AI.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "ZhipuAI/glm-4v-9b",
+        "model_revision": "master"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "CHATGLM3",
+      "system_prompt": "",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "stop_token_ids": [
+        151329,
+        151336,
+        151338
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|user|>",
+        "<|observation|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 2048,
@@ -2648,6 +2784,233 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "qwen2-instruct",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "tools"
+    ],
+    "model_description": "Qwen2 is the new series of Qwen large language models",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/Qwen2-0.5B-Instruct",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/Qwen2-1.5B-Instruct",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/Qwen2-7B-Instruct",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 72,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/Qwen2-72B-Instruct",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "qwen/Qwen2-0.5B-Instruct-GPTQ-{quantization}",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "qwen/Qwen2-1.5B-Instruct-GPTQ-{quantization}",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "qwen/Qwen2-7B-Instruct-GPTQ-{quantization}",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 72,
+        "quantizations": [
+          "Int4",
+          "Int8"
+        ],
+        "model_id": "qwen/Qwen2-72B-Instruct-GPTQ-{quantization}",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/Qwen2-0.5B-Instruct-AWQ",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/Qwen2-1.5B-Instruct-AWQ",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/Qwen2-7B-Instruct-AWQ",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 72,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/Qwen2-72B-Instruct-AWQ",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "q2_k",
+          "q3_k_m",
+          "q4_0",
+          "q4_k_m",
+          "q5_0",
+          "q5_k_m",
+          "q6_k",
+          "q8_0",
+          "fp16"
+        ],
+        "model_id": "qwen/Qwen2-0.5B-Instruct-GGUF",
+        "model_file_name_template": "qwen2-0_5b-instruct-{quantization}.gguf",
+        "model_hub": "modelscope"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "QWEN",
+      "system_prompt": "You are a helpful assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n",
+      "stop_token_ids": [
+        151643,
+        151644,
+        151645
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|im_start|>",
+        "<|im_end|>"
+      ]
+    }
+  },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "qwen2-moe-instruct",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Qwen2 is the new series of Qwen large language models. ",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/Qwen2-57B-A14B-Instruct",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4",
+        "model_hub": "modelscope"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "QWEN",
+      "system_prompt": "You are a helpful assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n",
+      "stop_token_ids": [
+        151643,
+        151644,
+        151645
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|im_start|>",
+        "<|im_end|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 4096,
@@ -3236,7 +3599,7 @@
       "chat",
       "vision"
     ],
-    "model_description":"mniLMM is a family of open-source large multimodal models (LMMs) adept at vision & language modeling.",
+    "model_description":"OmniLMM is a family of open-source large multimodal models (LMMs) adept at vision & language modeling.",
     "model_specs":[
       {
         "model_format":"pytorch",
@@ -3468,6 +3831,50 @@
       ]
     }
   },
+  {
+    "version":1,
+    "context_length":2048,
+    "model_name":"MiniCPM-Llama3-V-2_5",
+    "model_lang":[
+      "en",
+      "zh"
+    ],
+    "model_ability":[
+      "chat",
+      "vision"
+    ],
+    "model_description":"MiniCPM-Llama3-V 2.5 is the latest model in the MiniCPM-V series. The model is built on SigLip-400M and Llama3-8B-Instruct with a total of 8B parameters.",
+    "model_specs":[
+      {
+        "model_format":"pytorch",
+        "model_size_in_billions":8,
+        "quantizations":[
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id":"OpenBMB/MiniCPM-Llama3-V-2_5",
+        "model_revision":"master"
+      },
+      {
+        "model_format":"pytorch",
+        "model_size_in_billions":8,
+        "quantizations":[
+          "int4"
+        ],
+        "model_hub": "modelscope",
+        "model_id":"OpenBMB/MiniCPM-Llama3-V-2_5-{quantization}",
+        "model_revision":"master"
+      }
+    ],
+    "prompt_style":{
+      "style_name":"OmniLMM",
+      "system_prompt":"The role of first msg should be user",
+      "roles":[
+        "user",
+        "assistant"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 2048,
@@ -3860,7 +4267,7 @@
         "<|im_end|>"
       ]
     }
-  },
+  },
   {
     "version": 1,
     "context_length": 8192,
@@ -82,7 +82,8 @@ class ChatglmPytorchChatModel(PytorchChatModel):
     ) -> bool:
         if llm_spec.model_format != "pytorch":
             return False
-        if "chatglm" not in llm_family.model_name:
+        model_family = llm_family.model_family or llm_family.model_name
+        if "chatglm" not in model_family and "glm4" not in model_family:
             return False
         if "chat" not in llm_family.model_ability:
             return False
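
The matcher now prefers llm_family.model_family and falls back to model_name, and it accepts glm4 families as well, which is what routes the glm4-chat entries added above to ChatglmPytorchChatModel. A self-contained sketch of the predicate's behavior (the sample names below are hypothetical):

from typing import Optional

def matches_glm(model_name: str, model_family: Optional[str]) -> bool:
    # Mirrors the diff: prefer the declared family, fall back to the name.
    family = model_family or model_name
    return "chatglm" in family or "glm4" in family

print(matches_glm("glm4-chat", None))        # True  (built-in glm4 entry)
print(matches_glm("my-glm", "glm4"))         # True  (custom name, glm4 family)
print(matches_glm("qwen2-instruct", None))   # False
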
@@ -30,6 +30,7 @@ from ....types import (
     ChatCompletionMessage,
     Completion,
     CompletionChoice,
+    CompletionChunk,
     CompletionUsage,
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1
@@ -183,10 +184,7 @@ class CogVLM2Model(PytorchChatModel):
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         system_prompt = system_prompt if system_prompt else ""
-        if generate_config and generate_config.get("stream"):
-            raise Exception(
-                f"Chat with model {self.model_family.model_name} does not support stream."
-            )
+        stream = generate_config.get("stream", False) if generate_config else False

         sanitized_config = {
             "pad_token_id": 128002,
@@ -234,24 +232,85 @@
             if image is not None
             else None,
         }
-        with torch.no_grad():
-            outputs = self._model.generate(**inputs, **sanitized_config)
-            outputs = outputs[:, inputs["input_ids"].shape[1] :]
-            response = self._tokenizer.decode(outputs[0])
-            response = response.split("<|end_of_text|>")[0]

-        chunk = Completion(
-            id=str(uuid.uuid1()),
+        if stream:
+            it = self._streaming_chat_response(inputs, sanitized_config)
+            return self._to_chat_completion_chunks(it)
+        else:
+            with torch.no_grad():
+                outputs = self._model.generate(**inputs, **sanitized_config)
+                outputs = outputs[:, inputs["input_ids"].shape[1] :]
+                response = self._tokenizer.decode(outputs[0])
+                response = response.split("<|end_of_text|>")[0]
+
+            chunk = Completion(
+                id=str(uuid.uuid1()),
+                object="text_completion",
+                created=int(time.time()),
+                model=self.model_uid,
+                choices=[
+                    CompletionChoice(
+                        index=0, text=response, finish_reason="stop", logprobs=None
+                    )
+                ],
+                usage=CompletionUsage(
+                    prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
+                ),
+            )
+            return self._to_chat_completion(chunk)
+
+    def _streaming_chat_response(
+        self, inputs: Dict, config: Dict
+    ) -> Iterator[CompletionChunk]:
+        from threading import Thread
+
+        from transformers import TextIteratorStreamer
+
+        streamer = TextIteratorStreamer(
+            self._tokenizer, skip_prompt=True, skip_special_tokens=True
+        )
+        generation_kwargs = {
+            "input_ids": inputs["input_ids"],
+            "attention_mask": inputs["attention_mask"],
+            "token_type_ids": inputs["token_type_ids"],
+            "images": inputs["images"],
+            "max_new_tokens": config["max_new_tokens"],
+            "pad_token_id": config["pad_token_id"],
+            "streamer": streamer,
+        }
+
+        thread = Thread(target=self._model.generate, kwargs=generation_kwargs)
+        thread.start()
+
+        completion_id = str(uuid.uuid1())
+        for new_text in streamer:
+            chunk = CompletionChunk(
+                id=completion_id,
+                object="text_completion",
+                created=int(time.time()),
+                model=self.model_uid,
+                choices=[
+                    CompletionChoice(
+                        index=0, text=new_text, finish_reason=None, logprobs=None
+                    )
+                ],
+                usage=CompletionUsage(
+                    prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
+                ),
+            )
+            yield chunk
+
+        completion_choice = CompletionChoice(
+            text="", index=0, logprobs=None, finish_reason="stop"
+        )
+        chunk = CompletionChunk(
+            id=completion_id,
             object="text_completion",
             created=int(time.time()),
             model=self.model_uid,
-            choices=[
-                CompletionChoice(
-                    index=0, text=response, finish_reason="stop", logprobs=None
-                )
-            ],
+            choices=[completion_choice],
             usage=CompletionUsage(
                 prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
             ),
         )
-        return self._to_chat_completion(chunk)
+        yield chunk
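
The new _streaming_chat_response follows the standard transformers recipe: run generate on a worker thread and drain a TextIteratorStreamer on the calling thread. A standalone sketch of that recipe with a small stand-in model (gpt2 here is arbitrary and not part of the diff):

from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
inputs = tok("The quick brown fox", return_tensors="pt")
streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)

# generate() blocks, so it runs on a worker thread while we consume the stream.
thread = Thread(
    target=model.generate,
    kwargs={**inputs, "max_new_tokens": 32, "streamer": streamer},
)
thread.start()
for piece in streamer:  # yields decoded text incrementally, like the chunks above
    print(piece, end="", flush=True)
thread.join()

The trailing empty chunk with finish_reason="stop" emitted by the diff's generator mirrors the OpenAI streaming convention for signalling the end of a stream.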