PyPI - xinference - Versions diffs - 0.13.0__py3-none-any.whl → 0.13.2__py3-none-any.whl - Mend

xinference 0.13.0__py3-none-any.whl → 0.13.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of xinference might be problematic. Click here for more details.

Files changed (70) hide show

xinference/model/image/model_spec.json CHANGED Viewed

@@ -92,5 +92,19 @@
         "model_revision": "62134b9d8e703b5d6f74f1534457287a8bba77ef"
       }
     ]
+  },
+  {
+    "model_name": "stable-diffusion-inpainting",
+    "model_family": "stable_diffusion",
+    "model_id": "runwayml/stable-diffusion-inpainting",
+    "model_revision": "51388a731f57604945fddd703ecb5c50e8e7b49d",
+    "ability": "inpainting"
+  },
+  {
+    "model_name": "stable-diffusion-2-inpainting",
+    "model_family": "stable_diffusion",
+    "model_id": "stabilityai/stable-diffusion-2-inpainting",
+    "model_revision": "81a84f49b15956b60b4272a405ad3daef3da4590",
+    "ability": "inpainting"
   }
 ]

xinference/model/image/stable_diffusion/core.py CHANGED Viewed

@@ -16,6 +16,7 @@ import base64
 import logging
 import os
 import re
+import sys
 import time
 import uuid
 from concurrent.futures import ThreadPoolExecutor
@@ -39,6 +40,7 @@ class DiffusionModel:
         lora_model: Optional[List[LoRA]] = None,
         lora_load_kwargs: Optional[Dict] = None,
         lora_fuse_kwargs: Optional[Dict] = None,
+        ability: Optional[str] = None,
         **kwargs,
     ):
         self._model_uid = model_uid
@@ -48,6 +50,7 @@ class DiffusionModel:
         self._lora_model = lora_model
         self._lora_load_kwargs = lora_load_kwargs or {}
         self._lora_fuse_kwargs = lora_fuse_kwargs or {}
+        self._ability = ability
         self._kwargs = kwargs
     def _apply_lora(self):
@@ -64,8 +67,14 @@ class DiffusionModel:
             logger.info(f"Successfully loaded the LoRA for model {self._model_uid}.")
     def load(self):
-        # import torch
-        from diffusers import AutoPipelineForText2Image
+        import torch
+        if self._ability in [None, "text2image", "image2image"]:
+            from diffusers import AutoPipelineForText2Image as AutoPipelineModel
+        elif self._ability == "inpainting":
+            from diffusers import AutoPipelineForInpainting as AutoPipelineModel
+        else:
+            raise ValueError(f"Unknown ability: {self._ability}")
         controlnet = self._kwargs.get("controlnet")
         if controlnet is not None:
@@ -74,12 +83,16 @@ class DiffusionModel:
             logger.debug("Loading controlnet %s", controlnet)
             self._kwargs["controlnet"] = ControlNetModel.from_pretrained(controlnet)
-        self._model = AutoPipelineForText2Image.from_pretrained(
+        torch_dtype = self._kwargs.get("torch_dtype")
+        if sys.platform != "darwin" and torch_dtype is None:
+            # The following params crashes on Mac M2
+            self._kwargs["torch_dtype"] = torch.float16
+            self._kwargs["use_safetensors"] = True
+        logger.debug("Loading model %s", AutoPipelineModel)
+        self._model = AutoPipelineModel.from_pretrained(
             self._model_path,
             **self._kwargs,
-            # The following params crashes on Mac M2
-            # torch_dtype=torch.float16,
-            # use_safetensors=True,
         )
         self._model = move_model_to_available_device(self._model)
         # Recommended if your computer has < 64 GB of RAM
@@ -174,3 +187,27 @@ class DiffusionModel:
             response_format=response_format,
             **kwargs,
         )
+    def inpainting(
+        self,
+        image: bytes,
+        mask_image: bytes,
+        prompt: Optional[Union[str, List[str]]] = None,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        n: int = 1,
+        size: str = "1024*1024",
+        response_format: str = "url",
+        **kwargs,
+    ):
+        width, height = map(int, re.split(r"[^\d]+", size))
+        return self._call_model(
+            image=image,
+            mask_image=mask_image,
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            height=height,
+            width=width,
+            num_images_per_prompt=n,
+            response_format=response_format,
+            **kwargs,
+        )

xinference/model/llm/__init__.py CHANGED Viewed

@@ -112,7 +112,6 @@ def generate_engine_config_by_model_family(model_family):
 def _install():
-    from .ggml.chatglm import ChatglmCppChatModel
     from .ggml.llamacpp import LlamaCppChatModel, LlamaCppModel
     from .mlx.core import MLXChatModel, MLXModel
     from .pytorch.baichuan import BaichuanPytorchChatModel
@@ -143,7 +142,6 @@ def _install():
     # register llm classes.
     LLAMA_CLASSES.extend(
         [
-            ChatglmCppChatModel,
             LlamaCppChatModel,
             LlamaCppModel,
         ]

xinference/model/llm/core.py CHANGED Viewed

@@ -20,7 +20,7 @@ import platform
 from abc import abstractmethod
 from collections import defaultdict
 from functools import lru_cache
-from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union
 from ...core.utils import parse_replica_model_uid
 from ...types import PeftModelConfig
@@ -193,6 +193,7 @@ def create_llm_model_instance(
     model_size_in_billions: Optional[Union[int, str]] = None,
     quantization: Optional[str] = None,
     peft_model_config: Optional[PeftModelConfig] = None,
+    download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None,
     **kwargs,
 ) -> Tuple[LLM, LLMDescription]:
     from .llm_family import cache, check_engine_by_spec_parameters, match_llm
@@ -200,7 +201,7 @@ def create_llm_model_instance(
     if model_engine is None:
         raise ValueError("model_engine is required for LLM model")
     match_result = match_llm(
-        model_name, model_format, model_size_in_billions, quantization
+        model_name, model_format, model_size_in_billions, quantization, download_hub
     )
     if not match_result:

xinference/model/llm/ggml/llamacpp.py CHANGED Viewed

@@ -25,7 +25,6 @@ from ....types import (
     CompletionChunk,
     CompletionUsage,
     CreateCompletionLlamaCpp,
-    Embedding,
     LlamaCppGenerateConfig,
     LlamaCppModelConfig,
 )
@@ -65,7 +64,6 @@ class LlamaCppModel(LLM):
         if self.model_family.context_length:
             llamacpp_model_config.setdefault("n_ctx", self.model_family.context_length)
-        llamacpp_model_config.setdefault("embedding", True)
         llamacpp_model_config.setdefault("use_mmap", False)
         llamacpp_model_config.setdefault("use_mlock", True)
@@ -185,7 +183,7 @@ class LlamaCppModel(LLM):
     ) -> bool:
         if llm_spec.model_format not in ["ggmlv3", "ggufv2"]:
             return False
-        if "chatglm" in llm_family.model_name or "qwen" in llm_family.model_name:
+        if "qwen" in llm_family.model_name:
             return False
         if "generate" not in llm_family.model_ability:
             return False
@@ -261,11 +259,6 @@ class LlamaCppModel(LLM):
         else:
             return generator_wrapper(prompt, generate_config)
-    def create_embedding(self, input: Union[str, List[str]]) -> Embedding:
-        assert self._llm is not None
-        embedding = self._llm.create_embedding(input)
-        return embedding
 class LlamaCppChatModel(LlamaCppModel, ChatModelMixin):
     def __init__(
@@ -292,8 +285,6 @@ class LlamaCppChatModel(LlamaCppModel, ChatModelMixin):
     ) -> bool:
         if llm_spec.model_format not in ["ggmlv3", "ggufv2"]:
             return False
-        if "chatglm" in llm_family.model_name:
-            return False
         if "chat" not in llm_family.model_ability:
             return False
         return True

xinference/model/llm/llm_family.json CHANGED Viewed

@@ -574,19 +574,6 @@
     ],
     "model_description": "ChatGLM is an open-source General Language Model (GLM) based LLM trained on both Chinese and English data.",
     "model_specs": [
-      {
-        "model_format": "ggmlv3",
-        "model_size_in_billions": 6,
-        "quantizations": [
-          "q4_0",
-          "q4_1",
-          "q5_0",
-          "q5_1",
-          "q8_0"
-        ],
-        "model_id": "Xorbits/chatglm-6B-GGML",
-        "model_file_name_template": "chatglm-ggml-{quantization}.bin"
-      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 6,
@@ -622,19 +609,6 @@
     ],
     "model_description": "ChatGLM2 is the second generation of ChatGLM, still open-source and trained on Chinese and English data.",
     "model_specs": [
-      {
-        "model_format": "ggmlv3",
-        "model_size_in_billions": 6,
-        "quantizations": [
-          "q4_0",
-          "q4_1",
-          "q5_0",
-          "q5_1",
-          "q8_0"
-        ],
-        "model_id": "Xorbits/chatglm2-6B-GGML",
-        "model_file_name_template": "chatglm2-ggml-{quantization}.bin"
-      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 6,
@@ -706,15 +680,6 @@
     ],
     "model_description": "ChatGLM3 is the third generation of ChatGLM, still open-source and trained on Chinese and English data.",
     "model_specs": [
-      {
-        "model_format": "ggmlv3",
-        "model_size_in_billions": 6,
-        "quantizations": [
-          "q4_0"
-        ],
-        "model_id": "Xorbits/chatglm3-6B-GGML",
-        "model_file_name_template": "chatglm3-ggml-{quantization}.bin"
-      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 6,
@@ -855,6 +820,32 @@
         ],
         "model_id": "THUDM/glm-4-9b-chat",
         "model_revision": "b84dc74294ccd507a3d78bde8aebf628221af9bd"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "Q2_K",
+          "IQ3_XS",
+          "IQ3_S",
+          "IQ3_M",
+          "Q3_K_S",
+          "Q3_K_L",
+          "Q3_K",
+          "IQ4_XS",
+          "IQ4_NL",
+          "Q4_K_S",
+          "Q4_K",
+          "Q5_K_S",
+          "Q5_K",
+          "Q6_K",
+          "Q8_0",
+          "BF16",
+          "FP16"
+        ],
+        "model_file_name_template": "glm-4-9b-chat.{quantization}.gguf",
+        "model_id": "legraphista/glm-4-9b-chat-GGUF",
+        "model_revision": "0155a14edf0176863e9a003cdd78ce599e4d62c0"
       }
     ],
     "prompt_style": {
@@ -900,6 +891,32 @@
         ],
         "model_id": "THUDM/glm-4-9b-chat-1m",
         "model_revision": "715ddbe91082f976ff6a4ca06d59e5bbff6c3642"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "Q2_K",
+          "IQ3_XS",
+          "IQ3_S",
+          "IQ3_M",
+          "Q3_K_S",
+          "Q3_K_L",
+          "Q3_K",
+          "IQ4_XS",
+          "IQ4_NL",
+          "Q4_K_S",
+          "Q4_K",
+          "Q5_K_S",
+          "Q5_K",
+          "Q6_K",
+          "Q8_0",
+          "BF16",
+          "FP16"
+        ],
+        "model_file_name_template": "glm-4-9b-chat-1m.{quantization}.gguf",
+        "model_id": "legraphista/glm-4-9b-chat-1m-GGUF",
+        "model_revision": "782e28bd5eee3c514c07108da15e0b5e06dcf776"
       }
     ],
     "prompt_style": {
@@ -966,6 +983,65 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 131072,
+    "model_name": "codegeex4",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "the open-source version of the latest CodeGeeX4 model series",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "THUDM/codegeex4-all-9b",
+        "model_revision": "8c4ec1d2f2888412640825a7aa23355939a8f4c6"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "IQ2_M",
+          "IQ3_M",
+          "Q4_K_M",
+          "Q5_K_M",
+          "Q6_K_L",
+          "Q8_0"
+        ],
+        "model_file_name_template": "codegeex4-all-9b-{quantization}.gguf",
+        "model_id": "THUDM/codegeex4-all-9b-GGUF",
+        "model_revision": "6a04071c54c943949826d4815ee00717ed8cf153"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "CHATGLM3",
+      "system_prompt": "",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "stop_token_ids": [
+        151329,
+        151336,
+        151338
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|user|>",
+        "<|observation|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 2048,
@@ -5774,7 +5850,7 @@
   },
   {
     "version": 1,
-    "context_length": 204800,
+    "context_length": 32768,
     "model_name": "internlm2-chat",
     "model_lang": [
       "en",
@@ -5822,6 +5898,140 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "internlm2.5-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "InternLM2.5 series of the InternLM model.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "internlm/internlm2_5-7b-chat",
+        "model_revision": "9dc8536a922ab4954726aad1b37fa199004a291a"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "ModelCloud/internlm-2.5-7b-chat-gptq-4bit",
+        "model_revision": "2e2dda735c326544921a4035bbeb6c6e316a8254"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "q2_k",
+          "q3_k_m",
+          "q4_0",
+          "q4_k_m",
+          "q5_0",
+          "q5_k_m",
+          "q6_k",
+          "q8_0",
+          "fp16"
+        ],
+        "model_id": "internlm/internlm2_5-7b-chat-gguf",
+        "model_file_name_template": "internlm2_5-7b-chat-{quantization}.gguf"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "INTERNLM2",
+      "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).",
+      "roles": [
+        "<|im_start|>user",
+        "<|im_start|>assistant"
+      ],
+      "intra_message_sep": "<|im_end|>",
+      "stop_token_ids": [
+        2,
+        92542
+      ],
+      "stop": [
+        "</s>",
+        "<|im_end|>"
+      ]
+    }
+  },
+  {
+    "version": 1,
+    "context_length": 262144,
+    "model_name": "internlm2.5-chat-1m",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "InternLM2.5 series of the InternLM model supports 1M long-context",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "internlm/internlm2_5-7b-chat-1m",
+        "model_revision": "8d1a709a04d71440ef3df6ebbe204672f411c8b6"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "ModelCloud/internlm-2.5-7b-chat-1m-gptq-4bit",
+        "model_revision": "022e59cb30f03b271d56178478acb038b2b9b58c"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "q2_k",
+          "q3_k_m",
+          "q4_0",
+          "q4_k_m",
+          "q5_0",
+          "q5_k_m",
+          "q6_k",
+          "q8_0",
+          "fp16"
+        ],
+        "model_id": "internlm/internlm2_5-7b-chat-1m-gguf",
+        "model_file_name_template": "internlm2_5-7b-chat-1m-{quantization}.gguf"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "INTERNLM2",
+      "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).",
+      "roles": [
+        "<|im_start|>user",
+        "<|im_start|>assistant"
+      ],
+      "intra_message_sep": "<|im_end|>",
+      "stop_token_ids": [
+        2,
+        92542
+      ],
+      "stop": [
+        "</s>",
+        "<|im_end|>"
+      ]
+    }
+  },
   {
     "version":1,
     "context_length":2048,
@@ -6175,6 +6385,52 @@
         ],
         "model_id": "google/gemma-2-27b-it"
       },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "Q2_K",
+          "Q2_K_L",
+          "Q3_K_L",
+          "Q3_K_M",
+          "Q3_K_S",
+          "Q4_K_L",
+          "Q4_K_M",
+          "Q4_K_S",
+          "Q5_K_L",
+          "Q5_K_M",
+          "Q5_K_S",
+          "Q6_K",
+          "Q6_K_L",
+          "Q8_0",
+          "f32"
+        ],
+        "model_id": "bartowski/gemma-2-9b-it-GGUF",
+        "model_file_name_template": "gemma-2-9b-it-{quantization}.gguf"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 27,
+        "quantizations": [
+          "Q2_K",
+          "Q2_K_L",
+          "Q3_K_L",
+          "Q3_K_M",
+          "Q3_K_S",
+          "Q4_K_L",
+          "Q4_K_M",
+          "Q4_K_S",
+          "Q5_K_L",
+          "Q5_K_M",
+          "Q5_K_S",
+          "Q6_K",
+          "Q6_K_L",
+          "Q8_0",
+          "f32"
+        ],
+        "model_id": "bartowski/gemma-2-27b-it-GGUF",
+        "model_file_name_template": "gemma-2-27b-it-{quantization}.gguf"
+      },
       {
         "model_format": "mlx",
         "model_size_in_billions": 9,

xinference 0.13.0__py3-none-any.whl → 0.13.2__py3-none-any.whl

Potentially problematic release.

xinference 0.13.0__py3-none-any.whl → 0.13.2__py3-none-any.whl