xinference 0.14.1.post1__py3-none-any.whl → 0.14.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (194)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +15 -34
  3. xinference/client/restful/restful_client.py +2 -2
  4. xinference/core/chat_interface.py +45 -10
  5. xinference/core/image_interface.py +9 -0
  6. xinference/core/model.py +8 -5
  7. xinference/core/scheduler.py +1 -2
  8. xinference/core/worker.py +49 -42
  9. xinference/deploy/cmdline.py +2 -2
  10. xinference/deploy/test/test_cmdline.py +7 -7
  11. xinference/model/audio/chattts.py +24 -9
  12. xinference/model/audio/core.py +8 -2
  13. xinference/model/audio/fish_speech.py +228 -0
  14. xinference/model/audio/model_spec.json +8 -0
  15. xinference/model/embedding/core.py +23 -1
  16. xinference/model/image/model_spec.json +2 -1
  17. xinference/model/image/model_spec_modelscope.json +2 -1
  18. xinference/model/image/stable_diffusion/core.py +49 -1
  19. xinference/model/llm/__init__.py +26 -27
  20. xinference/model/llm/{ggml/llamacpp.py → llama_cpp/core.py} +2 -35
  21. xinference/model/llm/llm_family.json +606 -1266
  22. xinference/model/llm/llm_family.py +16 -139
  23. xinference/model/llm/llm_family_modelscope.json +276 -313
  24. xinference/model/llm/lmdeploy/__init__.py +0 -0
  25. xinference/model/llm/lmdeploy/core.py +557 -0
  26. xinference/model/llm/memory.py +9 -9
  27. xinference/model/llm/sglang/core.py +2 -2
  28. xinference/model/llm/{pytorch → transformers}/chatglm.py +6 -13
  29. xinference/model/llm/{pytorch → transformers}/cogvlm2.py +4 -45
  30. xinference/model/llm/transformers/cogvlm2_video.py +524 -0
  31. xinference/model/llm/{pytorch → transformers}/core.py +3 -10
  32. xinference/model/llm/{pytorch → transformers}/glm4v.py +2 -23
  33. xinference/model/llm/transformers/intern_vl.py +540 -0
  34. xinference/model/llm/{pytorch → transformers}/internlm2.py +4 -8
  35. xinference/model/llm/{pytorch → transformers}/minicpmv25.py +2 -23
  36. xinference/model/llm/{pytorch → transformers}/minicpmv26.py +66 -41
  37. xinference/model/llm/{pytorch → transformers}/utils.py +1 -2
  38. xinference/model/llm/{pytorch → transformers}/yi_vl.py +2 -24
  39. xinference/model/llm/utils.py +85 -70
  40. xinference/model/llm/vllm/core.py +110 -11
  41. xinference/model/utils.py +1 -95
  42. xinference/thirdparty/fish_speech/__init__.py +0 -0
  43. xinference/thirdparty/fish_speech/fish_speech/__init__.py +0 -0
  44. xinference/thirdparty/fish_speech/fish_speech/callbacks/__init__.py +3 -0
  45. xinference/thirdparty/fish_speech/fish_speech/callbacks/grad_norm.py +113 -0
  46. xinference/thirdparty/fish_speech/fish_speech/configs/__init__.py +0 -0
  47. xinference/thirdparty/fish_speech/fish_speech/configs/lora/__init__.py +0 -0
  48. xinference/thirdparty/fish_speech/fish_speech/conversation.py +2 -0
  49. xinference/thirdparty/fish_speech/fish_speech/datasets/__init__.py +0 -0
  50. xinference/thirdparty/fish_speech/fish_speech/datasets/concat_repeat.py +53 -0
  51. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/__init__.py +0 -0
  52. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text_data_pb2.py +33 -0
  53. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text_data_stream.py +36 -0
  54. xinference/thirdparty/fish_speech/fish_speech/datasets/semantic.py +496 -0
  55. xinference/thirdparty/fish_speech/fish_speech/datasets/vqgan.py +147 -0
  56. xinference/thirdparty/fish_speech/fish_speech/i18n/__init__.py +3 -0
  57. xinference/thirdparty/fish_speech/fish_speech/i18n/core.py +40 -0
  58. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/__init__.py +0 -0
  59. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +122 -0
  60. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +122 -0
  61. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +123 -0
  62. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/pt_BR.json +133 -0
  63. xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +122 -0
  64. xinference/thirdparty/fish_speech/fish_speech/i18n/scan.py +122 -0
  65. xinference/thirdparty/fish_speech/fish_speech/models/__init__.py +0 -0
  66. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/__init__.py +0 -0
  67. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/lit_module.py +202 -0
  68. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +779 -0
  69. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/lora.py +92 -0
  70. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/__init__.py +3 -0
  71. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/lit_module.py +442 -0
  72. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/__init__.py +0 -0
  73. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/discriminator.py +44 -0
  74. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +625 -0
  75. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +139 -0
  76. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/reference.py +115 -0
  77. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/wavenet.py +225 -0
  78. xinference/thirdparty/fish_speech/fish_speech/models/vqgan/utils.py +94 -0
  79. xinference/thirdparty/fish_speech/fish_speech/scheduler.py +40 -0
  80. xinference/thirdparty/fish_speech/fish_speech/text/__init__.py +4 -0
  81. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/__init__.py +0 -0
  82. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_class.py +172 -0
  83. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_constant.py +30 -0
  84. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_util.py +342 -0
  85. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/cardinal.py +32 -0
  86. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/date.py +75 -0
  87. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/digit.py +32 -0
  88. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/fraction.py +35 -0
  89. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/money.py +43 -0
  90. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/percentage.py +33 -0
  91. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/telephone.py +51 -0
  92. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/text.py +177 -0
  93. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +69 -0
  94. xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +130 -0
  95. xinference/thirdparty/fish_speech/fish_speech/train.py +139 -0
  96. xinference/thirdparty/fish_speech/fish_speech/utils/__init__.py +23 -0
  97. xinference/thirdparty/fish_speech/fish_speech/utils/braceexpand.py +217 -0
  98. xinference/thirdparty/fish_speech/fish_speech/utils/context.py +13 -0
  99. xinference/thirdparty/fish_speech/fish_speech/utils/file.py +16 -0
  100. xinference/thirdparty/fish_speech/fish_speech/utils/instantiators.py +50 -0
  101. xinference/thirdparty/fish_speech/fish_speech/utils/logger.py +55 -0
  102. xinference/thirdparty/fish_speech/fish_speech/utils/logging_utils.py +48 -0
  103. xinference/thirdparty/fish_speech/fish_speech/utils/rich_utils.py +100 -0
  104. xinference/thirdparty/fish_speech/fish_speech/utils/spectrogram.py +122 -0
  105. xinference/thirdparty/fish_speech/fish_speech/utils/utils.py +114 -0
  106. xinference/thirdparty/fish_speech/fish_speech/webui/__init__.py +0 -0
  107. xinference/thirdparty/fish_speech/fish_speech/webui/launch_utils.py +120 -0
  108. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +1237 -0
  109. xinference/thirdparty/fish_speech/tools/__init__.py +0 -0
  110. xinference/thirdparty/fish_speech/tools/api.py +495 -0
  111. xinference/thirdparty/fish_speech/tools/auto_rerank.py +159 -0
  112. xinference/thirdparty/fish_speech/tools/download_models.py +55 -0
  113. xinference/thirdparty/fish_speech/tools/extract_model.py +21 -0
  114. xinference/thirdparty/fish_speech/tools/file.py +108 -0
  115. xinference/thirdparty/fish_speech/tools/gen_ref.py +36 -0
  116. xinference/thirdparty/fish_speech/tools/llama/__init__.py +0 -0
  117. xinference/thirdparty/fish_speech/tools/llama/build_dataset.py +169 -0
  118. xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +171 -0
  119. xinference/thirdparty/fish_speech/tools/llama/generate.py +698 -0
  120. xinference/thirdparty/fish_speech/tools/llama/merge_lora.py +95 -0
  121. xinference/thirdparty/fish_speech/tools/llama/quantize.py +497 -0
  122. xinference/thirdparty/fish_speech/tools/llama/rebuild_tokenizer.py +57 -0
  123. xinference/thirdparty/fish_speech/tools/merge_asr_files.py +55 -0
  124. xinference/thirdparty/fish_speech/tools/post_api.py +164 -0
  125. xinference/thirdparty/fish_speech/tools/sensevoice/__init__.py +0 -0
  126. xinference/thirdparty/fish_speech/tools/sensevoice/auto_model.py +573 -0
  127. xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py +332 -0
  128. xinference/thirdparty/fish_speech/tools/sensevoice/vad_utils.py +61 -0
  129. xinference/thirdparty/fish_speech/tools/smart_pad.py +47 -0
  130. xinference/thirdparty/fish_speech/tools/vqgan/__init__.py +0 -0
  131. xinference/thirdparty/fish_speech/tools/vqgan/create_train_split.py +83 -0
  132. xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +227 -0
  133. xinference/thirdparty/fish_speech/tools/vqgan/inference.py +120 -0
  134. xinference/thirdparty/fish_speech/tools/webui.py +619 -0
  135. xinference/thirdparty/fish_speech/tools/whisper_asr.py +176 -0
  136. xinference/thirdparty/internvl/__init__.py +0 -0
  137. xinference/thirdparty/internvl/conversation.py +393 -0
  138. xinference/thirdparty/omnilmm/model/utils.py +16 -1
  139. xinference/web/ui/build/asset-manifest.json +3 -3
  140. xinference/web/ui/build/index.html +1 -1
  141. xinference/web/ui/build/static/js/main.661c7b0a.js +3 -0
  142. xinference/web/ui/build/static/js/{main.17ca0398.js.map → main.661c7b0a.js.map} +1 -1
  143. xinference/web/ui/node_modules/.cache/babel-loader/070d8c6b3b0f3485c6d3885f0b6bbfdf9643e088a468acbd5d596f2396071c16.json +1 -0
  144. xinference/web/ui/node_modules/.cache/babel-loader/213b5913e164773c2b0567455377765715f5f07225fbac77ad8e1e9dc9648a47.json +1 -0
  145. xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +1 -0
  146. xinference/web/ui/node_modules/.cache/babel-loader/5391543180fead1eeef5364300301498d58a7d91d62de3841a32768b67f4552f.json +1 -0
  147. xinference/web/ui/node_modules/.cache/babel-loader/5c26a23b5eacf5b752a08531577ae3840bb247745ef9a39583dc2d05ba93a82a.json +1 -0
  148. xinference/web/ui/node_modules/.cache/babel-loader/714c37ce0ec5b5c591033f02be2f3f491fdd70da3ef568ee4a4f94689a3d5ca2.json +1 -0
  149. xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +1 -0
  150. xinference/web/ui/node_modules/.cache/babel-loader/978b57d1a04a701bc3fcfebc511f5f274eed6ed7eade67f6fb76c27d5fd9ecc8.json +1 -0
  151. xinference/web/ui/node_modules/.cache/babel-loader/a797831de0dc74897f4b50b3426555d748f328b4c2cc391de709eadaf6a5f3e3.json +1 -0
  152. xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +1 -0
  153. xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +1 -0
  154. xinference/web/ui/node_modules/.cache/babel-loader/e91938976f229ce986b2907e51e1f00540b584ced0a315d498c172d13220739d.json +1 -0
  155. xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +1 -0
  156. {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/METADATA +22 -13
  157. {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/RECORD +170 -79
  158. xinference/locale/utils.py +0 -39
  159. xinference/locale/zh_CN.json +0 -26
  160. xinference/model/llm/ggml/tools/__init__.py +0 -15
  161. xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py +0 -498
  162. xinference/model/llm/ggml/tools/gguf.py +0 -884
  163. xinference/model/llm/pytorch/__init__.py +0 -13
  164. xinference/model/llm/pytorch/baichuan.py +0 -81
  165. xinference/model/llm/pytorch/falcon.py +0 -138
  166. xinference/model/llm/pytorch/intern_vl.py +0 -352
  167. xinference/model/llm/pytorch/vicuna.py +0 -69
  168. xinference/web/ui/build/static/js/main.17ca0398.js +0 -3
  169. xinference/web/ui/node_modules/.cache/babel-loader/1444c41a4d04494f1cbc2d8c1537df107b451cb569cb2c1fbf5159f3a4841a5f.json +0 -1
  170. xinference/web/ui/node_modules/.cache/babel-loader/2f40209b32e7e46a2eab6b8c8a355eb42c3caa8bc3228dd929f32fd2b3940294.json +0 -1
  171. xinference/web/ui/node_modules/.cache/babel-loader/44774c783428f952d8e2e4ad0998a9c5bc16a57cd9c68b7c5ff18aaa5a41d65c.json +0 -1
  172. xinference/web/ui/node_modules/.cache/babel-loader/5262556baf9207738bf6a8ba141ec6599d0a636345c245d61fdf88d3171998cb.json +0 -1
  173. xinference/web/ui/node_modules/.cache/babel-loader/6450605fac003812485f6251b9f0caafbf2e5bfc3bbe2f000050d9e2fdb8dcd3.json +0 -1
  174. xinference/web/ui/node_modules/.cache/babel-loader/71684495d995c7e266eecc6a0ad8ea0284cc785f80abddf863789c57a6134969.json +0 -1
  175. xinference/web/ui/node_modules/.cache/babel-loader/80acd1edf31542ab1dcccfad02cb4b38f3325cff847a781fcce97500cfd6f878.json +0 -1
  176. xinference/web/ui/node_modules/.cache/babel-loader/8a9742ddd8ba8546ef42dc14caca443f2b4524fabed7bf269e0eff3b7b64ee7d.json +0 -1
  177. xinference/web/ui/node_modules/.cache/babel-loader/d06a96a3c9c32e42689094aa3aaad41c8125894e956b8f84a70fadce6e3f65b3.json +0 -1
  178. xinference/web/ui/node_modules/.cache/babel-loader/d93730e2b5d7e8c957b4d0965d2ed1dac9045a649adbd47c220d11f255d4b1e0.json +0 -1
  179. xinference/web/ui/node_modules/.cache/babel-loader/e656dc00b4d8b387f0a81ba8fc558767df1601c66369e2eb86a5ef27cf080572.json +0 -1
  180. xinference/web/ui/node_modules/.cache/babel-loader/f28b83886159d83b84f099b05d607a822dca4dd7f2d8aa6d56fe08bab0b5b086.json +0 -1
  181. xinference/web/ui/node_modules/.cache/babel-loader/f3e02274cb1964e99b1fe69cbb6db233d3d8d7dd05d50ebcdb8e66d50b224b7b.json +0 -1
  182. /xinference/{locale → model/llm/llama_cpp}/__init__.py +0 -0
  183. /xinference/model/llm/{ggml → transformers}/__init__.py +0 -0
  184. /xinference/model/llm/{pytorch → transformers}/compression.py +0 -0
  185. /xinference/model/llm/{pytorch → transformers}/deepseek_vl.py +0 -0
  186. /xinference/model/llm/{pytorch → transformers}/llama_2.py +0 -0
  187. /xinference/model/llm/{pytorch → transformers}/omnilmm.py +0 -0
  188. /xinference/model/llm/{pytorch → transformers}/qwen_vl.py +0 -0
  189. /xinference/model/llm/{pytorch → transformers}/tensorizer_utils.py +0 -0
  190. /xinference/web/ui/build/static/js/{main.17ca0398.js.LICENSE.txt → main.661c7b0a.js.LICENSE.txt} +0 -0
  191. {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/LICENSE +0 -0
  192. {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/WHEEL +0 -0
  193. {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/entry_points.txt +0 -0
  194. {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/top_level.txt +0 -0
xinference/model/audio/fish_speech.py
@@ -0,0 +1,228 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import gc
+import logging
+import os.path
+import queue
+import sys
+from io import BytesIO
+from typing import TYPE_CHECKING, Optional
+
+import numpy as np
+import torch
+
+from ...device_utils import get_available_device, is_device_available
+
+if TYPE_CHECKING:
+    from .core import AudioModelFamilyV1
+
+logger = logging.getLogger(__name__)
+
+
+def wav_chunk_header(sample_rate=44100, bit_depth=16, channels=1):
+    import wave
+
+    buffer = BytesIO()
+
+    with wave.open(buffer, "wb") as wav_file:
+        wav_file.setnchannels(channels)
+        wav_file.setsampwidth(bit_depth // 8)
+        wav_file.setframerate(sample_rate)
+
+    wav_header_bytes = buffer.getvalue()
+    buffer.close()
+    return wav_header_bytes
+
+
+class FishSpeechModel:
+    def __init__(
+        self,
+        model_uid: str,
+        model_path: str,
+        model_spec: "AudioModelFamilyV1",
+        device: Optional[str] = None,
+        **kwargs,
+    ):
+        self._model_uid = model_uid
+        self._model_path = model_path
+        self._model_spec = model_spec
+        self._device = device
+        self._llama_queue = None
+        self._model = None
+        self._kwargs = kwargs
+
+    def load(self):
+        # There are too many imports from fish_speech.
+        sys.path.insert(
+            0, os.path.join(os.path.dirname(__file__), "../../thirdparty/fish_speech")
+        )
+
+        from tools.llama.generate import launch_thread_safe_queue
+        from tools.vqgan.inference import load_model as load_decoder_model
+
+        if self._device is None:
+            self._device = get_available_device()
+        else:
+            if not is_device_available(self._device):
+                raise ValueError(f"Device {self._device} is not available!")
+
+        logger.info("Loading Llama model...")
+        self._llama_queue = launch_thread_safe_queue(
+            checkpoint_path=self._model_path,
+            device=self._device,
+            precision=torch.bfloat16,
+            compile=False,
+        )
+        logger.info("Llama model loaded, loading VQ-GAN model...")
+
+        checkpoint_path = os.path.join(
+            self._model_path,
+            "firefly-gan-vq-fsq-4x1024-42hz-generator.pth",
+        )
+        self._model = load_decoder_model(
+            config_name="firefly_gan_vq",
+            checkpoint_path=checkpoint_path,
+            device=self._device,
+        )
+
+    @torch.inference_mode()
+    def _inference(
+        self,
+        text,
+        enable_reference_audio,
+        reference_audio,
+        reference_text,
+        max_new_tokens,
+        chunk_length,
+        top_p,
+        repetition_penalty,
+        temperature,
+        streaming=False,
+    ):
+        from fish_speech.utils import autocast_exclude_mps
+        from tools.api import decode_vq_tokens, encode_reference
+        from tools.llama.generate import (
+            GenerateRequest,
+            GenerateResponse,
+            WrappedGenerateResponse,
+        )
+
+        # Parse reference audio aka prompt
+        prompt_tokens = encode_reference(
+            decoder_model=self._model,
+            reference_audio=reference_audio,
+            enable_reference_audio=enable_reference_audio,
+        )
+
+        # LLAMA Inference
+        request = dict(
+            device=self._model.device,
+            max_new_tokens=max_new_tokens,
+            text=text,
+            top_p=top_p,
+            repetition_penalty=repetition_penalty,
+            temperature=temperature,
+            compile=False,
+            iterative_prompt=chunk_length > 0,
+            chunk_length=chunk_length,
+            max_length=2048,
+            prompt_tokens=prompt_tokens if enable_reference_audio else None,
+            prompt_text=reference_text if enable_reference_audio else None,
+        )
+
+        response_queue = queue.Queue()
+        self._llama_queue.put(
+            GenerateRequest(
+                request=request,
+                response_queue=response_queue,
+            )
+        )
+
+        if streaming:
+            yield wav_chunk_header(), None, None
+
+        segments = []
+
+        while True:
+            result: WrappedGenerateResponse = response_queue.get()
+            if result.status == "error":
+                raise Exception(str(result.response))
+
+            result: GenerateResponse = result.response
+            if result.action == "next":
+                break
+
+            with autocast_exclude_mps(
+                device_type=self._model.device.type, dtype=torch.bfloat16
+            ):
+                fake_audios = decode_vq_tokens(
+                    decoder_model=self._model,
+                    codes=result.codes,
+                )
+
+            fake_audios = fake_audios.float().cpu().numpy()
+            segments.append(fake_audios)
+
+            if streaming:
+                yield (fake_audios * 32768).astype(np.int16).tobytes(), None, None
+
+        if len(segments) == 0:
+            raise Exception("No audio generated, please check the input text.")
+
+        # No matter streaming or not, we need to return the final audio
+        audio = np.concatenate(segments, axis=0)
+        yield None, (self._model.spec_transform.sample_rate, audio), None
+
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            gc.collect()
+
+    def speech(
+        self,
+        input: str,
+        voice: str,
+        response_format: str = "mp3",
+        speed: float = 1.0,
+        stream: bool = False,
+        **kwargs,
+    ):
+        logger.warning("Fish speech does not support setting voice: %s.", voice)
+        if speed != 1.0:
+            logger.warning("Fish speech does not support setting speed: %s.", speed)
+        if stream is True:
+            logger.warning("stream mode is not implemented.")
+        import torchaudio
+
+        result = list(
+            self._inference(
+                text=input,
+                enable_reference_audio=False,
+                reference_audio=None,
+                reference_text="",
+                max_new_tokens=0,
+                chunk_length=100,
+                top_p=0.7,
+                repetition_penalty=1.2,
+                temperature=0.7,
+            )
+        )
+        sample_rate, audio = result[0][1]
+        audio = np.array([audio])
+
+        # Save the generated audio
+        with BytesIO() as out:
+            torchaudio.save(
+                out, torch.from_numpy(audio), sample_rate, format=response_format
+            )
+            return out.getvalue()
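
The new file above is the FishSpeech text-to-speech backend. As a rough, hedged sketch of the API it introduces (in practice xinference's audio model factory constructs and loads this class; the checkpoint path below is illustrative):

```python
# Minimal sketch of driving FishSpeechModel directly; the checkpoint directory is
# a placeholder, and model_spec is unused by the load()/speech() paths shown above.
from xinference.model.audio.fish_speech import FishSpeechModel

model = FishSpeechModel(
    model_uid="fishspeech-demo",
    model_path="/models/FishSpeech-1.2-SFT",  # hypothetical local checkpoint dir
    model_spec=None,                          # AudioModelFamilyV1 in real use
    device=None,                              # falls back to get_available_device()
)
model.load()  # loads the Llama queue, then the firefly VQ-GAN decoder

# speech() only warns about voice/speed/stream and returns encoded audio bytes.
audio_bytes = model.speech(input="Hello from FishSpeech!", voice="default")
with open("hello.mp3", "wb") as f:
    f.write(audio_bytes)
```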
xinference/model/audio/model_spec.json
@@ -146,5 +146,13 @@
     "model_revision": "fb5f676733139f35670bed9b59a77d476b1aa898",
     "ability": "text-to-audio",
     "multilingual": true
+  },
+  {
+    "model_name": "FishSpeech-1.2-SFT",
+    "model_family": "FishAudio",
+    "model_id": "fishaudio/fish-speech-1.2-sft",
+    "model_revision": "180288e21ec5c50cfc564023a22f789e4b88a0e0",
+    "ability": "text-to-audio",
+    "multilingual": true
   }
 ]
xinference/model/embedding/core.py
@@ -154,10 +154,32 @@ class EmbeddingModel:
             "gte" in self._model_spec.model_name.lower()
             and "qwen2" in self._model_spec.model_name.lower()
         ):
+            import torch
+
+            torch_dtype_str = self._kwargs.get("torch_dtype")
+            if torch_dtype_str is not None:
+                try:
+                    torch_dtype = getattr(torch, torch_dtype_str)
+                    if torch_dtype not in [
+                        torch.float16,
+                        torch.float32,
+                        torch.bfloat16,
+                    ]:
+                        logger.warning(
+                            f"Load embedding model with unsupported torch dtype : {torch_dtype_str}. Using default torch dtype: fp32."
+                        )
+                        torch_dtype = torch.float32
+                except AttributeError:
+                    logger.warning(
+                        f"Load embedding model with unknown torch dtype '{torch_dtype_str}'. Using default torch dtype: fp32."
+                    )
+                    torch_dtype = torch.float32
+            else:
+                torch_dtype = "auto"
             self._model = XSentenceTransformer(
                 self._model_path,
                 device=self._device,
-                model_kwargs={"device_map": "auto"},
+                model_kwargs={"device_map": "auto", "torch_dtype": torch_dtype},
             )
         else:
            self._model = SentenceTransformer(self._model_path, device=self._device)
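
The hunk above lets a `torch_dtype` kwarg steer how GTE-Qwen2 embedding models are loaded: only float16, float32 and bfloat16 are accepted, anything else falls back to fp32, and omitting it keeps `"auto"`. A hedged sketch of passing it through the client; the model name is illustrative and not verified against the builtin registry:

```python
# Hedged sketch: supplying the new torch_dtype kwarg when launching a GTE-Qwen2
# embedding model. Extra kwargs reach EmbeddingModel as self._kwargs, where load()
# resolves them exactly as in the hunk above.
from xinference.client import Client

client = Client("http://127.0.0.1:9997")
model_uid = client.launch_model(
    model_name="gte-Qwen2-1.5B-instruct",  # assumed builtin name
    model_type="embedding",
    torch_dtype="bfloat16",  # invalid values log a warning and fall back to fp32
)
```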
xinference/model/image/model_spec.json
@@ -24,7 +24,8 @@
     "model_revision": "ea42f8cef0f178587cf766dc8129abd379c90671",
     "model_ability": [
       "text2image",
-      "image2image"
+      "image2image",
+      "inpainting"
     ]
   },
   {
xinference/model/image/model_spec_modelscope.json
@@ -27,7 +27,8 @@
     "model_revision": "master",
     "model_ability": [
       "text2image",
-      "image2image"
+      "image2image",
+      "inpainting"
     ]
   },
   {
xinference/model/image/stable_diffusion/core.py
@@ -24,6 +24,9 @@ from functools import partial
 from io import BytesIO
 from typing import Dict, List, Optional, Union
 
+import PIL.Image
+from PIL import ImageOps
+
 from ....constants import XINFERENCE_IMAGE_DIR
 from ....device_utils import move_model_to_available_device
 from ....types import Image, ImageList, LoRA
@@ -46,8 +49,13 @@ class DiffusionModel:
         self._model_uid = model_uid
         self._model_path = model_path
         self._device = device
+        # when a model has text2image ability,
+        # it will be loaded as AutoPipelineForText2Image
+        # for image2image and inpainting,
+        # we convert to the corresponding model
         self._model = None
         self._i2i_model = None  # image to image model
+        self._inpainting_model = None  # inpainting model
         self._lora_model = lora_model
         self._lora_load_kwargs = lora_load_kwargs or {}
         self._lora_fuse_kwargs = lora_fuse_kwargs or {}
@@ -152,6 +160,10 @@
         model=None,
         **kwargs,
     ):
+        import gc
+
+        from ....device_utils import empty_cache
+
         logger.debug(
             "stable diffusion args: %s",
             kwargs,
@@ -159,6 +171,11 @@
         model = model if model is not None else self._model
         assert callable(model)
         images = model(**kwargs).images
+
+        # clean cache
+        gc.collect()
+        empty_cache()
+
         if response_format == "url":
             os.makedirs(XINFERENCE_IMAGE_DIR, exist_ok=True)
             image_list = []
@@ -209,9 +226,17 @@
             **kwargs,
         )
 
+    @staticmethod
+    def pad_to_multiple(image, multiple=8):
+        x, y = image.size
+        padding_x = (multiple - x % multiple) % multiple
+        padding_y = (multiple - y % multiple) % multiple
+        padding = (0, 0, padding_x, padding_y)
+        return ImageOps.expand(image, padding)
+
     def image_to_image(
         self,
-        image: bytes,
+        image: PIL.Image,
         prompt: Optional[Union[str, List[str]]] = None,
         negative_prompt: Optional[Union[str, List[str]]] = None,
         n: int = 1,
@@ -236,6 +261,11 @@
             width, height = map(int, re.split(r"[^\d]+", size))
             kwargs["width"] = width
             kwargs["height"] = height
+        if padding_image_to_multiple := kwargs.pop("padding_image_to_multiple", None):
+            # Model like SD3 image to image requires image's height and width is times of 16
+            # padding the image if specified
+            image = self.pad_to_multiple(image, multiple=int(padding_image_to_multiple))
+
         self._filter_kwargs(kwargs)
         return self._call_model(
             image=image,
@@ -258,6 +288,23 @@
         response_format: str = "url",
         **kwargs,
     ):
+        if "inpainting" not in self._abilities:
+            raise RuntimeError(f"{self._model_uid} does not support inpainting")
+
+        if (
+            "text2image" in self._abilities or "image2image" in self._abilities
+        ) and self._model is not None:
+            from diffusers import AutoPipelineForInpainting
+
+            if self._inpainting_model is not None:
+                model = self._inpainting_model
+            else:
+                model = self._inpainting_model = AutoPipelineForInpainting.from_pipe(
+                    self._model
+                )
+        else:
+            model = self._model
+
         width, height = map(int, re.split(r"[^\d]+", size))
         return self._call_model(
             image=image,
@@ -268,5 +315,6 @@
             width=width,
             num_images_per_prompt=n,
             response_format=response_format,
+            model=model,
             **kwargs,
         )
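
The `pad_to_multiple()` helper added above exists because SD3-style image-to-image pipelines expect dimensions divisible by 16; passing `padding_image_to_multiple=16` in the request kwargs triggers it. A quick standalone check of the arithmetic (the function body is copied from the hunk above):

```python
# Standalone check of the padding arithmetic: a 1000x600 image padded to a
# multiple of 16 grows to 1008x608 (padding is applied on the right and bottom).
from PIL import Image, ImageOps

def pad_to_multiple(image, multiple=8):
    x, y = image.size
    padding_x = (multiple - x % multiple) % multiple
    padding_y = (multiple - y % multiple) % multiple
    return ImageOps.expand(image, (0, 0, padding_x, padding_y))

img = Image.new("RGB", (1000, 600))
print(pad_to_multiple(img, multiple=16).size)  # -> (1008, 608)
```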
xinference/model/llm/__init__.py
@@ -34,13 +34,14 @@ from .llm_family import (
     BUILTIN_MODELSCOPE_LLM_FAMILIES,
     LLAMA_CLASSES,
     LLM_ENGINES,
+    LMDEPLOY_CLASSES,
     MLX_CLASSES,
     SGLANG_CLASSES,
     SUPPORTED_ENGINES,
     TRANSFORMERS_CLASSES,
     VLLM_CLASSES,
     CustomLLMFamilyV1,
-    GgmlLLMSpecV1,
+    LlamaCppLLMSpecV1,
     LLMFamilyV1,
     LLMSpecV1,
     MLXLLMSpecV1,
@@ -55,10 +56,10 @@ from .llm_family import (
 
 
 def check_format_with_engine(model_format, engine):
-    # only llama-cpp-python support and only support ggufv2 and ggmlv3
-    if model_format in ["ggufv2", "ggmlv3"] and engine != "llama.cpp":
+    # only llama-cpp-python support and only support ggufv2
+    if model_format in ["ggufv2"] and engine != "llama.cpp":
         return False
-    if model_format not in ["ggufv2", "ggmlv3"] and engine == "llama.cpp":
+    if model_format not in ["ggufv2"] and engine == "llama.cpp":
         return False
     return True
 
@@ -112,28 +113,27 @@ def generate_engine_config_by_model_family(model_family):
 
 
 def _install():
-    from .ggml.llamacpp import LlamaCppChatModel, LlamaCppModel
+    from .llama_cpp.core import LlamaCppChatModel, LlamaCppModel
+    from .lmdeploy.core import LMDeployChatModel, LMDeployModel
     from .mlx.core import MLXChatModel, MLXModel
-    from .pytorch.baichuan import BaichuanPytorchChatModel
-    from .pytorch.chatglm import ChatglmPytorchChatModel
-    from .pytorch.cogvlm2 import CogVLM2Model
-    from .pytorch.core import PytorchChatModel, PytorchModel
-    from .pytorch.deepseek_vl import DeepSeekVLChatModel
-    from .pytorch.falcon import FalconPytorchChatModel, FalconPytorchModel
-    from .pytorch.glm4v import Glm4VModel
-    from .pytorch.intern_vl import InternVLChatModel
-    from .pytorch.internlm2 import Internlm2PytorchChatModel
-    from .pytorch.llama_2 import LlamaPytorchChatModel, LlamaPytorchModel
-    from .pytorch.minicpmv25 import MiniCPMV25Model
-    from .pytorch.minicpmv26 import MiniCPMV26Model
-    from .pytorch.qwen_vl import QwenVLChatModel
-    from .pytorch.vicuna import VicunaPytorchChatModel
-    from .pytorch.yi_vl import YiVLChatModel
     from .sglang.core import SGLANGChatModel, SGLANGModel
-    from .vllm.core import VLLMChatModel, VLLMModel
+    from .transformers.chatglm import ChatglmPytorchChatModel
+    from .transformers.cogvlm2 import CogVLM2Model
+    from .transformers.cogvlm2_video import CogVLM2VideoModel
+    from .transformers.core import PytorchChatModel, PytorchModel
+    from .transformers.deepseek_vl import DeepSeekVLChatModel
+    from .transformers.glm4v import Glm4VModel
+    from .transformers.intern_vl import InternVLChatModel
+    from .transformers.internlm2 import Internlm2PytorchChatModel
+    from .transformers.llama_2 import LlamaPytorchChatModel, LlamaPytorchModel
+    from .transformers.minicpmv25 import MiniCPMV25Model
+    from .transformers.minicpmv26 import MiniCPMV26Model
+    from .transformers.qwen_vl import QwenVLChatModel
+    from .transformers.yi_vl import YiVLChatModel
+    from .vllm.core import VLLMChatModel, VLLMModel, VLLMVisionModel
 
     try:
-        from .pytorch.omnilmm import OmniLMMModel
+        from .transformers.omnilmm import OmniLMMModel
     except ImportError as e:
         # For quite old transformers version,
         # import will generate error
@@ -148,18 +148,15 @@ def _install():
         ]
     )
     SGLANG_CLASSES.extend([SGLANGModel, SGLANGChatModel])
-    VLLM_CLASSES.extend([VLLMModel, VLLMChatModel])
+    VLLM_CLASSES.extend([VLLMModel, VLLMChatModel, VLLMVisionModel])
     MLX_CLASSES.extend([MLXModel, MLXChatModel])
+    LMDEPLOY_CLASSES.extend([LMDeployModel, LMDeployChatModel])
     TRANSFORMERS_CLASSES.extend(
         [
-            BaichuanPytorchChatModel,
-            VicunaPytorchChatModel,
-            FalconPytorchChatModel,
             ChatglmPytorchChatModel,
             LlamaPytorchModel,
             LlamaPytorchChatModel,
             PytorchChatModel,
-            FalconPytorchModel,
             Internlm2PytorchChatModel,
             QwenVLChatModel,
             YiVLChatModel,
@@ -167,6 +164,7 @@ def _install():
             InternVLChatModel,
             PytorchModel,
             CogVLM2Model,
+            CogVLM2VideoModel,
             MiniCPMV25Model,
             MiniCPMV26Model,
             Glm4VModel,
@@ -181,6 +179,7 @@ def _install():
     SUPPORTED_ENGINES["Transformers"] = TRANSFORMERS_CLASSES
     SUPPORTED_ENGINES["llama.cpp"] = LLAMA_CLASSES
     SUPPORTED_ENGINES["MLX"] = MLX_CLASSES
+    SUPPORTED_ENGINES["LMDEPLOY"] = LMDEPLOY_CLASSES
 
     json_path = os.path.join(
         os.path.dirname(os.path.abspath(__file__)), "llm_family.json"
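
With ggmlv3 support dropped, `check_format_with_engine()` above now pairs gguf files only with the llama.cpp engine, and llama.cpp accepts nothing but ggufv2. A small illustration of the resulting truth table (the helper is defined at module level, so the import below should work as written):

```python
# Illustration of the tightened format/engine pairing after this change.
from xinference.model.llm import check_format_with_engine

assert check_format_with_engine("ggufv2", "llama.cpp") is True
assert check_format_with_engine("ggufv2", "vLLM") is False        # gguf needs llama.cpp
assert check_format_with_engine("pytorch", "llama.cpp") is False  # llama.cpp only takes gguf
assert check_format_with_engine("pytorch", "Transformers") is True
```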
xinference/model/llm/{ggml/llamacpp.py → llama_cpp/core.py}
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import datetime
 import logging
 import os
 import time
@@ -104,35 +103,6 @@ class LlamaCppModel(LLM):
         generate_config.pop("lora_name", None)  # type: ignore
         return generate_config
 
-    def _convert_ggml_to_gguf(self, model_path: str) -> str:
-        from .tools import convert
-
-        root_dir = os.path.dirname(os.path.dirname(model_path))
-        gguf_dir = os.path.join(
-            root_dir,
-            "{}-ggufv2-{}b".format(
-                self.model_family.model_name, self.model_spec.model_size_in_billions
-            ),
-        )
-        os.makedirs(gguf_dir, exist_ok=True)
-        gguf_path = os.path.join(
-            gguf_dir,
-            "{}.{}.ggufv2".format(self.model_family.model_name, self.quantization),
-        )
-        # trick for validation, use a mark file to make sure the gguf file is converted
-        mark_file = os.path.join(gguf_dir, f"__valid_{self.quantization}")
-        if os.path.exists(mark_file):
-            return gguf_path
-        else:
-            logger.warning(
-                "You are using a model with ggmlv3, "
-                "and it will take some time to convert to ggufv2"
-            )
-            convert(model_path, gguf_path)
-            with open(mark_file, "w") as f:
-                f.write(str(datetime.datetime.now()))
-            return gguf_path
-
     def load(self):
         try:
             import llama_cpp
@@ -167,9 +137,6 @@ class LlamaCppModel(LLM):
         if os.path.exists(legacy_model_file_path):
             model_path = legacy_model_file_path
 
-        if self.model_spec.model_format == "ggmlv3":
-            model_path = self._convert_ggml_to_gguf(model_path)
-
         try:
             self._llm = Llama(
                 model_path=model_path,
@@ -183,7 +150,7 @@ class LlamaCppModel(LLM):
     def match(
         cls, llm_family: LLMFamilyV1, llm_spec: LLMSpecV1, quantization: str
     ) -> bool:
-        if llm_spec.model_format not in ["ggmlv3", "ggufv2"]:
+        if llm_spec.model_format not in ["ggufv2"]:
             return False
         if "qwen" in llm_family.model_name:
             return False
@@ -285,7 +252,7 @@ class LlamaCppChatModel(LlamaCppModel, ChatModelMixin):
     def match(
         cls, llm_family: LLMFamilyV1, llm_spec: LLMSpecV1, quantization: str
     ) -> bool:
-        if llm_spec.model_format not in ["ggmlv3", "ggufv2"]:
+        if llm_spec.model_format not in ["ggufv2"]:
             return False
         if "chat" not in llm_family.model_ability:
             return False